#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
+#include <linux/precache.h>
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static void invalidate_bh_lrus(void);
* that, but not until that's cleaned up.
*/
invalidate_inode_pages(mapping);
+
+	/* 99% of the time, we don't need to flush the precache on the bdev.
+	 * But, for the strange corners, let's be cautious.
+	 */
+ precache_flush_inode(mapping);
}
/*
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
+#include <linux/precache.h>
#include <asm/uaccess.h>
} else {
printk("internal journal\n");
}
+ precache_init(sb);
return res;
}
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
+#include <linux/precache.h>
/*
* I/O completion handler for multipage BIOs.
SetPageMappedToDisk(page);
}
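+	/*
+	 * The precache stores whole pages, hence the blocks_per_page == 1
+	 * requirement.  On a successful get, the "confused" exit below just
+	 * unlocks the now-uptodate page instead of issuing any I/O.
+	 */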
+	if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
+	    precache_get(page->mapping, page->index, page) == 1) {
+ SetPageUptodate(page);
+ goto confused;
+ }
+
/*
* This page will go to BIO. Do we need to send this BIO off first?
*/
#include <linux/parser.h>
#include <linux/crc32.h>
#include <linux/debugfs.h>
+#include <linux/precache.h>
#include <cluster/nodemanager.h>
di = (struct ocfs2_dinode *) bitmap_bh->b_data;
osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
- brelse(bitmap_bh);
mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n",
(unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg);
status = ocfs2_init_slot_info(osb);
if (status < 0) {
mlog_errno(status);
- goto bail;
+ goto bail_rel;
}
+ shared_precache_init(sb, &di->id2.i_super.s_uuid[0]);
+bail_rel:
+ brelse(bitmap_bh);
bail:
mlog_exit(status);
return status;
#include <linux/idr.h>
#include <linux/kobject.h>
#include <linux/mutex.h>
+#include <linux/precache.h>
#include <asm/uaccess.h>
s->s_qcop = sb_quotactl_ops;
s->s_op = &default_op;
s->s_time_gran = 1000000000;
+#ifdef CONFIG_PRECACHE
+ s->precache_poolid = -1;
+#endif
}
out:
return s;
DQUOT_OFF(s);
down_write(&s->s_umount);
fs->kill_sb(s);
+ precache_flush_filesystem(s);
put_filesystem(fs);
put_super(s);
}
return error;
}
s->s_flags |= MS_ACTIVE;
+#ifdef CONFIG_PRECACHE
+ s->precache_poolid = -2;
+#endif
return simple_set_mnt(mnt, s);
}
return _hypercall2(int, kexec_op, op, args);
}
+static inline int __must_check
+HYPERVISOR_tmem_op(
+ struct tmem_op *op)
+{
+ return _hypercall1(int, tmem_op, op);
+}
#endif /* __HYPERCALL_H__ */
#include <xen/interface/physdev.h>
#include <xen/interface/sched.h>
#include <xen/interface/nmi.h>
+#include <xen/interface/tmem.h>
#include <asm/ptrace.h>
#include <asm/page.h>
#if defined(__i386__)
return _hypercall2(int, kexec_op, op, args);
}
+static inline int __must_check
+HYPERVISOR_tmem_op(
+ struct tmem_op *op)
+{
+ return _hypercall1(int, tmem_op, op);
+}
+
#endif /* __HYPERCALL_H__ */
/* Granularity of c/m/atime in ns.
Cannot be worse than a second */
u32 s_time_gran;
+#ifdef CONFIG_PRECACHE
+ u32 precache_poolid;
+#endif
};
extern struct timespec current_fs_time(struct super_block *sb);
--- /dev/null
+#ifndef _LINUX_PRECACHE_H
+#define _LINUX_PRECACHE_H
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+
+#ifdef CONFIG_PRECACHE
+extern void precache_init(struct super_block *sb);
+extern void shared_precache_init(struct super_block *sb, char *uuid);
+extern int precache_get(struct address_space *mapping, unsigned long index,
+ struct page *empty_page);
+extern int precache_put(struct address_space *mapping, unsigned long index,
+ struct page *page);
+extern int precache_flush(struct address_space *mapping, unsigned long index);
+extern int precache_flush_inode(struct address_space *mapping);
+extern int precache_flush_filesystem(struct super_block *s);
+#else
+static inline void precache_init(struct super_block *sb)
+{
+}
+
+static inline void shared_precache_init(struct super_block *sb, char *uuid)
+{
+}
+
+static inline int precache_get(struct address_space *mapping,
+ unsigned long index, struct page *empty_page)
+{
+ return 0;
+}
+
+static inline int precache_put(struct address_space *mapping,
+ unsigned long index, struct page *page)
+{
+ return 0;
+}
+
+static inline int precache_flush(struct address_space *mapping,
+ unsigned long index)
+{
+ return 0;
+}
+
+static inline int precache_flush_inode(struct address_space *mapping)
+{
+ return 0;
+}
+
+static inline int precache_flush_filesystem(struct super_block *s)
+{
+ return 0;
+}
+#endif
+
+#endif /* _LINUX_PRECACHE_H */
#include <linux/mmzone.h>
#include <linux/list.h>
#include <linux/sched.h>
+#include <linux/vmalloc.h>
#include <asm/atomic.h>
#include <asm/page.h>
unsigned int pages;
unsigned int max;
unsigned int inuse_pages;
+#ifdef CONFIG_PRESWAP
+ unsigned long *preswap_map;
+ unsigned int preswap_pages;
+#endif
int next; /* next entry on swap list */
};
+#ifdef CONFIG_PRESWAP
+
+#include <linux/sysctl.h>
+extern int preswap_sysctl_handler(struct ctl_table *, int, struct file *,
+ void __user *, size_t *, loff_t *);
+extern const unsigned long preswap_zero, preswap_infinity;
+
+extern void preswap_shrink(unsigned long);
+extern int preswap_test(struct swap_info_struct *, unsigned long);
+extern void preswap_init(unsigned);
+extern int preswap_put(struct page *);
+extern int preswap_get(struct page *);
+extern void preswap_flush(unsigned, unsigned long);
+extern void preswap_flush_area(unsigned);
+#else
+static inline void preswap_shrink(unsigned long target_pages)
+{
+}
+
+static inline int preswap_test(struct swap_info_struct *sis, unsigned long offset)
+{
+ return 0;
+}
+
+static inline void preswap_init(unsigned type)
+{
+}
+
+static inline int preswap_put(struct page *page)
+{
+ return 0;
+}
+
+static inline int preswap_get(struct page *page)
+{
+ return 0;
+}
+
+static inline void preswap_flush(unsigned type, unsigned long offset)
+{
+}
+
+static inline void preswap_flush_area(unsigned type)
+{
+}
+#endif /* CONFIG_PRESWAP */
+
struct swap_list_t {
int head; /* head of priority-ordered swapfile list */
int next; /* swapfile to be used next */
extern int can_share_swap_page(struct page *);
extern int remove_exclusive_swap_page(struct page *);
struct backing_dev_info;
-
extern spinlock_t swap_lock;
/* linux/mm/thrash.c */
VM_PANIC_ON_OOM=33, /* panic at out-of-memory */
VM_VDSO_ENABLED=34, /* map VDSO into new processes? */
VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */
+ VM_PRESWAP_PAGES=36, /* pages/target_pages in preswap */
};
.strategy = &sysctl_intvec,
.extra1 = &zero,
},
+#endif
+#ifdef CONFIG_PRESWAP
+ {
+ .ctl_name = VM_PRESWAP_PAGES,
+ .procname = "preswap",
+ .data = NULL,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &preswap_sysctl_handler,
+ .extra1 = (void *)&preswap_zero,
+ .extra2 = (void *)&preswap_infinity,
+ },
#endif
{ .ctl_name = 0 }
};
default 64BIT
help
This option allows memory and IO resources to be 64 bit.
+
+#
+# support for transcendent memory
+#
+config TMEM
+ bool
+ depends on XEN
+ help
+ In a virtualized environment, allows unused and underutilized
+ system physical memory to be made accessible through a narrow
+ well-defined page-copy-based API. If unsure, say Y.
+
+config PRECACHE
+ bool "Cache clean pages in transcendent memory"
+ depends on XEN
+ select TMEM
+ help
+ Allows the transcendent memory pool to be used to store clean
+ page-cache pages which, under some circumstances, will greatly
+ reduce paging and thus improve performance. If unsure, say Y.
+
+config PRESWAP
+ bool "Swap pages to transcendent memory"
+ depends on XEN
+ select TMEM
+ help
+ Allows the transcendent memory pool to be used as a pseudo-swap
+ device which, under some circumstances, will greatly reduce
+ swapping and thus improve performance. If unsure, say Y.
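+
+# For example, a .config with all of the above enabled (CONFIG_XEN=y is
+# assumed; TMEM is selected automatically by the other two options):
+#   CONFIG_TMEM=y
+#   CONFIG_PRECACHE=y
+#   CONFIG_PRESWAP=y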
+
prio_tree.o util.o mmzone.o vmstat.o $(mmu-y)
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_TMEM) += tmem.o
+obj-$(CONFIG_PRESWAP) += preswap.o
+obj-$(CONFIG_PRECACHE) += precache.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/cpuset.h>
+#include <linux/precache.h>
#include "filemap.h"
#include "internal.h"
{
struct address_space *mapping = page->mapping;
+ /*
+ * if we're uptodate, flush out into the precache, otherwise
+ * invalidate any existing precache entries. We can't leave
+ * stale data around in the precache once our page is gone
+ */
+ if (PageUptodate(page))
+ precache_put(page->mapping, page->index, page);
+ else
+ precache_flush(page->mapping, page->index);
+
radix_tree_delete(&mapping->page_tree, page->index);
page->mapping = NULL;
mapping->nrpages--;
ret = -ENOMEM;
goto out;
}
+
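+	/*
+	 * Try preswap first: if tmem accepts the page, the swap-out is
+	 * already complete and the bio can be dropped.
+	 */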
+ set_page_writeback(page);
+ if (preswap_put(page) == 1) {
+ unlock_page(page);
+ end_page_writeback(page);
+ bio_put(bio);
+ goto out;
+ }
+
if (wbc->sync_mode == WB_SYNC_ALL)
rw |= (1 << BIO_RW_SYNC);
count_vm_event(PSWPOUT);
- set_page_writeback(page);
unlock_page(page);
submit_bio(rw, bio);
out:
BUG_ON(!PageLocked(page));
ClearPageUptodate(page);
+
+ if (preswap_get(page) == 1) {
+ SetPageUptodate(page);
+ unlock_page(page);
+ goto out;
+ }
+
bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
end_swap_bio_read);
if (bio == NULL) {
--- /dev/null
+/*
+ * linux/mm/precache.c
+ *
+ * Implements "precache" for filesystems/pagecache on top of transcendent
+ * memory ("tmem") API. A filesystem creates an "ephemeral tmem pool"
+ * and retains the returned pool_id in its superblock. Clean pages evicted
+ * from pagecache may be "put" into the pool and associated with a "handle"
+ * consisting of the pool_id, an object (inode) id, and an index (page offset).
+ * Note that the page is copied to tmem; no kernel mappings are changed.
+ * If the page is later needed, the filesystem (or VFS) issues a "get", passing
+ * the same handle and an empty pageframe. If successful, the page is copied
+ * into the pageframe and a disk read is avoided. But since the tmem pool
+ * is of indeterminate size, a "put" page has indeterminate longevity
+ * ("ephemeral"), and the "get" may fail, in which case the filesystem must
+ * read the page from disk as before. Note that the filesystem/pagecache are
+ * responsible for maintaining coherency between the pagecache, precache,
+ * and the disk, for which "flush page" and "flush object" actions are
+ * provided. And when a filesystem is unmounted, it must "destroy" the pool.
+ *
+ * Two types of pools may be created for a precache: "private" or "shared".
+ * For a private pool, a successful "get" always flushes, implementing
+ * exclusive semantics; for a "shared" pool (which is intended for use by
+ * co-resident nodes of a cluster filesystem), the "flush" is not guaranteed.
+ * In either case, a failed "duplicate" put (overwrite) always guarantees
+ * that the old data is flushed.
+ *
+ * Note also that multiple accesses to a tmem pool may be concurrent and any
+ * ordering must be guaranteed by the caller.
+ *
+ * Copyright (C) 2008,2009 Dan Magenheimer, Oracle Corp.
+ */
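+
+/*
+ * Typical lifecycle, as wired up elsewhere in this patch (a sketch of
+ * the intended call order, not additional control flow):
+ *
+ *	precache_init(sb)                         mount: new ephemeral pool
+ *	precache_put(mapping, index, page)        clean page leaves pagecache
+ *	precache_get(mapping, index, empty_page)  a later read may hit here
+ *	precache_flush(mapping, index)            page truncated/invalidated
+ *	precache_flush_inode(mapping)             inode's pages invalidated
+ *	precache_flush_filesystem(sb)             umount: pool destroyed
+ */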
+
+#include <linux/precache.h>
+#include <linux/module.h>
+#include "tmem.h"
+
+static int precache_auto_allocate; /* set to 1 to auto-allocate */
+
+int precache_put(struct address_space *mapping, unsigned long index,
+ struct page *page)
+{
+ u32 tmem_pool = mapping->host->i_sb->precache_poolid;
+ u64 obj = (unsigned long) mapping->host->i_ino;
+ u32 ind = (u32) index;
+ unsigned long mfn = pfn_to_mfn(page_to_pfn(page));
+ int ret;
+
+ if ((s32)tmem_pool < 0) {
+ if (!precache_auto_allocate)
+ return 0;
+ /* a put on a non-existent precache may auto-allocate one */
+		ret = tmem_new_pool(0, 0, 0);
+		if (ret < 0)
+			return 0;
+		tmem_pool = ret;
+ printk(KERN_INFO
+ "Mapping superblock for s_id=%s to precache_id=%d\n",
+ mapping->host->i_sb->s_id, tmem_pool);
+ mapping->host->i_sb->precache_poolid = tmem_pool;
+ }
+ if (ind != index)
+ return 0;
+ mb(); /* ensure page is quiescent; tmem may address it with an alias */
+ return tmem_put_page(tmem_pool, obj, ind, mfn);
+}
+
+int precache_get(struct address_space *mapping, unsigned long index,
+ struct page *empty_page)
+{
+ u32 tmem_pool = mapping->host->i_sb->precache_poolid;
+ u64 obj = (unsigned long) mapping->host->i_ino;
+ u32 ind = (u32) index;
+ unsigned long mfn = pfn_to_mfn(page_to_pfn(empty_page));
+
+ if ((s32)tmem_pool < 0)
+ return 0;
+ if (ind != index)
+ return 0;
+
+ return tmem_get_page(tmem_pool, obj, ind, mfn);
+}
+EXPORT_SYMBOL(precache_get);
+
+int precache_flush(struct address_space *mapping, unsigned long index)
+{
+ u32 tmem_pool = mapping->host->i_sb->precache_poolid;
+ u64 obj = (unsigned long) mapping->host->i_ino;
+ u32 ind = (u32) index;
+
+ if ((s32)tmem_pool < 0)
+ return 0;
+ if (ind != index)
+ return 0;
+
+ return tmem_flush_page(tmem_pool, obj, ind);
+}
+EXPORT_SYMBOL(precache_flush);
+
+int precache_flush_inode(struct address_space *mapping)
+{
+ u32 tmem_pool = mapping->host->i_sb->precache_poolid;
+ u64 obj = (unsigned long) mapping->host->i_ino;
+
+ if ((s32)tmem_pool < 0)
+ return 0;
+
+ return tmem_flush_object(tmem_pool, obj);
+}
+EXPORT_SYMBOL(precache_flush_inode);
+
+int precache_flush_filesystem(struct super_block *sb)
+{
+ u32 tmem_pool = sb->precache_poolid;
+ int ret;
+
+ if ((s32)tmem_pool < 0)
+ return 0;
+ ret = tmem_destroy_pool(tmem_pool);
+ if (!ret)
+ return 0;
+	printk(KERN_INFO
+		"Unmapping superblock for s_id=%s from precache_id=%d\n",
+		sb->s_id, tmem_pool);
+	sb->precache_poolid = -1;
+ return 1;
+}
+EXPORT_SYMBOL(precache_flush_filesystem);
+
+void precache_init(struct super_block *sb)
+{
+ sb->precache_poolid = tmem_new_pool(0, 0, 0);
+}
+EXPORT_SYMBOL(precache_init);
+
+void shared_precache_init(struct super_block *sb, char *uuid)
+{
+ u64 uuid_lo = *(u64 *)uuid;
+ u64 uuid_hi = *(u64 *)(&uuid[8]);
+ sb->precache_poolid = tmem_new_pool(uuid_lo, uuid_hi, TMEM_POOL_SHARED);
+}
+EXPORT_SYMBOL(shared_precache_init);
--- /dev/null
+/*
+ * linux/mm/preswap.c
+ *
+ * Implements a fast "preswap" on top of the transcendent memory ("tmem") API.
+ * When a swapdisk is enabled (with swapon), a "private persistent tmem pool"
+ * is created along with a bit-per-page preswap_map. When swapping occurs
+ * and a page is about to be written to disk, a "put" into the pool may first
+ * be attempted by passing the pageframe to be swapped, along with a "handle"
+ * consisting of a pool_id, an object id, and an index. Since the pool is of
+ * indeterminate size, the "put" may be rejected, in which case the page
+ * is swapped to disk as normal. If the "put" is successful, the page is
+ * copied to tmem and the preswap_map records the success. Later, when
+ * the page needs to be swapped in, the preswap_map is checked and, if set,
+ * the page may be obtained with a "get" operation. Note that the swap
+ * subsystem is responsible for: maintaining coherency between the swapcache,
+ * preswap, and the swapdisk; for evicting stale pages from preswap; and for
+ * emptying preswap when swapoff is performed. The "flush page" and "flush
+ * object" actions are provided for this.
+ *
+ * Note that if a "duplicate put" is performed to overwrite a page and
+ * the "put" operation fails, the page (and old data) is flushed and lost.
+ * Also note that multiple accesses to a tmem pool may be concurrent and
+ * any ordering must be guaranteed by the caller.
+ *
+ * Copyright (C) 2008,2009 Dan Magenheimer, Oracle Corp.
+ */
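+
+/*
+ * Typical lifecycle, as wired up in mm/page_io.c and mm/swapfile.c (a
+ * sketch of the intended call order, not additional control flow):
+ *
+ *	preswap_init(type)            swapon: one persistent pool for all types
+ *	preswap_put(page)             swap-out may be satisfied by tmem
+ *	preswap_get(page)             swap-in when the preswap_map bit is set
+ *	preswap_flush(type, offset)   swap slot freed
+ *	preswap_flush_area(type)      swapoff: drop all pages for this type
+ */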
+
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sysctl.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/capability.h>
+#include <linux/uaccess.h>
+#include "tmem.h"
+
+static u32 preswap_poolid = -1; /* if negative, preswap will never call tmem */
+
+const unsigned long preswap_zero = 0, preswap_infinity = ~0UL; /* for sysctl */
+
+/*
+ * Swizzling increases objects per swaptype, increasing tmem concurrency
+ * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS
+ */
+#define SWIZ_BITS 4
+#define SWIZ_MASK ((1 << SWIZ_BITS) - 1)
+#define oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
+#define iswiz(_ind) (_ind >> SWIZ_BITS)
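+
+/*
+ * Example with SWIZ_BITS == 4: swap type 1, offset 0x1234 becomes object
+ * oswiz(1, 0x1234) == (1 << 4) | (0x1234 & 0xf) == 0x14 and index
+ * iswiz(0x1234) == 0x123, so consecutive offsets fan out across sixteen
+ * tmem objects per swap type.
+ */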
+
+/*
+ * preswap_map test/set/clear operations (must be atomic)
+ */
+
+int preswap_test(struct swap_info_struct *sis, unsigned long offset)
+{
+ if (!sis->preswap_map)
+ return 0;
+ return test_bit(offset % BITS_PER_LONG,
+ &sis->preswap_map[offset/BITS_PER_LONG]);
+}
+
+static inline void preswap_set(struct swap_info_struct *sis,
+ unsigned long offset)
+{
+ if (!sis->preswap_map)
+ return;
+ set_bit(offset % BITS_PER_LONG,
+ &sis->preswap_map[offset/BITS_PER_LONG]);
+}
+
+static inline void preswap_clear(struct swap_info_struct *sis,
+ unsigned long offset)
+{
+ if (!sis->preswap_map)
+ return;
+ clear_bit(offset % BITS_PER_LONG,
+ &sis->preswap_map[offset/BITS_PER_LONG]);
+}
+
+/*
+ * preswap tmem operations
+ */
+
+/* returns 1 if the page was successfully put into preswap, 0 if the page
+ * was declined, and -ERRNO for a specific error */
+int preswap_put(struct page *page)
+{
+ swp_entry_t entry = { .val = page_private(page), };
+ unsigned type = swp_type(entry);
+ pgoff_t offset = swp_offset(entry);
+ u64 ind64 = (u64)offset;
+ u32 ind = (u32)offset;
+ unsigned long mfn = pfn_to_mfn(page_to_pfn(page));
+ struct swap_info_struct *sis = get_swap_info_struct(type);
+ int dup = 0, ret;
+
+ if ((s32)preswap_poolid < 0)
+ return 0;
+ if (ind64 != ind)
+ return 0;
+ if (preswap_test(sis, offset))
+ dup = 1;
+ mb(); /* ensure page is quiescent; tmem may address it with an alias */
+ ret = tmem_put_page(preswap_poolid, oswiz(type, ind), iswiz(ind), mfn);
+ if (ret == 1) {
+ preswap_set(sis, offset);
+ if (!dup)
+ sis->preswap_pages++;
+ } else if (dup) {
+ /* failed dup put always results in an automatic flush of
+ * the (older) page from preswap */
+ preswap_clear(sis, offset);
+ sis->preswap_pages--;
+ }
+ return ret;
+}
+
+/* returns 1 if the page was successfully gotten from preswap, 0 if the page
+ * was not present (should never happen!), and -ERRNO for a specific error */
+int preswap_get(struct page *page)
+{
+ swp_entry_t entry = { .val = page_private(page), };
+ unsigned type = swp_type(entry);
+ pgoff_t offset = swp_offset(entry);
+ u64 ind64 = (u64)offset;
+ u32 ind = (u32)offset;
+ unsigned long mfn = pfn_to_mfn(page_to_pfn(page));
+ struct swap_info_struct *sis = get_swap_info_struct(type);
+ int ret;
+
+ if ((s32)preswap_poolid < 0)
+ return 0;
+ if (ind64 != ind)
+ return 0;
+ if (!preswap_test(sis, offset))
+ return 0;
+ ret = tmem_get_page(preswap_poolid, oswiz(type, ind), iswiz(ind), mfn);
+ return ret;
+}
+
+/* flush a single page from preswap */
+void preswap_flush(unsigned type, unsigned long offset)
+{
+ u64 ind64 = (u64)offset;
+ u32 ind = (u32)offset;
+ struct swap_info_struct *sis = get_swap_info_struct(type);
+
+ if ((s32)preswap_poolid < 0)
+ return;
+ if (ind64 != ind)
+ return;
+ if (preswap_test(sis, offset)) {
+		(void)tmem_flush_page(preswap_poolid,
+				oswiz(type, ind), iswiz(ind));
+ sis->preswap_pages--;
+ preswap_clear(sis, offset);
+ }
+}
+
+/* flush all pages from the passed swaptype */
+void preswap_flush_area(unsigned type)
+{
+ struct swap_info_struct *sis = get_swap_info_struct(type);
+ int ind;
+
+ if ((s32)preswap_poolid < 0)
+ return;
+ for (ind = SWIZ_MASK; ind >= 0; ind--)
+ (void)tmem_flush_object(preswap_poolid, oswiz(type, ind));
+ sis->preswap_pages = 0;
+}
+
+void preswap_init(unsigned type)
+{
+ /* only need one tmem pool for all swap types */
+ if ((s32)preswap_poolid >= 0)
+ return;
+ preswap_poolid = tmem_new_pool(0, 0, TMEM_POOL_PERSIST);
+	if ((s32)preswap_poolid < 0)
+ return;
+}
swap_list.next = p - swap_info;
nr_swap_pages++;
p->inuse_pages--;
+ preswap_flush(p - swap_info, offset);
}
}
return count;
* Recycle to start on reaching the end, returning 0 when empty.
*/
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
- unsigned int prev)
+ unsigned int prev, unsigned int preswap)
{
unsigned int max = si->max;
unsigned int i = prev;
prev = 0;
i = 1;
}
+ if (preswap) {
+ if (preswap_test(si, i))
+ break;
+ else
+ continue;
+ }
count = si->swap_map[i];
if (count && count != SWAP_MAP_BAD)
break;
* We completely avoid races by reading each swap page in advance,
* and then search for the process using it. All the necessary
* page table adjustments can then be made atomically.
+ *
+ * if the boolean preswap is true, only unuse pages_to_unuse pages;
+ * pages_to_unuse==0 means all pages
*/
-static int try_to_unuse(unsigned int type)
+static int try_to_unuse(unsigned int type, unsigned int preswap,
+ unsigned long pages_to_unuse)
{
struct swap_info_struct * si = &swap_info[type];
struct mm_struct *start_mm;
* one pass through swap_map is enough, but not necessarily:
* there are races when an instance of an entry might be missed.
*/
- while ((i = find_next_to_unuse(si, i)) != 0) {
+ while ((i = find_next_to_unuse(si, i, preswap)) != 0) {
if (signal_pending(current)) {
retval = -EINTR;
break;
* interactive performance.
*/
cond_resched();
+ if (preswap && pages_to_unuse && !--pages_to_unuse)
+ break;
}
mmput(start_mm);
spin_unlock(&swap_lock);
current->flags |= PF_SWAPOFF;
- err = try_to_unuse(type);
+ err = try_to_unuse(type, 0, 0);
current->flags &= ~PF_SWAPOFF;
if (err) {
swap_map = p->swap_map;
p->swap_map = NULL;
p->flags = 0;
+ preswap_flush_area(p - swap_info);
spin_unlock(&swap_lock);
mutex_unlock(&swapon_mutex);
vfree(swap_map);
+#ifdef CONFIG_PRESWAP
+	vfree(p->preswap_map);
+	p->preswap_map = NULL;
+#endif
inode = mapping->host;
if (S_ISBLK(inode->i_mode)) {
struct block_device *bdev = I_BDEV(inode);
error = 0;
memset(p->swap_map, 0, maxpages * sizeof(short));
+#ifdef CONFIG_PRESWAP
+	/* one bit of preswap_map per page of swap space */
+	p->preswap_map = vmalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
+	if (p->preswap_map)
+		memset(p->preswap_map, 0,
+		       BITS_TO_LONGS(maxpages) * sizeof(long));
+#endif
for (i = 0; i < swap_header->info.nr_badpages; i++) {
int page_nr = swap_header->info.badpages[i];
if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
} else {
swap_info[prev].next = p - swap_info;
}
+ preswap_init(p - swap_info);
spin_unlock(&swap_lock);
mutex_unlock(&swapon_mutex);
error = 0;
if (!page_cluster) /* no readahead */
return 0;
+ if (preswap_test(swapdev, swp_offset(entry)))
+ return 0;
toff = (swp_offset(entry) >> page_cluster) << page_cluster;
if (!toff) /* first page is swap header */
toff++, i--;
break;
if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
break;
+ /* Don't read in preswap pages */
+ if (preswap_test(swapdev, toff))
+ break;
toff++;
ret++;
} while (--i);
spin_unlock(&swap_lock);
return ret;
}
+
+#ifdef CONFIG_PRESWAP
+/*
+ * preswap infrastructure functions
+ */
+
+/* code structure leveraged from sys_swapoff */
+void preswap_shrink(unsigned long target_pages)
+{
+ struct swap_info_struct *si = NULL;
+ unsigned long total_pages = 0, total_pages_to_unuse;
+ unsigned long pages = 0, unuse_pages = 0;
+ int type;
+ int wrapped = 0;
+
+ do {
+ /*
+ * we don't want to hold swap_lock while doing a very
+ * lengthy try_to_unuse, but swap_list may change
+ * so restart scan from swap_list.head each time
+ */
+ spin_lock(&swap_lock);
+ total_pages = 0;
+ for (type = swap_list.head; type >= 0; type = si->next) {
+ si = get_swap_info_struct(type);
+ total_pages += si->preswap_pages;
+ }
+ if (total_pages <= target_pages) {
+ spin_unlock(&swap_lock);
+ return;
+ }
+ total_pages_to_unuse = total_pages - target_pages;
+ for (type = swap_list.head; type >= 0; type = si->next) {
+ si = get_swap_info_struct(type);
+ if (total_pages_to_unuse < si->preswap_pages)
+ pages = unuse_pages = total_pages_to_unuse;
+ else {
+ pages = si->preswap_pages;
+ unuse_pages = 0; /* unuse all */
+ }
+ if (security_vm_enough_memory(pages))
+ continue;
+ vm_unacct_memory(pages);
+ break;
+ }
+ spin_unlock(&swap_lock);
+ if (type < 0)
+ return;
+ current->flags |= PF_SWAPOFF;
+ (void)try_to_unuse(type, 1, unuse_pages);
+ current->flags &= ~PF_SWAPOFF;
+ wrapped++;
+ } while (wrapped <= 3);
+}
+
+
+#ifdef CONFIG_SYSCTL
+/* cat /proc/sys/vm/preswap provides total number of pages in preswap
+ * across all swaptypes.  echo N > /proc/sys/vm/preswap attempts to shrink
+ * preswap page usage to N (usually 0). */
+int preswap_sysctl_handler(ctl_table *table, int write,
+ struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+ unsigned long npages;
+ int type;
+ unsigned long totalpages = 0;
+ struct swap_info_struct *si = NULL;
+
+ /* modeled after hugetlb_sysctl_handler in mm/hugetlb.c */
+ if (!write) {
+ spin_lock(&swap_lock);
+ for (type = swap_list.head; type >= 0; type = si->next) {
+ si = get_swap_info_struct(type);
+ totalpages += si->preswap_pages;
+ }
+ spin_unlock(&swap_lock);
+ npages = totalpages;
+ }
+ table->data = &npages;
+ table->maxlen = sizeof(unsigned long);
+ proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+
+ if (write)
+ preswap_shrink(npages);
+
+ return 0;
+}
+#endif
+#endif /* CONFIG_PRESWAP */
--- /dev/null
+/*
+ * Xen implementation for transcendent memory (tmem)
+ *
+ * Dan Magenheimer <dan.magenheimer@oracle.com> 2009
+ */
+
+#include <linux/types.h>
+#include <xen/interface/xen.h>
+#include <asm/hypervisor.h>
+
+int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, u64 object, u32 index,
+ unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len)
+{
+ struct tmem_op op;
+ int rc = 0;
+
+ op.cmd = tmem_cmd;
+ op.pool_id = tmem_pool;
+ op.u.gen.object = object;
+ op.u.gen.index = index;
+ op.u.gen.tmem_offset = tmem_offset;
+ op.u.gen.pfn_offset = pfn_offset;
+ op.u.gen.len = len;
+ op.u.gen.cmfn = gmfn;
+ rc = HYPERVISOR_tmem_op(&op);
+ return rc;
+}
+
+int xen_tmem_new_pool(uint32_t tmem_cmd, uint64_t uuid_lo,
+ uint64_t uuid_hi, uint32_t flags)
+{
+ struct tmem_op op;
+ int rc = 0;
+
+ op.cmd = tmem_cmd;
+ op.u.new.uuid[0] = uuid_lo;
+ op.u.new.uuid[1] = uuid_hi;
+ op.u.new.flags = flags;
+ rc = HYPERVISOR_tmem_op(&op);
+ return rc;
+}
--- /dev/null
+/*
+ * linux/mm/tmem.h
+ *
+ * Interface to transcendent memory, used by mm/precache.c and mm/preswap.c
+ * Currently implemented on XEN, but may be implemented elsewhere in future.
+ *
+ * Copyright (C) 2008,2009 Dan Magenheimer, Oracle Corp.
+ */
+
+#ifdef CONFIG_XEN
+#include <xen/interface/xen.h>
+
+/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */
+#define TMEM_POOL_MIN_PAGESHIFT 12
+#define TMEM_POOL_PAGEORDER (PAGE_SHIFT - TMEM_POOL_MIN_PAGESHIFT)
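+/* e.g. with 4K pages (PAGE_SHIFT == 12), TMEM_POOL_PAGEORDER is 0 */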
+
+extern int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, u64 object, u32 index,
+ unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len);
+extern int xen_tmem_new_pool(u32 tmem_cmd, u64 uuid_lo, u64 uuid_hi, u32 flags);
+
+static inline int tmem_put_page(u32 pool_id, u64 object, u32 index,
+ unsigned long gmfn)
+{
+ return xen_tmem_op(TMEM_PUT_PAGE, pool_id, object, index,
+ gmfn, 0, 0, 0);
+}
+
+static inline int tmem_get_page(u32 pool_id, u64 object, u32 index,
+ unsigned long gmfn)
+{
+ return xen_tmem_op(TMEM_GET_PAGE, pool_id, object, index,
+ gmfn, 0, 0, 0);
+}
+
+static inline int tmem_flush_page(u32 pool_id, u64 object, u32 index)
+{
+ return xen_tmem_op(TMEM_FLUSH_PAGE, pool_id, object, index,
+ 0, 0, 0, 0);
+}
+
+static inline int tmem_flush_object(u32 pool_id, u64 object)
+{
+ return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, object, 0, 0, 0, 0, 0);
+}
+
+static inline int tmem_new_pool(u64 uuid_lo, u64 uuid_hi, u32 flags)
+{
+ BUILD_BUG_ON((TMEM_POOL_PAGEORDER < 0) ||
+ (TMEM_POOL_PAGEORDER >= TMEM_POOL_PAGESIZE_MASK));
+ flags |= TMEM_POOL_PAGEORDER << TMEM_POOL_PAGESIZE_SHIFT;
+ return xen_tmem_new_pool(TMEM_NEW_POOL, uuid_lo, uuid_hi, flags);
+}
+
+static inline int tmem_destroy_pool(u32 pool_id)
+{
+ return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, 0, 0, 0, 0, 0, 0);
+}
+#else
+struct tmem_op {
+ u32 cmd;
+ s32 pool_id; /* private > 0; shared < 0; 0 is invalid */
+ union {
+ struct { /* for cmd == TMEM_NEW_POOL */
+ u64 uuid[2];
+ u32 flags;
+ } new;
+ struct { /* for cmd == TMEM_CONTROL */
+ u32 subop;
+ u32 cli_id;
+ u32 arg1;
+ u32 arg2;
+ void *buf;
+ } ctrl;
+ struct {
+ u64 object;
+ u32 index;
+ u32 tmem_offset;
+ u32 pfn_offset;
+ u32 len;
+ unsigned long pfn; /* page frame */
+ } gen;
+ } u;
+};
+#endif
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
+#include <linux/precache.h>
#include <linux/buffer_head.h> /* grr. try_to_release_page,
do_invalidatepage */
-
static inline void truncate_partial_page(struct page *page, unsigned partial)
{
memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
+ precache_flush(page->mapping, page->index);
if (PagePrivate(page))
do_invalidatepage(page, partial);
}
ClearPageUptodate(page);
ClearPageMappedToDisk(page);
remove_from_page_cache(page);
+	/* This must come after remove_from_page_cache(), which may itself
+	 * precache_put() this page; the flush ensures nothing stale stays
+	 * behind in the precache.
+	 */
+ precache_flush(mapping, page->index);
page_cache_release(page); /* pagecache ref */
}
pgoff_t next;
int i;
+ precache_flush_inode(mapping);
if (mapping->nrpages == 0)
return;
}
pagevec_release(&pvec);
}
+ precache_flush_inode(mapping);
}
EXPORT_SYMBOL(truncate_inode_pages_range);
int did_range_unmap = 0;
int wrapped = 0;
+ precache_flush_inode(mapping);
pagevec_init(&pvec, 0);
next = start;
while (next <= end && !ret && !wrapped &&
pagevec_release(&pvec);
cond_resched();
}
+ precache_flush_inode(mapping);
return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);