Blame - mm/zswap.c - SHIFTPHONES/mainline/linux

blob: 001474c1a59468afe357bfd8bbd369ab9a54502d [file] [log] [blame]

Seth Jennings	2b28111	2013-07-10 16:05:03 -0700	[diff] [blame]	1	/*
				2	* zswap.c - zswap driver file
				3	*
				4	* zswap is a backend for frontswap that takes pages that are in the process
				5	* of being swapped out and attempts to compress and store them in a
				6	* RAM-based memory pool. This can result in a significant I/O reduction on
				7	* the swap device and, in the case where decompressing from RAM is faster
				8	* than reading from the swap device, can also improve workload performance.
				9	*
				10	* Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com>
				11	*
				12	* This program is free software; you can redistribute it and/or
				13	* modify it under the terms of the GNU General Public License
				14	* as published by the Free Software Foundation; either version 2
				15	* of the License, or (at your option) any later version.
				16	*
				17	* This program is distributed in the hope that it will be useful,
				18	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				20	* GNU General Public License for more details.
				21	*/
				22
				23	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
				24
				25	#include <linux/module.h>
				26	#include <linux/cpu.h>
				27	#include <linux/highmem.h>
				28	#include <linux/slab.h>
				29	#include <linux/spinlock.h>
				30	#include <linux/types.h>
				31	#include <linux/atomic.h>
				32	#include <linux/frontswap.h>
				33	#include <linux/rbtree.h>
				34	#include <linux/swap.h>
				35	#include <linux/crypto.h>
				36	#include <linux/mempool.h>
				37	#include <linux/zbud.h>
				38
				39	#include <linux/mm_types.h>
				40	#include <linux/page-flags.h>
				41	#include <linux/swapops.h>
				42	#include <linux/writeback.h>
				43	#include <linux/pagemap.h>
				44
				45	/*********************************
				46	* statistics
				47	**********************************/
				48	/* Number of memory pages used by the compressed pool */
				49	static u64 zswap_pool_pages;
				50	/* The number of compressed pages currently stored in zswap */
				51	static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
				52
				53	/*
				54	* The statistics below are not protected from concurrent access for
				55	* performance reasons so they may not be a 100% accurate. However,
				56	* they do provide useful information on roughly how many times a
				57	* certain event is occurring.
				58	*/
				59
				60	/* Pool limit was hit (see zswap_max_pool_percent) */
				61	static u64 zswap_pool_limit_hit;
				62	/* Pages written back when pool limit was reached */
				63	static u64 zswap_written_back_pages;
				64	/* Store failed due to a reclaim failure after pool limit was reached */
				65	static u64 zswap_reject_reclaim_fail;
				66	/* Compressed page was too big for the allocator to (optimally) store */
				67	static u64 zswap_reject_compress_poor;
				68	/* Store failed because underlying allocator could not get memory */
				69	static u64 zswap_reject_alloc_fail;
				70	/* Store failed because the entry metadata could not be allocated (rare) */
				71	static u64 zswap_reject_kmemcache_fail;
				72	/* Duplicate store was encountered (rare) */
				73	static u64 zswap_duplicate_entry;
				74
				75	/*********************************
				76	* tunables
				77	**********************************/
				78	/* Enable/disable zswap (disabled by default, fixed at boot for now) */
				79	static bool zswap_enabled __read_mostly;
				80	module_param_named(enabled, zswap_enabled, bool, 0);
				81
				82	/* Compressor to be used by zswap (fixed at boot for now) */
				83	#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
				84	static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
				85	module_param_named(compressor, zswap_compressor, charp, 0);
				86
				87	/* The maximum percentage of memory that the compressed pool can occupy */
				88	static unsigned int zswap_max_pool_percent = 20;
				89	module_param_named(max_pool_percent,
				90	zswap_max_pool_percent, uint, 0644);
				91
				92	/*********************************
				93	* compression functions
				94	**********************************/
				95	/* per-cpu compression transforms */
				96	static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms;
				97
				98	enum comp_op {
				99	ZSWAP_COMPOP_COMPRESS,
				100	ZSWAP_COMPOP_DECOMPRESS
				101	};
				102
				103	static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen,
				104	u8 dst, unsigned int dlen)
				105	{
				106	struct crypto_comp *tfm;
				107	int ret;
				108
				109	tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu());
				110	switch (op) {
				111	case ZSWAP_COMPOP_COMPRESS:
				112	ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
				113	break;
				114	case ZSWAP_COMPOP_DECOMPRESS:
				115	ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
				116	break;
				117	default:
				118	ret = -EINVAL;
				119	}
				120
				121	put_cpu();
				122	return ret;
				123	}
				124
				125	static int __init zswap_comp_init(void)
				126	{
				127	if (!crypto_has_comp(zswap_compressor, 0, 0)) {
				128	pr_info("%s compressor not available\n", zswap_compressor);
				129	/* fall back to default compressor */
				130	zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
				131	if (!crypto_has_comp(zswap_compressor, 0, 0))
				132	/* can't even load the default compressor */
				133	return -ENODEV;
				134	}
				135	pr_info("using %s compressor\n", zswap_compressor);
				136
				137	/* alloc percpu transforms */
				138	zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
				139	if (!zswap_comp_pcpu_tfms)
				140	return -ENOMEM;
				141	return 0;
				142	}
				143
				144	static void zswap_comp_exit(void)
				145	{
				146	/* free percpu transforms */
				147	if (zswap_comp_pcpu_tfms)
				148	free_percpu(zswap_comp_pcpu_tfms);
				149	}
				150
				151	/*********************************
				152	* data structures
				153	**********************************/
				154	/*
				155	* struct zswap_entry
				156	*
				157	* This structure contains the metadata for tracking a single compressed
				158	* page within zswap.
				159	*
				160	* rbnode - links the entry into red-black tree for the appropriate swap type
				161	* refcount - the number of outstanding reference to the entry. This is needed
				162	* to protect against premature freeing of the entry by code
				163	* concurent calls to load, invalidate, and writeback. The lock
				164	* for the zswap_tree structure that contains the entry must
				165	* be held while changing the refcount. Since the lock must
				166	* be held, there is no reason to also make refcount atomic.
				167	* offset - the swap offset for the entry. Index into the red-black tree.
				168	* handle - zsmalloc allocation handle that stores the compressed page data
				169	* length - the length in bytes of the compressed page data. Needed during
				170	* decompression
				171	*/
				172	struct zswap_entry {
				173	struct rb_node rbnode;
				174	pgoff_t offset;
				175	int refcount;
				176	unsigned int length;
				177	unsigned long handle;
				178	};
				179
				180	struct zswap_header {
				181	swp_entry_t swpentry;
				182	};
				183
				184	/*
				185	* The tree lock in the zswap_tree struct protects a few things:
				186	* - the rbtree
				187	* - the refcount field of each entry in the tree
				188	*/
				189	struct zswap_tree {
				190	struct rb_root rbroot;
				191	spinlock_t lock;
				192	struct zbud_pool *pool;
				193	};
				194
				195	static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
				196
				197	/*********************************
				198	* zswap entry functions
				199	**********************************/
				200	static struct kmem_cache *zswap_entry_cache;
				201
				202	static int zswap_entry_cache_create(void)
				203	{
				204	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
				205	return (zswap_entry_cache == NULL);
				206	}
				207
				208	static void zswap_entry_cache_destory(void)
				209	{
				210	kmem_cache_destroy(zswap_entry_cache);
				211	}
				212
				213	static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
				214	{
				215	struct zswap_entry *entry;
				216	entry = kmem_cache_alloc(zswap_entry_cache, gfp);
				217	if (!entry)
				218	return NULL;
				219	entry->refcount = 1;
				220	return entry;
				221	}
				222
				223	static void zswap_entry_cache_free(struct zswap_entry *entry)
				224	{
				225	kmem_cache_free(zswap_entry_cache, entry);
				226	}
				227
				228	/* caller must hold the tree lock */
				229	static void zswap_entry_get(struct zswap_entry *entry)
				230	{
				231	entry->refcount++;
				232	}
				233
				234	/* caller must hold the tree lock */
				235	static int zswap_entry_put(struct zswap_entry *entry)
				236	{
				237	entry->refcount--;
				238	return entry->refcount;
				239	}
				240
				241	/*********************************
				242	* rbtree functions
				243	**********************************/
				244	static struct zswap_entry zswap_rb_search(struct rb_root root, pgoff_t offset)
				245	{
				246	struct rb_node *node = root->rb_node;
				247	struct zswap_entry *entry;
				248
				249	while (node) {
				250	entry = rb_entry(node, struct zswap_entry, rbnode);
				251	if (entry->offset > offset)
				252	node = node->rb_left;
				253	else if (entry->offset < offset)
				254	node = node->rb_right;
				255	else
				256	return entry;
				257	}
				258	return NULL;
				259	}
				260
				261	/*
				262	* In the case that a entry with the same offset is found, a pointer to
				263	* the existing entry is stored in dupentry and the function returns -EEXIST
				264	*/
				265	static int zswap_rb_insert(struct rb_root root, struct zswap_entry entry,
				266	struct zswap_entry **dupentry)
				267	{
				268	struct rb_node *link = &root->rb_node, parent = NULL;
				269	struct zswap_entry *myentry;
				270
				271	while (*link) {
				272	parent = *link;
				273	myentry = rb_entry(parent, struct zswap_entry, rbnode);
				274	if (myentry->offset > entry->offset)
				275	link = &(*link)->rb_left;
				276	else if (myentry->offset < entry->offset)
				277	link = &(*link)->rb_right;
				278	else {
				279	*dupentry = myentry;
				280	return -EEXIST;
				281	}
				282	}
				283	rb_link_node(&entry->rbnode, parent, link);
				284	rb_insert_color(&entry->rbnode, root);
				285	return 0;
				286	}
				287
				288	/*********************************
				289	* per-cpu code
				290	**********************************/
				291	static DEFINE_PER_CPU(u8 *, zswap_dstmem);
				292
				293	static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
				294	{
				295	struct crypto_comp *tfm;
				296	u8 *dst;
				297
				298	switch (action) {
				299	case CPU_UP_PREPARE:
				300	tfm = crypto_alloc_comp(zswap_compressor, 0, 0);
				301	if (IS_ERR(tfm)) {
				302	pr_err("can't allocate compressor transform\n");
				303	return NOTIFY_BAD;
				304	}
				305	*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
				306	dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL);
				307	if (!dst) {
				308	pr_err("can't allocate compressor buffer\n");
				309	crypto_free_comp(tfm);
				310	*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
				311	return NOTIFY_BAD;
				312	}
				313	per_cpu(zswap_dstmem, cpu) = dst;
				314	break;
				315	case CPU_DEAD:
				316	case CPU_UP_CANCELED:
				317	tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu);
				318	if (tfm) {
				319	crypto_free_comp(tfm);
				320	*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
				321	}
				322	dst = per_cpu(zswap_dstmem, cpu);
				323	kfree(dst);
				324	per_cpu(zswap_dstmem, cpu) = NULL;
				325	break;
				326	default:
				327	break;
				328	}
				329	return NOTIFY_OK;
				330	}
				331
				332	static int zswap_cpu_notifier(struct notifier_block *nb,
				333	unsigned long action, void *pcpu)
				334	{
				335	unsigned long cpu = (unsigned long)pcpu;
				336	return __zswap_cpu_notifier(action, cpu);
				337	}
				338
				339	static struct notifier_block zswap_cpu_notifier_block = {
				340	.notifier_call = zswap_cpu_notifier
				341	};
				342
				343	static int zswap_cpu_init(void)
				344	{
				345	unsigned long cpu;
				346
				347	get_online_cpus();
				348	for_each_online_cpu(cpu)
				349	if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
				350	goto cleanup;
				351	register_cpu_notifier(&zswap_cpu_notifier_block);
				352	put_online_cpus();
				353	return 0;
				354
				355	cleanup:
				356	for_each_online_cpu(cpu)
				357	__zswap_cpu_notifier(CPU_UP_CANCELED, cpu);
				358	put_online_cpus();
				359	return -ENOMEM;
				360	}
				361
				362	/*********************************
				363	* helpers
				364	**********************************/
				365	static bool zswap_is_full(void)
				366	{
				367	return (totalram_pages * zswap_max_pool_percent / 100 <
				368	zswap_pool_pages);
				369	}
				370
				371	/*
				372	* Carries out the common pattern of freeing and entry's zsmalloc allocation,
				373	* freeing the entry itself, and decrementing the number of stored pages.
				374	*/
				375	static void zswap_free_entry(struct zswap_tree tree, struct zswap_entry entry)
				376	{
				377	zbud_free(tree->pool, entry->handle);
				378	zswap_entry_cache_free(entry);
				379	atomic_dec(&zswap_stored_pages);
				380	zswap_pool_pages = zbud_get_pool_size(tree->pool);
				381	}
				382
				383	/*********************************
				384	* writeback code
				385	**********************************/
				386	/* return enum for zswap_get_swap_cache_page */
				387	enum zswap_get_swap_ret {
				388	ZSWAP_SWAPCACHE_NEW,
				389	ZSWAP_SWAPCACHE_EXIST,
				390	ZSWAP_SWAPCACHE_NOMEM
				391	};
				392
				393	/*
				394	* zswap_get_swap_cache_page
				395	*
				396	* This is an adaption of read_swap_cache_async()
				397	*
				398	* This function tries to find a page with the given swap entry
				399	* in the swapper_space address space (the swap cache). If the page
				400	* is found, it is returned in retpage. Otherwise, a page is allocated,
				401	* added to the swap cache, and returned in retpage.
				402	*
				403	* If success, the swap cache page is returned in retpage
				404	* Returns 0 if page was already in the swap cache, page is not locked
				405	* Returns 1 if the new page needs to be populated, page is locked
				406	* Returns <0 on error
				407	*/
				408	static int zswap_get_swap_cache_page(swp_entry_t entry,
				409	struct page **retpage)
				410	{
				411	struct page found_page, new_page = NULL;
Sunghan Suh	822518d	2013-09-11 14:20:22 -0700	[diff] [blame]	412	struct address_space *swapper_space = swap_address_space(entry);
Seth Jennings	2b28111	2013-07-10 16:05:03 -0700	[diff] [blame]	413	int err;
				414
				415	*retpage = NULL;
				416	do {
				417	/*
				418	* First check the swap cache. Since this is normally
				419	* called after lookup_swap_cache() failed, re-calling
				420	* that would confuse statistics.
				421	*/
				422	found_page = find_get_page(swapper_space, entry.val);
				423	if (found_page)
				424	break;
				425
				426	/*
				427	* Get a new page to read into from swap.
				428	*/
				429	if (!new_page) {
				430	new_page = alloc_page(GFP_KERNEL);
				431	if (!new_page)
				432	break; /* Out of memory */
				433	}
				434
				435	/*
				436	* call radix_tree_preload() while we can wait.
				437	*/
				438	err = radix_tree_preload(GFP_KERNEL);
				439	if (err)
				440	break;
				441
				442	/*
				443	* Swap entry may have been freed since our caller observed it.
				444	*/
				445	err = swapcache_prepare(entry);
				446	if (err == -EEXIST) { /* seems racy */
				447	radix_tree_preload_end();
				448	continue;
				449	}
				450	if (err) { /* swp entry is obsolete ? */
				451	radix_tree_preload_end();
				452	break;
				453	}
				454
				455	/* May fail (-ENOMEM) if radix-tree node allocation failed. */
				456	__set_page_locked(new_page);
				457	SetPageSwapBacked(new_page);
				458	err = __add_to_swap_cache(new_page, entry);
				459	if (likely(!err)) {
				460	radix_tree_preload_end();
				461	lru_cache_add_anon(new_page);
				462	*retpage = new_page;
				463	return ZSWAP_SWAPCACHE_NEW;
				464	}
				465	radix_tree_preload_end();
				466	ClearPageSwapBacked(new_page);
				467	__clear_page_locked(new_page);
				468	/*
				469	* add_to_swap_cache() doesn't return -EEXIST, so we can safely
				470	* clear SWAP_HAS_CACHE flag.
				471	*/
				472	swapcache_free(entry, NULL);
				473	} while (err != -ENOMEM);
				474
				475	if (new_page)
				476	page_cache_release(new_page);
				477	if (!found_page)
				478	return ZSWAP_SWAPCACHE_NOMEM;
				479	*retpage = found_page;
				480	return ZSWAP_SWAPCACHE_EXIST;
				481	}
				482
				483	/*
				484	* Attempts to free an entry by adding a page to the swap cache,
				485	* decompressing the entry data into the page, and issuing a
				486	* bio write to write the page back to the swap device.
				487	*
				488	* This can be thought of as a "resumed writeback" of the page
				489	* to the swap device. We are basically resuming the same swap
				490	* writeback path that was intercepted with the frontswap_store()
				491	* in the first place. After the page has been decompressed into
				492	* the swap cache, the compressed version stored by zswap can be
				493	* freed.
				494	*/
				495	static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
				496	{
				497	struct zswap_header *zhdr;
				498	swp_entry_t swpentry;
				499	struct zswap_tree *tree;
				500	pgoff_t offset;
				501	struct zswap_entry *entry;
				502	struct page *page;
				503	u8 src, dst;
				504	unsigned int dlen;
				505	int ret, refcount;
				506	struct writeback_control wbc = {
				507	.sync_mode = WB_SYNC_NONE,
				508	};
				509
				510	/* extract swpentry from data */
				511	zhdr = zbud_map(pool, handle);
				512	swpentry = zhdr->swpentry; /* here */
				513	zbud_unmap(pool, handle);
				514	tree = zswap_trees[swp_type(swpentry)];
				515	offset = swp_offset(swpentry);
				516	BUG_ON(pool != tree->pool);
				517
				518	/* find and ref zswap entry */
				519	spin_lock(&tree->lock);
				520	entry = zswap_rb_search(&tree->rbroot, offset);
				521	if (!entry) {
				522	/* entry was invalidated */
				523	spin_unlock(&tree->lock);
				524	return 0;
				525	}
				526	zswap_entry_get(entry);
				527	spin_unlock(&tree->lock);
				528	BUG_ON(offset != entry->offset);
				529
				530	/* try to allocate swap cache page */
				531	switch (zswap_get_swap_cache_page(swpentry, &page)) {
				532	case ZSWAP_SWAPCACHE_NOMEM: /* no memory */
				533	ret = -ENOMEM;
				534	goto fail;
				535
				536	case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */
				537	/* page is already in the swap cache, ignore for now */
				538	page_cache_release(page);
				539	ret = -EEXIST;
				540	goto fail;
				541
				542	case ZSWAP_SWAPCACHE_NEW: /* page is locked */
				543	/* decompress */
				544	dlen = PAGE_SIZE;
				545	src = (u8 *)zbud_map(tree->pool, entry->handle) +
				546	sizeof(struct zswap_header);
				547	dst = kmap_atomic(page);
				548	ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
				549	entry->length, dst, &dlen);
				550	kunmap_atomic(dst);
				551	zbud_unmap(tree->pool, entry->handle);
				552	BUG_ON(ret);
				553	BUG_ON(dlen != PAGE_SIZE);
				554
				555	/* page is up to date */
				556	SetPageUptodate(page);
				557	}
				558
Weijie Yang	b349acc	2013-11-12 15:07:52 -0800	[diff] [blame^]	559	/* move it to the tail of the inactive list after end_writeback */
				560	SetPageReclaim(page);
				561
Seth Jennings	2b28111	2013-07-10 16:05:03 -0700	[diff] [blame]	562	/* start writeback */
				563	__swap_writepage(page, &wbc, end_swap_bio_write);
				564	page_cache_release(page);
				565	zswap_written_back_pages++;
				566
				567	spin_lock(&tree->lock);
				568
				569	/* drop local reference */
				570	zswap_entry_put(entry);
				571	/* drop the initial reference from entry creation */
				572	refcount = zswap_entry_put(entry);
				573
				574	/*
				575	* There are three possible values for refcount here:
				576	* (1) refcount is 1, load is in progress, unlink from rbtree,
				577	* load will free
				578	* (2) refcount is 0, (normal case) entry is valid,
				579	* remove from rbtree and free entry
				580	* (3) refcount is -1, invalidate happened during writeback,
				581	* free entry
				582	*/
				583	if (refcount >= 0) {
				584	/* no invalidate yet, remove from rbtree */
				585	rb_erase(&entry->rbnode, &tree->rbroot);
				586	}
				587	spin_unlock(&tree->lock);
				588	if (refcount <= 0) {
				589	/* free the entry */
				590	zswap_free_entry(tree, entry);
				591	return 0;
				592	}
				593	return -EAGAIN;
				594
				595	fail:
				596	spin_lock(&tree->lock);
				597	zswap_entry_put(entry);
				598	spin_unlock(&tree->lock);
				599	return ret;
				600	}
				601
				602	/*********************************
				603	* frontswap hooks
				604	**********************************/
				605	/* attempts to compress and store an single page */
				606	static int zswap_frontswap_store(unsigned type, pgoff_t offset,
				607	struct page *page)
				608	{
				609	struct zswap_tree *tree = zswap_trees[type];
				610	struct zswap_entry entry, dupentry;
				611	int ret;
				612	unsigned int dlen = PAGE_SIZE, len;
				613	unsigned long handle;
				614	char *buf;
				615	u8 src, dst;
				616	struct zswap_header *zhdr;
				617
				618	if (!tree) {
				619	ret = -ENODEV;
				620	goto reject;
				621	}
				622
				623	/* reclaim space if needed */
				624	if (zswap_is_full()) {
				625	zswap_pool_limit_hit++;
				626	if (zbud_reclaim_page(tree->pool, 8)) {
				627	zswap_reject_reclaim_fail++;
				628	ret = -ENOMEM;
				629	goto reject;
				630	}
				631	}
				632
				633	/* allocate entry */
				634	entry = zswap_entry_cache_alloc(GFP_KERNEL);
				635	if (!entry) {
				636	zswap_reject_kmemcache_fail++;
				637	ret = -ENOMEM;
				638	goto reject;
				639	}
				640
				641	/* compress */
				642	dst = get_cpu_var(zswap_dstmem);
				643	src = kmap_atomic(page);
				644	ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen);
				645	kunmap_atomic(src);
				646	if (ret) {
				647	ret = -EINVAL;
				648	goto freepage;
				649	}
				650
				651	/* store */
				652	len = dlen + sizeof(struct zswap_header);
				653	ret = zbud_alloc(tree->pool, len, __GFP_NORETRY \| __GFP_NOWARN,
				654	&handle);
				655	if (ret == -ENOSPC) {
				656	zswap_reject_compress_poor++;
				657	goto freepage;
				658	}
				659	if (ret) {
				660	zswap_reject_alloc_fail++;
				661	goto freepage;
				662	}
				663	zhdr = zbud_map(tree->pool, handle);
				664	zhdr->swpentry = swp_entry(type, offset);
				665	buf = (u8 *)(zhdr + 1);
				666	memcpy(buf, dst, dlen);
				667	zbud_unmap(tree->pool, handle);
				668	put_cpu_var(zswap_dstmem);
				669
				670	/* populate entry */
				671	entry->offset = offset;
				672	entry->handle = handle;
				673	entry->length = dlen;
				674
				675	/* map */
				676	spin_lock(&tree->lock);
				677	do {
				678	ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
				679	if (ret == -EEXIST) {
				680	zswap_duplicate_entry++;
				681	/* remove from rbtree */
				682	rb_erase(&dupentry->rbnode, &tree->rbroot);
				683	if (!zswap_entry_put(dupentry)) {
				684	/* free */
				685	zswap_free_entry(tree, dupentry);
				686	}
				687	}
				688	} while (ret == -EEXIST);
				689	spin_unlock(&tree->lock);
				690
				691	/* update stats */
				692	atomic_inc(&zswap_stored_pages);
				693	zswap_pool_pages = zbud_get_pool_size(tree->pool);
				694
				695	return 0;
				696
				697	freepage:
				698	put_cpu_var(zswap_dstmem);
				699	zswap_entry_cache_free(entry);
				700	reject:
				701	return ret;
				702	}
				703
				704	/*
				705	* returns 0 if the page was successfully decompressed
				706	* return -1 on entry not found or error
				707	*/
				708	static int zswap_frontswap_load(unsigned type, pgoff_t offset,
				709	struct page *page)
				710	{
				711	struct zswap_tree *tree = zswap_trees[type];
				712	struct zswap_entry *entry;
				713	u8 src, dst;
				714	unsigned int dlen;
				715	int refcount, ret;
				716
				717	/* find */
				718	spin_lock(&tree->lock);
				719	entry = zswap_rb_search(&tree->rbroot, offset);
				720	if (!entry) {
				721	/* entry was written back */
				722	spin_unlock(&tree->lock);
				723	return -1;
				724	}
				725	zswap_entry_get(entry);
				726	spin_unlock(&tree->lock);
				727
				728	/* decompress */
				729	dlen = PAGE_SIZE;
				730	src = (u8 *)zbud_map(tree->pool, entry->handle) +
				731	sizeof(struct zswap_header);
				732	dst = kmap_atomic(page);
				733	ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
				734	dst, &dlen);
				735	kunmap_atomic(dst);
				736	zbud_unmap(tree->pool, entry->handle);
				737	BUG_ON(ret);
				738
				739	spin_lock(&tree->lock);
				740	refcount = zswap_entry_put(entry);
				741	if (likely(refcount)) {
				742	spin_unlock(&tree->lock);
				743	return 0;
				744	}
				745	spin_unlock(&tree->lock);
				746
				747	/*
				748	* We don't have to unlink from the rbtree because
				749	* zswap_writeback_entry() or zswap_frontswap_invalidate page()
				750	* has already done this for us if we are the last reference.
				751	*/
				752	/* free */
				753
				754	zswap_free_entry(tree, entry);
				755
				756	return 0;
				757	}
				758
				759	/* frees an entry in zswap */
				760	static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
				761	{
				762	struct zswap_tree *tree = zswap_trees[type];
				763	struct zswap_entry *entry;
				764	int refcount;
				765
				766	/* find */
				767	spin_lock(&tree->lock);
				768	entry = zswap_rb_search(&tree->rbroot, offset);
				769	if (!entry) {
				770	/* entry was written back */
				771	spin_unlock(&tree->lock);
				772	return;
				773	}
				774
				775	/* remove from rbtree */
				776	rb_erase(&entry->rbnode, &tree->rbroot);
				777
				778	/* drop the initial reference from entry creation */
				779	refcount = zswap_entry_put(entry);
				780
				781	spin_unlock(&tree->lock);
				782
				783	if (refcount) {
				784	/* writeback in progress, writeback will free */
				785	return;
				786	}
				787
				788	/* free */
				789	zswap_free_entry(tree, entry);
				790	}
				791
				792	/* frees all zswap entries for the given swap type */
				793	static void zswap_frontswap_invalidate_area(unsigned type)
				794	{
				795	struct zswap_tree *tree = zswap_trees[type];
Cody P Schafer	0bd4213	2013-09-11 14:25:33 -0700	[diff] [blame]	796	struct zswap_entry entry, n;
Seth Jennings	2b28111	2013-07-10 16:05:03 -0700	[diff] [blame]	797
				798	if (!tree)
				799	return;
				800
				801	/* walk the tree and free everything */
				802	spin_lock(&tree->lock);
Cody P Schafer	0bd4213	2013-09-11 14:25:33 -0700	[diff] [blame]	803	rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) {
Seth Jennings	2b28111	2013-07-10 16:05:03 -0700	[diff] [blame]	804	zbud_free(tree->pool, entry->handle);
				805	zswap_entry_cache_free(entry);
				806	atomic_dec(&zswap_stored_pages);
				807	}
				808	tree->rbroot = RB_ROOT;
				809	spin_unlock(&tree->lock);
Weijie Yang	aa9bca0	2013-10-16 13:46:54 -0700	[diff] [blame]	810
				811	zbud_destroy_pool(tree->pool);
				812	kfree(tree);
				813	zswap_trees[type] = NULL;
Seth Jennings	2b28111	2013-07-10 16:05:03 -0700	[diff] [blame]	814	}
				815
				816	static struct zbud_ops zswap_zbud_ops = {
				817	.evict = zswap_writeback_entry
				818	};
				819
				820	static void zswap_frontswap_init(unsigned type)
				821	{
				822	struct zswap_tree *tree;
				823
				824	tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
				825	if (!tree)
				826	goto err;
				827	tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
				828	if (!tree->pool)
				829	goto freetree;
				830	tree->rbroot = RB_ROOT;
				831	spin_lock_init(&tree->lock);
				832	zswap_trees[type] = tree;
				833	return;
				834
				835	freetree:
				836	kfree(tree);
				837	err:
				838	pr_err("alloc failed, zswap disabled for swap type %d\n", type);
				839	}
				840
				841	static struct frontswap_ops zswap_frontswap_ops = {
				842	.store = zswap_frontswap_store,
				843	.load = zswap_frontswap_load,
				844	.invalidate_page = zswap_frontswap_invalidate_page,
				845	.invalidate_area = zswap_frontswap_invalidate_area,
				846	.init = zswap_frontswap_init
				847	};
				848
				849	/*********************************
				850	* debugfs functions
				851	**********************************/
				852	#ifdef CONFIG_DEBUG_FS
				853	#include <linux/debugfs.h>
				854
				855	static struct dentry *zswap_debugfs_root;
				856
				857	static int __init zswap_debugfs_init(void)
				858	{
				859	if (!debugfs_initialized())
				860	return -ENODEV;
				861
				862	zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
				863	if (!zswap_debugfs_root)
				864	return -ENOMEM;
				865
				866	debugfs_create_u64("pool_limit_hit", S_IRUGO,
				867	zswap_debugfs_root, &zswap_pool_limit_hit);
				868	debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
				869	zswap_debugfs_root, &zswap_reject_reclaim_fail);
				870	debugfs_create_u64("reject_alloc_fail", S_IRUGO,
				871	zswap_debugfs_root, &zswap_reject_alloc_fail);
				872	debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
				873	zswap_debugfs_root, &zswap_reject_kmemcache_fail);
				874	debugfs_create_u64("reject_compress_poor", S_IRUGO,
				875	zswap_debugfs_root, &zswap_reject_compress_poor);
				876	debugfs_create_u64("written_back_pages", S_IRUGO,
				877	zswap_debugfs_root, &zswap_written_back_pages);
				878	debugfs_create_u64("duplicate_entry", S_IRUGO,
				879	zswap_debugfs_root, &zswap_duplicate_entry);
				880	debugfs_create_u64("pool_pages", S_IRUGO,
				881	zswap_debugfs_root, &zswap_pool_pages);
				882	debugfs_create_atomic_t("stored_pages", S_IRUGO,
				883	zswap_debugfs_root, &zswap_stored_pages);
				884
				885	return 0;
				886	}
				887
				888	static void __exit zswap_debugfs_exit(void)
				889	{
				890	debugfs_remove_recursive(zswap_debugfs_root);
				891	}
				892	#else
				893	static int __init zswap_debugfs_init(void)
				894	{
				895	return 0;
				896	}
				897
				898	static void __exit zswap_debugfs_exit(void) { }
				899	#endif
				900
				901	/*********************************
				902	* module init and exit
				903	**********************************/
				904	static int __init init_zswap(void)
				905	{
				906	if (!zswap_enabled)
				907	return 0;
				908
				909	pr_info("loading zswap\n");
				910	if (zswap_entry_cache_create()) {
				911	pr_err("entry cache creation failed\n");
				912	goto error;
				913	}
				914	if (zswap_comp_init()) {
				915	pr_err("compressor initialization failed\n");
				916	goto compfail;
				917	}
				918	if (zswap_cpu_init()) {
				919	pr_err("per-cpu initialization failed\n");
				920	goto pcpufail;
				921	}
				922	frontswap_register_ops(&zswap_frontswap_ops);
				923	if (zswap_debugfs_init())
				924	pr_warn("debugfs initialization failed\n");
				925	return 0;
				926	pcpufail:
				927	zswap_comp_exit();
				928	compfail:
				929	zswap_entry_cache_destory();
				930	error:
				931	return -ENOMEM;
				932	}
				933	/* must be late so crypto has time to come up */
				934	late_initcall(init_zswap);
				935
				936	MODULE_LICENSE("GPL");
				937	MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
				938	MODULE_DESCRIPTION("Compressed cache for swap pages");