Blame - mm/swapfile.c - SHIFTPHONES/kernel/shift/mainline

blob: 4b39e9501d44a97e2c7bae6ba2c01bc86f591760 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/mm/swapfile.c
				3	*
				4	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
				5	* Swap reorganised 29.12.95, Stephen Tweedie
				6	*/
				7
				8	#include <linux/config.h>
				9	#include <linux/mm.h>
				10	#include <linux/hugetlb.h>
				11	#include <linux/mman.h>
				12	#include <linux/slab.h>
				13	#include <linux/kernel_stat.h>
				14	#include <linux/swap.h>
				15	#include <linux/vmalloc.h>
				16	#include <linux/pagemap.h>
				17	#include <linux/namei.h>
				18	#include <linux/shm.h>
				19	#include <linux/blkdev.h>
				20	#include <linux/writeback.h>
				21	#include <linux/proc_fs.h>
				22	#include <linux/seq_file.h>
				23	#include <linux/init.h>
				24	#include <linux/module.h>
				25	#include <linux/rmap.h>
				26	#include <linux/security.h>
				27	#include <linux/backing-dev.h>
				28	#include <linux/syscalls.h>
				29
				30	#include <asm/pgtable.h>
				31	#include <asm/tlbflush.h>
				32	#include <linux/swapops.h>
				33
				34	DEFINE_SPINLOCK(swaplock);
				35	unsigned int nr_swapfiles;
				36	long total_swap_pages;
				37	static int swap_overflow;
				38
				39	EXPORT_SYMBOL(total_swap_pages);
				40
				41	static const char Bad_file[] = "Bad swap file entry ";
				42	static const char Unused_file[] = "Unused swap file entry ";
				43	static const char Bad_offset[] = "Bad swap offset entry ";
				44	static const char Unused_offset[] = "Unused swap offset entry ";
				45
				46	struct swap_list_t swap_list = {-1, -1};
				47
				48	struct swap_info_struct swap_info[MAX_SWAPFILES];
				49
				50	static DECLARE_MUTEX(swapon_sem);
				51
				52	/*
				53	* We need this because the bdev->unplug_fn can sleep and we cannot
				54	* hold swap_list_lock while calling the unplug_fn. And swap_list_lock
				55	* cannot be turned into a semaphore.
				56	*/
				57	static DECLARE_RWSEM(swap_unplug_sem);
				58
				59	#define SWAPFILE_CLUSTER 256
				60
				61	void swap_unplug_io_fn(struct backing_dev_info unused_bdi, struct page page)
				62	{
				63	swp_entry_t entry;
				64
				65	down_read(&swap_unplug_sem);
				66	entry.val = page->private;
				67	if (PageSwapCache(page)) {
				68	struct block_device *bdev = swap_info[swp_type(entry)].bdev;
				69	struct backing_dev_info *bdi;
				70
				71	/*
				72	* If the page is removed from swapcache from under us (with a
				73	* racy try_to_unuse/swapoff) we need an additional reference
				74	* count to avoid reading garbage from page->private above. If
				75	* the WARN_ON triggers during a swapoff it maybe the race
				76	* condition and it's harmless. However if it triggers without
				77	* swapoff it signals a problem.
				78	*/
				79	WARN_ON(page_count(page) <= 1);
				80
				81	bdi = bdev->bd_inode->i_mapping->backing_dev_info;
McMullan, Jason	ba32311	2005-05-16 21:53:40 -0700	[diff] [blame]	82	blk_run_backing_dev(bdi, page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	83	}
				84	up_read(&swap_unplug_sem);
				85	}
				86
				87	static inline int scan_swap_map(struct swap_info_struct *si)
				88	{
				89	unsigned long offset;
				90	/*
				91	* We try to cluster swap pages by allocating them
				92	* sequentially in swap. Once we've allocated
				93	* SWAPFILE_CLUSTER pages this way, however, we resort to
				94	* first-free allocation, starting a new cluster. This
				95	* prevents us from scattering swap pages all over the entire
				96	* swap partition, so that we reduce overall disk seek times
				97	* between swap pages. -- sct */
				98	if (si->cluster_nr) {
				99	while (si->cluster_next <= si->highest_bit) {
				100	offset = si->cluster_next++;
				101	if (si->swap_map[offset])
				102	continue;
				103	si->cluster_nr--;
				104	goto got_page;
				105	}
				106	}
				107	si->cluster_nr = SWAPFILE_CLUSTER;
				108
				109	/* try to find an empty (even not aligned) cluster. */
				110	offset = si->lowest_bit;
				111	check_next_cluster:
				112	if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit)
				113	{
				114	unsigned long nr;
				115	for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++)
				116	if (si->swap_map[nr])
				117	{
				118	offset = nr+1;
				119	goto check_next_cluster;
				120	}
				121	/* We found a completly empty cluster, so start
				122	* using it.
				123	*/
				124	goto got_page;
				125	}
				126	/* No luck, so now go finegrined as usual. -Andrea */
				127	for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) {
				128	if (si->swap_map[offset])
				129	continue;
				130	si->lowest_bit = offset+1;
				131	got_page:
				132	if (offset == si->lowest_bit)
				133	si->lowest_bit++;
				134	if (offset == si->highest_bit)
				135	si->highest_bit--;
				136	if (si->lowest_bit > si->highest_bit) {
				137	si->lowest_bit = si->max;
				138	si->highest_bit = 0;
				139	}
				140	si->swap_map[offset] = 1;
				141	si->inuse_pages++;
				142	nr_swap_pages--;
				143	si->cluster_next = offset+1;
				144	return offset;
				145	}
				146	si->lowest_bit = si->max;
				147	si->highest_bit = 0;
				148	return 0;
				149	}
				150
				151	swp_entry_t get_swap_page(void)
				152	{
				153	struct swap_info_struct * p;
				154	unsigned long offset;
				155	swp_entry_t entry;
				156	int type, wrapped = 0;
				157
				158	entry.val = 0; /* Out of memory */
				159	swap_list_lock();
				160	type = swap_list.next;
				161	if (type < 0)
				162	goto out;
				163	if (nr_swap_pages <= 0)
				164	goto out;
				165
				166	while (1) {
				167	p = &swap_info[type];
				168	if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
				169	swap_device_lock(p);
				170	offset = scan_swap_map(p);
				171	swap_device_unlock(p);
				172	if (offset) {
				173	entry = swp_entry(type,offset);
				174	type = swap_info[type].next;
				175	if (type < 0 \|\|
				176	p->prio != swap_info[type].prio) {
				177	swap_list.next = swap_list.head;
				178	} else {
				179	swap_list.next = type;
				180	}
				181	goto out;
				182	}
				183	}
				184	type = p->next;
				185	if (!wrapped) {
				186	if (type < 0 \|\| p->prio != swap_info[type].prio) {
				187	type = swap_list.head;
				188	wrapped = 1;
				189	}
				190	} else
				191	if (type < 0)
				192	goto out; /* out of swap space */
				193	}
				194	out:
				195	swap_list_unlock();
				196	return entry;
				197	}
				198
				199	static struct swap_info_struct * swap_info_get(swp_entry_t entry)
				200	{
				201	struct swap_info_struct * p;
				202	unsigned long offset, type;
				203
				204	if (!entry.val)
				205	goto out;
				206	type = swp_type(entry);
				207	if (type >= nr_swapfiles)
				208	goto bad_nofile;
				209	p = & swap_info[type];
				210	if (!(p->flags & SWP_USED))
				211	goto bad_device;
				212	offset = swp_offset(entry);
				213	if (offset >= p->max)
				214	goto bad_offset;
				215	if (!p->swap_map[offset])
				216	goto bad_free;
				217	swap_list_lock();
				218	if (p->prio > swap_info[swap_list.next].prio)
				219	swap_list.next = type;
				220	swap_device_lock(p);
				221	return p;
				222
				223	bad_free:
				224	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
				225	goto out;
				226	bad_offset:
				227	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
				228	goto out;
				229	bad_device:
				230	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
				231	goto out;
				232	bad_nofile:
				233	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
				234	out:
				235	return NULL;
				236	}
				237
				238	static void swap_info_put(struct swap_info_struct * p)
				239	{
				240	swap_device_unlock(p);
				241	swap_list_unlock();
				242	}
				243
				244	static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
				245	{
				246	int count = p->swap_map[offset];
				247
				248	if (count < SWAP_MAP_MAX) {
				249	count--;
				250	p->swap_map[offset] = count;
				251	if (!count) {
				252	if (offset < p->lowest_bit)
				253	p->lowest_bit = offset;
				254	if (offset > p->highest_bit)
				255	p->highest_bit = offset;
				256	nr_swap_pages++;
				257	p->inuse_pages--;
				258	}
				259	}
				260	return count;
				261	}
				262
				263	/*
				264	* Caller has made sure that the swapdevice corresponding to entry
				265	* is still around or has not been recycled.
				266	*/
				267	void swap_free(swp_entry_t entry)
				268	{
				269	struct swap_info_struct * p;
				270
				271	p = swap_info_get(entry);
				272	if (p) {
				273	swap_entry_free(p, swp_offset(entry));
				274	swap_info_put(p);
				275	}
				276	}
				277
				278	/*
Hugh Dickins	c475a8a	2005-06-21 17:15:12 -0700	[diff] [blame]	279	* How many references to page are currently swapped out?
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	280	*/
Hugh Dickins	c475a8a	2005-06-21 17:15:12 -0700	[diff] [blame]	281	static inline int page_swapcount(struct page *page)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	282	{
Hugh Dickins	c475a8a	2005-06-21 17:15:12 -0700	[diff] [blame]	283	int count = 0;
				284	struct swap_info_struct *p;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	285	swp_entry_t entry;
				286
				287	entry.val = page->private;
				288	p = swap_info_get(entry);
				289	if (p) {
Hugh Dickins	c475a8a	2005-06-21 17:15:12 -0700	[diff] [blame]	290	/* Subtract the 1 for the swap cache itself */
				291	count = p->swap_map[swp_offset(entry)] - 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	292	swap_info_put(p);
				293	}
Hugh Dickins	c475a8a	2005-06-21 17:15:12 -0700	[diff] [blame]	294	return count;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	295	}
				296
				297	/*
				298	* We can use this swap cache entry directly
				299	* if there are no other references to it.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	300	*/
				301	int can_share_swap_page(struct page *page)
				302	{
Hugh Dickins	c475a8a	2005-06-21 17:15:12 -0700	[diff] [blame]	303	int count;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	304
Hugh Dickins	c475a8a	2005-06-21 17:15:12 -0700	[diff] [blame]	305	BUG_ON(!PageLocked(page));
				306	count = page_mapcount(page);
				307	if (count <= 1 && PageSwapCache(page))
				308	count += page_swapcount(page);
				309	return count == 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	310	}
				311
				312	/*
				313	* Work out if there are any other processes sharing this
				314	* swap cache page. Free it if you can. Return success.
				315	*/
				316	int remove_exclusive_swap_page(struct page *page)
				317	{
				318	int retval;
				319	struct swap_info_struct * p;
				320	swp_entry_t entry;
				321
				322	BUG_ON(PagePrivate(page));
				323	BUG_ON(!PageLocked(page));
				324
				325	if (!PageSwapCache(page))
				326	return 0;
				327	if (PageWriteback(page))
				328	return 0;
				329	if (page_count(page) != 2) /* 2: us + cache */
				330	return 0;
				331
				332	entry.val = page->private;
				333	p = swap_info_get(entry);
				334	if (!p)
				335	return 0;
				336
				337	/* Is the only swap cache user the cache itself? */
				338	retval = 0;
				339	if (p->swap_map[swp_offset(entry)] == 1) {
				340	/* Recheck the page count with the swapcache lock held.. */
				341	write_lock_irq(&swapper_space.tree_lock);
				342	if ((page_count(page) == 2) && !PageWriteback(page)) {
				343	__delete_from_swap_cache(page);
				344	SetPageDirty(page);
				345	retval = 1;
				346	}
				347	write_unlock_irq(&swapper_space.tree_lock);
				348	}
				349	swap_info_put(p);
				350
				351	if (retval) {
				352	swap_free(entry);
				353	page_cache_release(page);
				354	}
				355
				356	return retval;
				357	}
				358
				359	/*
				360	* Free the swap entry like above, but also try to
				361	* free the page cache entry if it is the last user.
				362	*/
				363	void free_swap_and_cache(swp_entry_t entry)
				364	{
				365	struct swap_info_struct * p;
				366	struct page *page = NULL;
				367
				368	p = swap_info_get(entry);
				369	if (p) {
				370	if (swap_entry_free(p, swp_offset(entry)) == 1)
				371	page = find_trylock_page(&swapper_space, entry.val);
				372	swap_info_put(p);
				373	}
				374	if (page) {
				375	int one_user;
				376
				377	BUG_ON(PagePrivate(page));
				378	page_cache_get(page);
				379	one_user = (page_count(page) == 2);
				380	/* Only cache user (+us), or swap space full? Free it! */
				381	if (!PageWriteback(page) && (one_user \|\| vm_swap_full())) {
				382	delete_from_swap_cache(page);
				383	SetPageDirty(page);
				384	}
				385	unlock_page(page);
				386	page_cache_release(page);
				387	}
				388	}
				389
				390	/*
				391	* Always set the resulting pte to be nowrite (the same as COW pages
				392	* after one process has exited). We don't know just how many PTEs will
				393	* share this swap entry, so be cautious and let do_wp_page work out
				394	* what to do if a write is requested later.
				395	*
				396	* vma->vm_mm->page_table_lock is held.
				397	*/
				398	static void unuse_pte(struct vm_area_struct vma, pte_t pte,
				399	unsigned long addr, swp_entry_t entry, struct page *page)
				400	{
				401	inc_mm_counter(vma->vm_mm, rss);
				402	get_page(page);
				403	set_pte_at(vma->vm_mm, addr, pte,
				404	pte_mkold(mk_pte(page, vma->vm_page_prot)));
				405	page_add_anon_rmap(page, vma, addr);
				406	swap_free(entry);
				407	/*
				408	* Move the page to the active list so it is not
				409	* immediately swapped out again after swapon.
				410	*/
				411	activate_page(page);
				412	}
				413
				414	static int unuse_pte_range(struct vm_area_struct vma, pmd_t pmd,
				415	unsigned long addr, unsigned long end,
				416	swp_entry_t entry, struct page *page)
				417	{
				418	pte_t *pte;
				419	pte_t swp_pte = swp_entry_to_pte(entry);
				420
				421	pte = pte_offset_map(pmd, addr);
				422	do {
				423	/*
				424	* swapoff spends a _lot_ of time in this loop!
				425	* Test inline before going to call unuse_pte.
				426	*/
				427	if (unlikely(pte_same(*pte, swp_pte))) {
				428	unuse_pte(vma, pte, addr, entry, page);
				429	pte_unmap(pte);
				430	return 1;
				431	}
				432	} while (pte++, addr += PAGE_SIZE, addr != end);
				433	pte_unmap(pte - 1);
				434	return 0;
				435	}
				436
				437	static inline int unuse_pmd_range(struct vm_area_struct vma, pud_t pud,
				438	unsigned long addr, unsigned long end,
				439	swp_entry_t entry, struct page *page)
				440	{
				441	pmd_t *pmd;
				442	unsigned long next;
				443
				444	pmd = pmd_offset(pud, addr);
				445	do {
				446	next = pmd_addr_end(addr, end);
				447	if (pmd_none_or_clear_bad(pmd))
				448	continue;
				449	if (unuse_pte_range(vma, pmd, addr, next, entry, page))
				450	return 1;
				451	} while (pmd++, addr = next, addr != end);
				452	return 0;
				453	}
				454
				455	static inline int unuse_pud_range(struct vm_area_struct vma, pgd_t pgd,
				456	unsigned long addr, unsigned long end,
				457	swp_entry_t entry, struct page *page)
				458	{
				459	pud_t *pud;
				460	unsigned long next;
				461
				462	pud = pud_offset(pgd, addr);
				463	do {
				464	next = pud_addr_end(addr, end);
				465	if (pud_none_or_clear_bad(pud))
				466	continue;
				467	if (unuse_pmd_range(vma, pud, addr, next, entry, page))
				468	return 1;
				469	} while (pud++, addr = next, addr != end);
				470	return 0;
				471	}
				472
				473	static int unuse_vma(struct vm_area_struct *vma,
				474	swp_entry_t entry, struct page *page)
				475	{
				476	pgd_t *pgd;
				477	unsigned long addr, end, next;
				478
				479	if (page->mapping) {
				480	addr = page_address_in_vma(page, vma);
				481	if (addr == -EFAULT)
				482	return 0;
				483	else
				484	end = addr + PAGE_SIZE;
				485	} else {
				486	addr = vma->vm_start;
				487	end = vma->vm_end;
				488	}
				489
				490	pgd = pgd_offset(vma->vm_mm, addr);
				491	do {
				492	next = pgd_addr_end(addr, end);
				493	if (pgd_none_or_clear_bad(pgd))
				494	continue;
				495	if (unuse_pud_range(vma, pgd, addr, next, entry, page))
				496	return 1;
				497	} while (pgd++, addr = next, addr != end);
				498	return 0;
				499	}
				500
				501	static int unuse_mm(struct mm_struct *mm,
				502	swp_entry_t entry, struct page *page)
				503	{
				504	struct vm_area_struct *vma;
				505
				506	if (!down_read_trylock(&mm->mmap_sem)) {
				507	/*
Hugh Dickins	c475a8a	2005-06-21 17:15:12 -0700	[diff] [blame]	508	* Activate page so shrink_cache is unlikely to unmap its
				509	* ptes while lock is dropped, so swapoff can make progress.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	510	*/
Hugh Dickins	c475a8a	2005-06-21 17:15:12 -0700	[diff] [blame]	511	activate_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	512	unlock_page(page);
				513	down_read(&mm->mmap_sem);
				514	lock_page(page);
				515	}
				516	spin_lock(&mm->page_table_lock);
				517	for (vma = mm->mmap; vma; vma = vma->vm_next) {
				518	if (vma->anon_vma && unuse_vma(vma, entry, page))
				519	break;
				520	}
				521	spin_unlock(&mm->page_table_lock);
				522	up_read(&mm->mmap_sem);
				523	/*
				524	* Currently unuse_mm cannot fail, but leave error handling
				525	* at call sites for now, since we change it from time to time.
				526	*/
				527	return 0;
				528	}
				529
				530	/*
				531	* Scan swap_map from current position to next entry still in use.
				532	* Recycle to start on reaching the end, returning 0 when empty.
				533	*/
				534	static int find_next_to_unuse(struct swap_info_struct *si, int prev)
				535	{
				536	int max = si->max;
				537	int i = prev;
				538	int count;
				539
				540	/*
				541	* No need for swap_device_lock(si) here: we're just looking
				542	* for whether an entry is in use, not modifying it; false
				543	* hits are okay, and sys_swapoff() has already prevented new
				544	* allocations from this area (while holding swap_list_lock()).
				545	*/
				546	for (;;) {
				547	if (++i >= max) {
				548	if (!prev) {
				549	i = 0;
				550	break;
				551	}
				552	/*
				553	* No entries in use at top of swap_map,
				554	* loop back to start and recheck there.
				555	*/
				556	max = prev + 1;
				557	prev = 0;
				558	i = 1;
				559	}
				560	count = si->swap_map[i];
				561	if (count && count != SWAP_MAP_BAD)
				562	break;
				563	}
				564	return i;
				565	}
				566
				567	/*
				568	* We completely avoid races by reading each swap page in advance,
				569	* and then search for the process using it. All the necessary
				570	* page table adjustments can then be made atomically.
				571	*/
				572	static int try_to_unuse(unsigned int type)
				573	{
				574	struct swap_info_struct * si = &swap_info[type];
				575	struct mm_struct *start_mm;
				576	unsigned short *swap_map;
				577	unsigned short swcount;
				578	struct page *page;
				579	swp_entry_t entry;
				580	int i = 0;
				581	int retval = 0;
				582	int reset_overflow = 0;
				583	int shmem;
				584
				585	/*
				586	* When searching mms for an entry, a good strategy is to
				587	* start at the first mm we freed the previous entry from
				588	* (though actually we don't notice whether we or coincidence
				589	* freed the entry). Initialize this start_mm with a hold.
				590	*
				591	* A simpler strategy would be to start at the last mm we
				592	* freed the previous entry from; but that would take less
				593	* advantage of mmlist ordering, which clusters forked mms
				594	* together, child after parent. If we race with dup_mmap(), we
				595	* prefer to resolve parent before child, lest we miss entries
				596	* duplicated after we scanned child: using last mm would invert
				597	* that. Though it's only a serious concern when an overflowed
				598	* swap count is reset from SWAP_MAP_MAX, preventing a rescan.
				599	*/
				600	start_mm = &init_mm;
				601	atomic_inc(&init_mm.mm_users);
				602
				603	/*
				604	* Keep on scanning until all entries have gone. Usually,
				605	* one pass through swap_map is enough, but not necessarily:
				606	* there are races when an instance of an entry might be missed.
				607	*/
				608	while ((i = find_next_to_unuse(si, i)) != 0) {
				609	if (signal_pending(current)) {
				610	retval = -EINTR;
				611	break;
				612	}
				613
				614	/*
				615	* Get a page for the entry, using the existing swap
				616	* cache page if there is one. Otherwise, get a clean
				617	* page and read the swap into it.
				618	*/
				619	swap_map = &si->swap_map[i];
				620	entry = swp_entry(type, i);
				621	page = read_swap_cache_async(entry, NULL, 0);
				622	if (!page) {
				623	/*
				624	* Either swap_duplicate() failed because entry
				625	* has been freed independently, and will not be
				626	* reused since sys_swapoff() already disabled
				627	* allocation from here, or alloc_page() failed.
				628	*/
				629	if (!*swap_map)
				630	continue;
				631	retval = -ENOMEM;
				632	break;
				633	}
				634
				635	/*
				636	* Don't hold on to start_mm if it looks like exiting.
				637	*/
				638	if (atomic_read(&start_mm->mm_users) == 1) {
				639	mmput(start_mm);
				640	start_mm = &init_mm;
				641	atomic_inc(&init_mm.mm_users);
				642	}
				643
				644	/*
				645	* Wait for and lock page. When do_swap_page races with
				646	* try_to_unuse, do_swap_page can handle the fault much
				647	* faster than try_to_unuse can locate the entry. This
				648	* apparently redundant "wait_on_page_locked" lets try_to_unuse
				649	* defer to do_swap_page in such a case - in some tests,
				650	* do_swap_page and try_to_unuse repeatedly compete.
				651	*/
				652	wait_on_page_locked(page);
				653	wait_on_page_writeback(page);
				654	lock_page(page);
				655	wait_on_page_writeback(page);
				656
				657	/*
				658	* Remove all references to entry.
				659	* Whenever we reach init_mm, there's no address space
				660	* to search, but use it as a reminder to search shmem.
				661	*/
				662	shmem = 0;
				663	swcount = *swap_map;
				664	if (swcount > 1) {
				665	if (start_mm == &init_mm)
				666	shmem = shmem_unuse(entry, page);
				667	else
				668	retval = unuse_mm(start_mm, entry, page);
				669	}
				670	if (*swap_map > 1) {
				671	int set_start_mm = (*swap_map >= swcount);
				672	struct list_head *p = &start_mm->mmlist;
				673	struct mm_struct *new_start_mm = start_mm;
				674	struct mm_struct *prev_mm = start_mm;
				675	struct mm_struct *mm;
				676
				677	atomic_inc(&new_start_mm->mm_users);
				678	atomic_inc(&prev_mm->mm_users);
				679	spin_lock(&mmlist_lock);
				680	while (*swap_map > 1 && !retval &&
				681	(p = p->next) != &start_mm->mmlist) {
				682	mm = list_entry(p, struct mm_struct, mmlist);
				683	if (atomic_inc_return(&mm->mm_users) == 1) {
				684	atomic_dec(&mm->mm_users);
				685	continue;
				686	}
				687	spin_unlock(&mmlist_lock);
				688	mmput(prev_mm);
				689	prev_mm = mm;
				690
				691	cond_resched();
				692
				693	swcount = *swap_map;
				694	if (swcount <= 1)
				695	;
				696	else if (mm == &init_mm) {
				697	set_start_mm = 1;
				698	shmem = shmem_unuse(entry, page);
				699	} else
				700	retval = unuse_mm(mm, entry, page);
				701	if (set_start_mm && *swap_map < swcount) {
				702	mmput(new_start_mm);
				703	atomic_inc(&mm->mm_users);
				704	new_start_mm = mm;
				705	set_start_mm = 0;
				706	}
				707	spin_lock(&mmlist_lock);
				708	}
				709	spin_unlock(&mmlist_lock);
				710	mmput(prev_mm);
				711	mmput(start_mm);
				712	start_mm = new_start_mm;
				713	}
				714	if (retval) {
				715	unlock_page(page);
				716	page_cache_release(page);
				717	break;
				718	}
				719
				720	/*
				721	* How could swap count reach 0x7fff when the maximum
				722	* pid is 0x7fff, and there's no way to repeat a swap
				723	* page within an mm (except in shmem, where it's the
				724	* shared object which takes the reference count)?
				725	* We believe SWAP_MAP_MAX cannot occur in Linux 2.4.
				726	*
				727	* If that's wrong, then we should worry more about
				728	* exit_mmap() and do_munmap() cases described above:
				729	* we might be resetting SWAP_MAP_MAX too early here.
				730	* We know "Undead"s can happen, they're okay, so don't
				731	* report them; but do report if we reset SWAP_MAP_MAX.
				732	*/
				733	if (*swap_map == SWAP_MAP_MAX) {
				734	swap_device_lock(si);
				735	*swap_map = 1;
				736	swap_device_unlock(si);
				737	reset_overflow = 1;
				738	}
				739
				740	/*
				741	* If a reference remains (rare), we would like to leave
				742	* the page in the swap cache; but try_to_unmap could
				743	* then re-duplicate the entry once we drop page lock,
				744	* so we might loop indefinitely; also, that page could
				745	* not be swapped out to other storage meanwhile. So:
				746	* delete from cache even if there's another reference,
				747	* after ensuring that the data has been saved to disk -
				748	* since if the reference remains (rarer), it will be
				749	* read from disk into another page. Splitting into two
				750	* pages would be incorrect if swap supported "shared
				751	* private" pages, but they are handled by tmpfs files.
				752	*
				753	* Note shmem_unuse already deleted a swappage from
				754	* the swap cache, unless the move to filepage failed:
				755	* in which case it left swappage in cache, lowered its
				756	* swap count to pass quickly through the loops above,
				757	* and now we must reincrement count to try again later.
				758	*/
				759	if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
				760	struct writeback_control wbc = {
				761	.sync_mode = WB_SYNC_NONE,
				762	};
				763
				764	swap_writepage(page, &wbc);
				765	lock_page(page);
				766	wait_on_page_writeback(page);
				767	}
				768	if (PageSwapCache(page)) {
				769	if (shmem)
				770	swap_duplicate(entry);
				771	else
				772	delete_from_swap_cache(page);
				773	}
				774
				775	/*
				776	* So we could skip searching mms once swap count went
				777	* to 1, we did not mark any present ptes as dirty: must
				778	* mark page dirty so shrink_list will preserve it.
				779	*/
				780	SetPageDirty(page);
				781	unlock_page(page);
				782	page_cache_release(page);
				783
				784	/*
				785	* Make sure that we aren't completely killing
				786	* interactive performance.
				787	*/
				788	cond_resched();
				789	}
				790
				791	mmput(start_mm);
				792	if (reset_overflow) {
				793	printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
				794	swap_overflow = 0;
				795	}
				796	return retval;
				797	}
				798
				799	/*
				800	* After a successful try_to_unuse, if no swap is now in use, we know we
				801	* can empty the mmlist. swap_list_lock must be held on entry and exit.
				802	* Note that mmlist_lock nests inside swap_list_lock, and an mm must be
				803	* added to the mmlist just after page_duplicate - before would be racy.
				804	*/
				805	static void drain_mmlist(void)
				806	{
				807	struct list_head p, next;
				808	unsigned int i;
				809
				810	for (i = 0; i < nr_swapfiles; i++)
				811	if (swap_info[i].inuse_pages)
				812	return;
				813	spin_lock(&mmlist_lock);
				814	list_for_each_safe(p, next, &init_mm.mmlist)
				815	list_del_init(p);
				816	spin_unlock(&mmlist_lock);
				817	}
				818
				819	/*
				820	* Use this swapdev's extent info to locate the (PAGE_SIZE) block which
				821	* corresponds to page offset `offset'.
				822	*/
				823	sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
				824	{
				825	struct swap_extent *se = sis->curr_swap_extent;
				826	struct swap_extent *start_se = se;
				827
				828	for ( ; ; ) {
				829	struct list_head *lh;
				830
				831	if (se->start_page <= offset &&
				832	offset < (se->start_page + se->nr_pages)) {
				833	return se->start_block + (offset - se->start_page);
				834	}
				835	lh = se->list.prev;
				836	if (lh == &sis->extent_list)
				837	lh = lh->prev;
				838	se = list_entry(lh, struct swap_extent, list);
				839	sis->curr_swap_extent = se;
				840	BUG_ON(se == start_se); /* It must be present */
				841	}
				842	}
				843
				844	/*
				845	* Free all of a swapdev's extent information
				846	*/
				847	static void destroy_swap_extents(struct swap_info_struct *sis)
				848	{
				849	while (!list_empty(&sis->extent_list)) {
				850	struct swap_extent *se;
				851
				852	se = list_entry(sis->extent_list.next,
				853	struct swap_extent, list);
				854	list_del(&se->list);
				855	kfree(se);
				856	}
				857	sis->nr_extents = 0;
				858	}
				859
				860	/*
				861	* Add a block range (and the corresponding page range) into this swapdev's
				862	* extent list. The extent list is kept sorted in block order.
				863	*
				864	* This function rather assumes that it is called in ascending sector_t order.
				865	* It doesn't look for extent coalescing opportunities.
				866	*/
				867	static int
				868	add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
				869	unsigned long nr_pages, sector_t start_block)
				870	{
				871	struct swap_extent *se;
				872	struct swap_extent *new_se;
				873	struct list_head *lh;
				874
				875	lh = sis->extent_list.next; /* The highest-addressed block */
				876	while (lh != &sis->extent_list) {
				877	se = list_entry(lh, struct swap_extent, list);
				878	if (se->start_block + se->nr_pages == start_block &&
				879	se->start_page + se->nr_pages == start_page) {
				880	/* Merge it */
				881	se->nr_pages += nr_pages;
				882	return 0;
				883	}
				884	lh = lh->next;
				885	}
				886
				887	/*
				888	* No merge. Insert a new extent, preserving ordering.
				889	*/
				890	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
				891	if (new_se == NULL)
				892	return -ENOMEM;
				893	new_se->start_page = start_page;
				894	new_se->nr_pages = nr_pages;
				895	new_se->start_block = start_block;
				896
				897	lh = sis->extent_list.prev; /* The lowest block */
				898	while (lh != &sis->extent_list) {
				899	se = list_entry(lh, struct swap_extent, list);
				900	if (se->start_block > start_block)
				901	break;
				902	lh = lh->prev;
				903	}
				904	list_add_tail(&new_se->list, lh);
				905	sis->nr_extents++;
				906	return 0;
				907	}
				908
				909	/*
				910	* A `swap extent' is a simple thing which maps a contiguous range of pages
				911	* onto a contiguous range of disk blocks. An ordered list of swap extents
				912	* is built at swapon time and is then used at swap_writepage/swap_readpage
				913	* time for locating where on disk a page belongs.
				914	*
				915	* If the swapfile is an S_ISBLK block device, a single extent is installed.
				916	* This is done so that the main operating code can treat S_ISBLK and S_ISREG
				917	* swap files identically.
				918	*
				919	* Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
				920	* extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK
				921	* swapfiles are handled identically after swapon time.
				922	*
				923	* For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
				924	* and will parse them into an ordered extent list, in PAGE_SIZE chunks. If
				925	* some stray blocks are found which do not fall within the PAGE_SIZE alignment
				926	* requirements, they are simply tossed out - we will never use those blocks
				927	* for swapping.
				928	*
Hugh Dickins	b0d9bcd	2005-09-03 15:54:31 -0700	[diff] [blame]	929	* For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	930	* prevents root from shooting her foot off by ftruncating an in-use swapfile,
				931	* which will scribble on the fs.
				932	*
				933	* The amount of disk space which a single swap extent represents varies.
				934	* Typically it is in the 1-4 megabyte range. So we can have hundreds of
				935	* extents in the list. To avoid much list walking, we cache the previous
				936	* search location in `curr_swap_extent', and start new searches from there.
				937	* This is extremely effective. The average number of iterations in
				938	* map_swap_page() has been measured at about 0.3 per page. - akpm.
				939	*/
				940	static int setup_swap_extents(struct swap_info_struct *sis)
				941	{
				942	struct inode *inode;
				943	unsigned blocks_per_page;
				944	unsigned long page_no;
				945	unsigned blkbits;
				946	sector_t probe_block;
				947	sector_t last_block;
				948	int ret;
				949
				950	inode = sis->swap_file->f_mapping->host;
				951	if (S_ISBLK(inode->i_mode)) {
				952	ret = add_swap_extent(sis, 0, sis->max, 0);
				953	goto done;
				954	}
				955
				956	blkbits = inode->i_blkbits;
				957	blocks_per_page = PAGE_SIZE >> blkbits;
				958
				959	/*
				960	* Map all the blocks into the extent list. This code doesn't try
				961	* to be very smart.
				962	*/
				963	probe_block = 0;
				964	page_no = 0;
				965	last_block = i_size_read(inode) >> blkbits;
				966	while ((probe_block + blocks_per_page) <= last_block &&
				967	page_no < sis->max) {
				968	unsigned block_in_page;
				969	sector_t first_block;
				970
				971	first_block = bmap(inode, probe_block);
				972	if (first_block == 0)
				973	goto bad_bmap;
				974
				975	/*
				976	* It must be PAGE_SIZE aligned on-disk
				977	*/
				978	if (first_block & (blocks_per_page - 1)) {
				979	probe_block++;
				980	goto reprobe;
				981	}
				982
				983	for (block_in_page = 1; block_in_page < blocks_per_page;
				984	block_in_page++) {
				985	sector_t block;
				986
				987	block = bmap(inode, probe_block + block_in_page);
				988	if (block == 0)
				989	goto bad_bmap;
				990	if (block != first_block + block_in_page) {
				991	/* Discontiguity */
				992	probe_block++;
				993	goto reprobe;
				994	}
				995	}
				996
				997	/*
				998	* We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
				999	*/
				1000	ret = add_swap_extent(sis, page_no, 1,
				1001	first_block >> (PAGE_SHIFT - blkbits));
				1002	if (ret)
				1003	goto out;
				1004	page_no++;
				1005	probe_block += blocks_per_page;
				1006	reprobe:
				1007	continue;
				1008	}
				1009	ret = 0;
				1010	if (page_no == 0)
Hugh Dickins	e2244ec	2005-09-03 15:54:32 -0700	[diff] [blame]	1011	page_no = 1; /* force Empty message */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1012	sis->max = page_no;
Hugh Dickins	e2244ec	2005-09-03 15:54:32 -0700	[diff] [blame]	1013	sis->pages = page_no - 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1014	sis->highest_bit = page_no - 1;
				1015	done:
				1016	sis->curr_swap_extent = list_entry(sis->extent_list.prev,
				1017	struct swap_extent, list);
				1018	goto out;
				1019	bad_bmap:
				1020	printk(KERN_ERR "swapon: swapfile has holes\n");
				1021	ret = -EINVAL;
				1022	out:
				1023	return ret;
				1024	}
				1025
				1026	#if 0 /* We don't need this yet */
				1027	#include <linux/backing-dev.h>
				1028	int page_queue_congested(struct page *page)
				1029	{
				1030	struct backing_dev_info *bdi;
				1031
				1032	BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
				1033
				1034	if (PageSwapCache(page)) {
				1035	swp_entry_t entry = { .val = page->private };
				1036	struct swap_info_struct *sis;
				1037
				1038	sis = get_swap_info_struct(swp_type(entry));
				1039	bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
				1040	} else
				1041	bdi = page->mapping->backing_dev_info;
				1042	return bdi_write_congested(bdi);
				1043	}
				1044	#endif
				1045
				1046	asmlinkage long sys_swapoff(const char __user * specialfile)
				1047	{
				1048	struct swap_info_struct * p = NULL;
				1049	unsigned short *swap_map;
				1050	struct file swap_file, victim;
				1051	struct address_space *mapping;
				1052	struct inode *inode;
				1053	char * pathname;
				1054	int i, type, prev;
				1055	int err;
				1056
				1057	if (!capable(CAP_SYS_ADMIN))
				1058	return -EPERM;
				1059
				1060	pathname = getname(specialfile);
				1061	err = PTR_ERR(pathname);
				1062	if (IS_ERR(pathname))
				1063	goto out;
				1064
				1065	victim = filp_open(pathname, O_RDWR\|O_LARGEFILE, 0);
				1066	putname(pathname);
				1067	err = PTR_ERR(victim);
				1068	if (IS_ERR(victim))
				1069	goto out;
				1070
				1071	mapping = victim->f_mapping;
				1072	prev = -1;
				1073	swap_list_lock();
				1074	for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
				1075	p = swap_info + type;
				1076	if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
				1077	if (p->swap_file->f_mapping == mapping)
				1078	break;
				1079	}
				1080	prev = type;
				1081	}
				1082	if (type < 0) {
				1083	err = -EINVAL;
				1084	swap_list_unlock();
				1085	goto out_dput;
				1086	}
				1087	if (!security_vm_enough_memory(p->pages))
				1088	vm_unacct_memory(p->pages);
				1089	else {
				1090	err = -ENOMEM;
				1091	swap_list_unlock();
				1092	goto out_dput;
				1093	}
				1094	if (prev < 0) {
				1095	swap_list.head = p->next;
				1096	} else {
				1097	swap_info[prev].next = p->next;
				1098	}
				1099	if (type == swap_list.next) {
				1100	/* just pick something that's safe... */
				1101	swap_list.next = swap_list.head;
				1102	}
				1103	nr_swap_pages -= p->pages;
				1104	total_swap_pages -= p->pages;
				1105	p->flags &= ~SWP_WRITEOK;
				1106	swap_list_unlock();
				1107	current->flags \|= PF_SWAPOFF;
				1108	err = try_to_unuse(type);
				1109	current->flags &= ~PF_SWAPOFF;
				1110
				1111	/* wait for any unplug function to finish */
				1112	down_write(&swap_unplug_sem);
				1113	up_write(&swap_unplug_sem);
				1114
				1115	if (err) {
				1116	/* re-insert swap space back into swap_list */
				1117	swap_list_lock();
				1118	for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
				1119	if (p->prio >= swap_info[i].prio)
				1120	break;
				1121	p->next = i;
				1122	if (prev < 0)
				1123	swap_list.head = swap_list.next = p - swap_info;
				1124	else
				1125	swap_info[prev].next = p - swap_info;
				1126	nr_swap_pages += p->pages;
				1127	total_swap_pages += p->pages;
				1128	p->flags \|= SWP_WRITEOK;
				1129	swap_list_unlock();
				1130	goto out_dput;
				1131	}
Hugh Dickins	4cd3bb1	2005-09-03 15:54:33 -0700	[diff] [blame^]	1132	destroy_swap_extents(p);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1133	down(&swapon_sem);
				1134	swap_list_lock();
				1135	drain_mmlist();
				1136	swap_device_lock(p);
				1137	swap_file = p->swap_file;
				1138	p->swap_file = NULL;
				1139	p->max = 0;
				1140	swap_map = p->swap_map;
				1141	p->swap_map = NULL;
				1142	p->flags = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1143	swap_device_unlock(p);
				1144	swap_list_unlock();
				1145	up(&swapon_sem);
				1146	vfree(swap_map);
				1147	inode = mapping->host;
				1148	if (S_ISBLK(inode->i_mode)) {
				1149	struct block_device *bdev = I_BDEV(inode);
				1150	set_blocksize(bdev, p->old_block_size);
				1151	bd_release(bdev);
				1152	} else {
				1153	down(&inode->i_sem);
				1154	inode->i_flags &= ~S_SWAPFILE;
				1155	up(&inode->i_sem);
				1156	}
				1157	filp_close(swap_file, NULL);
				1158	err = 0;
				1159
				1160	out_dput:
				1161	filp_close(victim, NULL);
				1162	out:
				1163	return err;
				1164	}
				1165
				1166	#ifdef CONFIG_PROC_FS
				1167	/* iterator */
				1168	static void swap_start(struct seq_file swap, loff_t *pos)
				1169	{
				1170	struct swap_info_struct *ptr = swap_info;
				1171	int i;
				1172	loff_t l = *pos;
				1173
				1174	down(&swapon_sem);
				1175
				1176	for (i = 0; i < nr_swapfiles; i++, ptr++) {
				1177	if (!(ptr->flags & SWP_USED) \|\| !ptr->swap_map)
				1178	continue;
				1179	if (!l--)
				1180	return ptr;
				1181	}
				1182
				1183	return NULL;
				1184	}
				1185
				1186	static void swap_next(struct seq_file swap, void v, loff_t pos)
				1187	{
				1188	struct swap_info_struct *ptr = v;
				1189	struct swap_info_struct *endptr = swap_info + nr_swapfiles;
				1190
				1191	for (++ptr; ptr < endptr; ptr++) {
				1192	if (!(ptr->flags & SWP_USED) \|\| !ptr->swap_map)
				1193	continue;
				1194	++*pos;
				1195	return ptr;
				1196	}
				1197
				1198	return NULL;
				1199	}
				1200
				1201	static void swap_stop(struct seq_file swap, void v)
				1202	{
				1203	up(&swapon_sem);
				1204	}
				1205
				1206	static int swap_show(struct seq_file swap, void v)
				1207	{
				1208	struct swap_info_struct *ptr = v;
				1209	struct file *file;
				1210	int len;
				1211
				1212	if (v == swap_info)
				1213	seq_puts(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
				1214
				1215	file = ptr->swap_file;
				1216	len = seq_path(swap, file->f_vfsmnt, file->f_dentry, " \t\n\\");
				1217	seq_printf(swap, "%*s%s\t%d\t%ld\t%d\n",
				1218	len < 40 ? 40 - len : 1, " ",
				1219	S_ISBLK(file->f_dentry->d_inode->i_mode) ?
				1220	"partition" : "file\t",
				1221	ptr->pages << (PAGE_SHIFT - 10),
				1222	ptr->inuse_pages << (PAGE_SHIFT - 10),
				1223	ptr->prio);
				1224	return 0;
				1225	}
				1226
				1227	static struct seq_operations swaps_op = {
				1228	.start = swap_start,
				1229	.next = swap_next,
				1230	.stop = swap_stop,
				1231	.show = swap_show
				1232	};
				1233
				1234	static int swaps_open(struct inode inode, struct file file)
				1235	{
				1236	return seq_open(file, &swaps_op);
				1237	}
				1238
				1239	static struct file_operations proc_swaps_operations = {
				1240	.open = swaps_open,
				1241	.read = seq_read,
				1242	.llseek = seq_lseek,
				1243	.release = seq_release,
				1244	};
				1245
				1246	static int __init procswaps_init(void)
				1247	{
				1248	struct proc_dir_entry *entry;
				1249
				1250	entry = create_proc_entry("swaps", 0, NULL);
				1251	if (entry)
				1252	entry->proc_fops = &proc_swaps_operations;
				1253	return 0;
				1254	}
				1255	__initcall(procswaps_init);
				1256	#endif /* CONFIG_PROC_FS */
				1257
				1258	/*
				1259	* Written 01/25/92 by Simmule Turner, heavily changed by Linus.
				1260	*
				1261	* The swapon system call
				1262	*/
				1263	asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
				1264	{
				1265	struct swap_info_struct * p;
				1266	char *name = NULL;
				1267	struct block_device *bdev = NULL;
				1268	struct file *swap_file = NULL;
				1269	struct address_space *mapping;
				1270	unsigned int type;
				1271	int i, prev;
				1272	int error;
				1273	static int least_priority;
				1274	union swap_header *swap_header = NULL;
				1275	int swap_header_version;
				1276	int nr_good_pages = 0;
				1277	unsigned long maxpages = 1;
				1278	int swapfilesize;
				1279	unsigned short *swap_map;
				1280	struct page *page = NULL;
				1281	struct inode *inode = NULL;
				1282	int did_down = 0;
				1283
				1284	if (!capable(CAP_SYS_ADMIN))
				1285	return -EPERM;
				1286	swap_list_lock();
				1287	p = swap_info;
				1288	for (type = 0 ; type < nr_swapfiles ; type++,p++)
				1289	if (!(p->flags & SWP_USED))
				1290	break;
				1291	error = -EPERM;
				1292	/*
				1293	* Test if adding another swap device is possible. There are
				1294	* two limiting factors: 1) the number of bits for the swap
				1295	* type swp_entry_t definition and 2) the number of bits for
				1296	* the swap type in the swap ptes as defined by the different
				1297	* architectures. To honor both limitations a swap entry
				1298	* with swap offset 0 and swap type ~0UL is created, encoded
				1299	* to a swap pte, decoded to a swp_entry_t again and finally
				1300	* the swap type part is extracted. This will mask all bits
				1301	* from the initial ~0UL that can't be encoded in either the
				1302	* swp_entry_t or the architecture definition of a swap pte.
				1303	*/
				1304	if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) {
				1305	swap_list_unlock();
				1306	goto out;
				1307	}
				1308	if (type >= nr_swapfiles)
				1309	nr_swapfiles = type+1;
				1310	INIT_LIST_HEAD(&p->extent_list);
				1311	p->flags = SWP_USED;
				1312	p->nr_extents = 0;
				1313	p->swap_file = NULL;
				1314	p->old_block_size = 0;
				1315	p->swap_map = NULL;
				1316	p->lowest_bit = 0;
				1317	p->highest_bit = 0;
				1318	p->cluster_nr = 0;
				1319	p->inuse_pages = 0;
				1320	spin_lock_init(&p->sdev_lock);
				1321	p->next = -1;
				1322	if (swap_flags & SWAP_FLAG_PREFER) {
				1323	p->prio =
				1324	(swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
				1325	} else {
				1326	p->prio = --least_priority;
				1327	}
				1328	swap_list_unlock();
				1329	name = getname(specialfile);
				1330	error = PTR_ERR(name);
				1331	if (IS_ERR(name)) {
				1332	name = NULL;
				1333	goto bad_swap_2;
				1334	}
				1335	swap_file = filp_open(name, O_RDWR\|O_LARGEFILE, 0);
				1336	error = PTR_ERR(swap_file);
				1337	if (IS_ERR(swap_file)) {
				1338	swap_file = NULL;
				1339	goto bad_swap_2;
				1340	}
				1341
				1342	p->swap_file = swap_file;
				1343	mapping = swap_file->f_mapping;
				1344	inode = mapping->host;
				1345
				1346	error = -EBUSY;
				1347	for (i = 0; i < nr_swapfiles; i++) {
				1348	struct swap_info_struct *q = &swap_info[i];
				1349
				1350	if (i == type \|\| !q->swap_file)
				1351	continue;
				1352	if (mapping == q->swap_file->f_mapping)
				1353	goto bad_swap;
				1354	}
				1355
				1356	error = -EINVAL;
				1357	if (S_ISBLK(inode->i_mode)) {
				1358	bdev = I_BDEV(inode);
				1359	error = bd_claim(bdev, sys_swapon);
				1360	if (error < 0) {
				1361	bdev = NULL;
				1362	goto bad_swap;
				1363	}
				1364	p->old_block_size = block_size(bdev);
				1365	error = set_blocksize(bdev, PAGE_SIZE);
				1366	if (error < 0)
				1367	goto bad_swap;
				1368	p->bdev = bdev;
				1369	} else if (S_ISREG(inode->i_mode)) {
				1370	p->bdev = inode->i_sb->s_bdev;
				1371	down(&inode->i_sem);
				1372	did_down = 1;
				1373	if (IS_SWAPFILE(inode)) {
				1374	error = -EBUSY;
				1375	goto bad_swap;
				1376	}
				1377	} else {
				1378	goto bad_swap;
				1379	}
				1380
				1381	swapfilesize = i_size_read(inode) >> PAGE_SHIFT;
				1382
				1383	/*
				1384	* Read the swap header.
				1385	*/
				1386	if (!mapping->a_ops->readpage) {
				1387	error = -EINVAL;
				1388	goto bad_swap;
				1389	}
				1390	page = read_cache_page(mapping, 0,
				1391	(filler_t *)mapping->a_ops->readpage, swap_file);
				1392	if (IS_ERR(page)) {
				1393	error = PTR_ERR(page);
				1394	goto bad_swap;
				1395	}
				1396	wait_on_page_locked(page);
				1397	if (!PageUptodate(page))
				1398	goto bad_swap;
				1399	kmap(page);
				1400	swap_header = page_address(page);
				1401
				1402	if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
				1403	swap_header_version = 1;
				1404	else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
				1405	swap_header_version = 2;
				1406	else {
				1407	printk("Unable to find swap-space signature\n");
				1408	error = -EINVAL;
				1409	goto bad_swap;
				1410	}
				1411
				1412	switch (swap_header_version) {
				1413	case 1:
				1414	printk(KERN_ERR "version 0 swap is no longer supported. "
				1415	"Use mkswap -v1 %s\n", name);
				1416	error = -EINVAL;
				1417	goto bad_swap;
				1418	case 2:
				1419	/* Check the swap header's sub-version and the size of
				1420	the swap file and bad block lists */
				1421	if (swap_header->info.version != 1) {
				1422	printk(KERN_WARNING
				1423	"Unable to handle swap header version %d\n",
				1424	swap_header->info.version);
				1425	error = -EINVAL;
				1426	goto bad_swap;
				1427	}
				1428
				1429	p->lowest_bit = 1;
				1430	/*
				1431	* Find out how many pages are allowed for a single swap
				1432	* device. There are two limiting factors: 1) the number of
				1433	* bits for the swap offset in the swp_entry_t type and
				1434	* 2) the number of bits in the a swap pte as defined by
				1435	* the different architectures. In order to find the
				1436	* largest possible bit mask a swap entry with swap type 0
				1437	* and swap offset ~0UL is created, encoded to a swap pte,
				1438	* decoded to a swp_entry_t again and finally the swap
				1439	* offset is extracted. This will mask all the bits from
				1440	* the initial ~0UL mask that can't be encoded in either
				1441	* the swp_entry_t or the architecture definition of a
				1442	* swap pte.
				1443	*/
				1444	maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1;
				1445	if (maxpages > swap_header->info.last_page)
				1446	maxpages = swap_header->info.last_page;
				1447	p->highest_bit = maxpages - 1;
				1448
				1449	error = -EINVAL;
Hugh Dickins	e2244ec	2005-09-03 15:54:32 -0700	[diff] [blame]	1450	if (!maxpages)
				1451	goto bad_swap;
				1452	if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
				1453	goto bad_swap;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1454	if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
				1455	goto bad_swap;
				1456
				1457	/* OK, set up the swap map and apply the bad block list */
				1458	if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
				1459	error = -ENOMEM;
				1460	goto bad_swap;
				1461	}
				1462
				1463	error = 0;
				1464	memset(p->swap_map, 0, maxpages * sizeof(short));
				1465	for (i=0; i<swap_header->info.nr_badpages; i++) {
				1466	int page = swap_header->info.badpages[i];
				1467	if (page <= 0 \|\| page >= swap_header->info.last_page)
				1468	error = -EINVAL;
				1469	else
				1470	p->swap_map[page] = SWAP_MAP_BAD;
				1471	}
				1472	nr_good_pages = swap_header->info.last_page -
				1473	swap_header->info.nr_badpages -
				1474	1 /* header page */;
				1475	if (error)
				1476	goto bad_swap;
				1477	}
Hugh Dickins	e2244ec	2005-09-03 15:54:32 -0700	[diff] [blame]	1478
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1479	if (swapfilesize && maxpages > swapfilesize) {
				1480	printk(KERN_WARNING
				1481	"Swap area shorter than signature indicates\n");
				1482	error = -EINVAL;
				1483	goto bad_swap;
				1484	}
Hugh Dickins	e2244ec	2005-09-03 15:54:32 -0700	[diff] [blame]	1485	if (nr_good_pages) {
				1486	p->swap_map[0] = SWAP_MAP_BAD;
				1487	p->max = maxpages;
				1488	p->pages = nr_good_pages;
				1489	error = setup_swap_extents(p);
				1490	if (error)
				1491	goto bad_swap;
				1492	nr_good_pages = p->pages;
				1493	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1494	if (!nr_good_pages) {
				1495	printk(KERN_WARNING "Empty swap-file\n");
				1496	error = -EINVAL;
				1497	goto bad_swap;
				1498	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1499
				1500	down(&swapon_sem);
				1501	swap_list_lock();
				1502	swap_device_lock(p);
				1503	p->flags = SWP_ACTIVE;
				1504	nr_swap_pages += nr_good_pages;
				1505	total_swap_pages += nr_good_pages;
				1506	printk(KERN_INFO "Adding %dk swap on %s. Priority:%d extents:%d\n",
				1507	nr_good_pages<<(PAGE_SHIFT-10), name,
				1508	p->prio, p->nr_extents);
				1509
				1510	/* insert swap space into swap_list: */
				1511	prev = -1;
				1512	for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
				1513	if (p->prio >= swap_info[i].prio) {
				1514	break;
				1515	}
				1516	prev = i;
				1517	}
				1518	p->next = i;
				1519	if (prev < 0) {
				1520	swap_list.head = swap_list.next = p - swap_info;
				1521	} else {
				1522	swap_info[prev].next = p - swap_info;
				1523	}
				1524	swap_device_unlock(p);
				1525	swap_list_unlock();
				1526	up(&swapon_sem);
				1527	error = 0;
				1528	goto out;
				1529	bad_swap:
				1530	if (bdev) {
				1531	set_blocksize(bdev, p->old_block_size);
				1532	bd_release(bdev);
				1533	}
Hugh Dickins	4cd3bb1	2005-09-03 15:54:33 -0700	[diff] [blame^]	1534	destroy_swap_extents(p);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1535	bad_swap_2:
				1536	swap_list_lock();
				1537	swap_map = p->swap_map;
				1538	p->swap_file = NULL;
				1539	p->swap_map = NULL;
				1540	p->flags = 0;
				1541	if (!(swap_flags & SWAP_FLAG_PREFER))
				1542	++least_priority;
				1543	swap_list_unlock();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1544	vfree(swap_map);
				1545	if (swap_file)
				1546	filp_close(swap_file, NULL);
				1547	out:
				1548	if (page && !IS_ERR(page)) {
				1549	kunmap(page);
				1550	page_cache_release(page);
				1551	}
				1552	if (name)
				1553	putname(name);
				1554	if (did_down) {
				1555	if (!error)
				1556	inode->i_flags \|= S_SWAPFILE;
				1557	up(&inode->i_sem);
				1558	}
				1559	return error;
				1560	}
				1561
				1562	void si_swapinfo(struct sysinfo *val)
				1563	{
				1564	unsigned int i;
				1565	unsigned long nr_to_be_unused = 0;
				1566
				1567	swap_list_lock();
				1568	for (i = 0; i < nr_swapfiles; i++) {
				1569	if (!(swap_info[i].flags & SWP_USED) \|\|
				1570	(swap_info[i].flags & SWP_WRITEOK))
				1571	continue;
				1572	nr_to_be_unused += swap_info[i].inuse_pages;
				1573	}
				1574	val->freeswap = nr_swap_pages + nr_to_be_unused;
				1575	val->totalswap = total_swap_pages + nr_to_be_unused;
				1576	swap_list_unlock();
				1577	}
				1578
				1579	/*
				1580	* Verify that a swap entry is valid and increment its swap map count.
				1581	*
				1582	* Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
				1583	* "permanent", but will be reclaimed by the next swapoff.
				1584	*/
				1585	int swap_duplicate(swp_entry_t entry)
				1586	{
				1587	struct swap_info_struct * p;
				1588	unsigned long offset, type;
				1589	int result = 0;
				1590
				1591	type = swp_type(entry);
				1592	if (type >= nr_swapfiles)
				1593	goto bad_file;
				1594	p = type + swap_info;
				1595	offset = swp_offset(entry);
				1596
				1597	swap_device_lock(p);
				1598	if (offset < p->max && p->swap_map[offset]) {
				1599	if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
				1600	p->swap_map[offset]++;
				1601	result = 1;
				1602	} else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
				1603	if (swap_overflow++ < 5)
				1604	printk(KERN_WARNING "swap_dup: swap entry overflow\n");
				1605	p->swap_map[offset] = SWAP_MAP_MAX;
				1606	result = 1;
				1607	}
				1608	}
				1609	swap_device_unlock(p);
				1610	out:
				1611	return result;
				1612
				1613	bad_file:
				1614	printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
				1615	goto out;
				1616	}
				1617
				1618	struct swap_info_struct *
				1619	get_swap_info_struct(unsigned type)
				1620	{
				1621	return &swap_info[type];
				1622	}
				1623
				1624	/*
				1625	* swap_device_lock prevents swap_map being freed. Don't grab an extra
				1626	* reference on the swaphandle, it doesn't matter if it becomes unused.
				1627	*/
				1628	int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
				1629	{
				1630	int ret = 0, i = 1 << page_cluster;
				1631	unsigned long toff;
				1632	struct swap_info_struct *swapdev = swp_type(entry) + swap_info;
				1633
				1634	if (!page_cluster) /* no readahead */
				1635	return 0;
				1636	toff = (swp_offset(entry) >> page_cluster) << page_cluster;
				1637	if (!toff) /* first page is swap header */
				1638	toff++, i--;
				1639	*offset = toff;
				1640
				1641	swap_device_lock(swapdev);
				1642	do {
				1643	/* Don't read-ahead past the end of the swap area */
				1644	if (toff >= swapdev->max)
				1645	break;
				1646	/* Don't read in free or bad pages */
				1647	if (!swapdev->swap_map[toff])
				1648	break;
				1649	if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
				1650	break;
				1651	toff++;
				1652	ret++;
				1653	} while (--i);
				1654	swap_device_unlock(swapdev);
				1655	return ret;
				1656	}