Blame - mm/readahead.c - SHIFTPHONES/mainline/linux

blob: c094e4f5a25070e74e1119bdccc7560f57729655 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* mm/readahead.c - address_space-level file readahead.
				3	*
				4	* Copyright (C) 2002, Linus Torvalds
				5	*
				6	* 09Apr2002 akpm@zip.com.au
				7	* Initial version.
				8	*/
				9
				10	#include <linux/kernel.h>
				11	#include <linux/fs.h>
				12	#include <linux/mm.h>
				13	#include <linux/module.h>
				14	#include <linux/blkdev.h>
				15	#include <linux/backing-dev.h>
Andrew Morton	8bde37f	2006-12-10 02:19:40 -0800	[diff] [blame]	16	#include <linux/task_io_accounting_ops.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	17	#include <linux/pagevec.h>
				18
				19	void default_unplug_io_fn(struct backing_dev_info bdi, struct page page)
				20	{
				21	}
				22	EXPORT_SYMBOL(default_unplug_io_fn);
				23
/*
 * Convenient macros for min/max read-ahead pages.
 * Both convert a size given in kilobytes into a number of page-cache pages.
 * Note that MAX_RA_PAGES is rounded down, while MIN_RA_PAGES is rounded up.
 * The latter is necessary for systems with large page size (e.g. 64k).
 */
#define MAX_RA_PAGES (VM_MAX_READAHEAD*1024 / PAGE_CACHE_SIZE)
#define MIN_RA_PAGES DIV_ROUND_UP(VM_MIN_READAHEAD*1024, PAGE_CACHE_SIZE)
				31
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	32	struct backing_dev_info default_backing_dev_info = {
Fengguang Wu	f615bfc	2007-07-19 01:47:58 -0700	[diff] [blame]	33	.ra_pages = MAX_RA_PAGES,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	34	.state = 0,
				35	.capabilities = BDI_CAP_MAP_COPY,
				36	.unplug_io_fn = default_unplug_io_fn,
				37	};
				38	EXPORT_SYMBOL_GPL(default_backing_dev_info);
				39
				40	/*
				41	* Initialise a struct file's readahead state. Assumes that the caller has
				42	* memset *ra to zero.
				43	*/
				44	void
				45	file_ra_state_init(struct file_ra_state ra, struct address_space mapping)
				46	{
				47	ra->ra_pages = mapping->backing_dev_info->ra_pages;
Jan Kara	6ce745e	2007-05-06 14:49:26 -0700	[diff] [blame]	48	ra->prev_index = -1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	49	}
Steven Whitehouse	d41cc70	2006-01-30 08:53:33 +0000	[diff] [blame]	50	EXPORT_SYMBOL_GPL(file_ra_state_init);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	51
				52	/*
				53	* Return max readahead size for this inode in number-of-pages.
				54	*/
				55	static inline unsigned long get_max_readahead(struct file_ra_state *ra)
				56	{
				57	return ra->ra_pages;
				58	}
				59
				60	static inline unsigned long get_min_readahead(struct file_ra_state *ra)
				61	{
Fengguang Wu	f615bfc	2007-07-19 01:47:58 -0700	[diff] [blame]	62	return MIN_RA_PAGES;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	63	}
				64
Oleg Nesterov	a564da3	2006-03-22 00:08:47 -0800	[diff] [blame]	65	static inline void reset_ahead_window(struct file_ra_state *ra)
				66	{
				67	/*
				68	* ... but preserve ahead_start + ahead_size value,
				69	* see 'recheck:' label in page_cache_readahead().
				70	* Note: We never use ->ahead_size as rvalue without
				71	* checking ->ahead_start != 0 first.
				72	*/
				73	ra->ahead_size += ra->ahead_start;
				74	ra->ahead_start = 0;
				75	}
				76
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	77	static inline void ra_off(struct file_ra_state *ra)
				78	{
				79	ra->start = 0;
				80	ra->flags = 0;
				81	ra->size = 0;
Oleg Nesterov	a564da3	2006-03-22 00:08:47 -0800	[diff] [blame]	82	reset_ahead_window(ra);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	83	return;
				84	}
				85
				86	/*
				87	* Set the initial window size, round to next power of 2 and square
				88	* for small size, x 4 for medium, and x 2 for large
				89	* for 128k (32 page) max ra
				90	* 1-8 page = 32k initial, > 8 page = 128k initial
				91	*/
				92	static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
				93	{
				94	unsigned long newsize = roundup_pow_of_two(size);
				95
Steven Pratt	aed75ff	2006-03-22 00:08:48 -0800	[diff] [blame]	96	if (newsize <= max / 32)
				97	newsize = newsize * 4;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	98	else if (newsize <= max / 4)
Steven Pratt	aed75ff	2006-03-22 00:08:48 -0800	[diff] [blame]	99	newsize = newsize * 2;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	100	else
				101	newsize = max;
				102	return newsize;
				103	}
				104
				105	/*
				106	* Set the new window size, this is called only when I/O is to be submitted,
				107	* not for each call to readahead. If a cache miss occured, reduce next I/O
				108	* size, else increase depending on how close to max we are.
				109	*/
				110	static inline unsigned long get_next_ra_size(struct file_ra_state *ra)
				111	{
				112	unsigned long max = get_max_readahead(ra);
				113	unsigned long min = get_min_readahead(ra);
				114	unsigned long cur = ra->size;
				115	unsigned long newsize;
				116
				117	if (ra->flags & RA_FLAG_MISS) {
				118	ra->flags &= ~RA_FLAG_MISS;
				119	newsize = max((cur - 2), min);
				120	} else if (cur < max / 16) {
				121	newsize = 4 * cur;
				122	} else {
				123	newsize = 2 * cur;
				124	}
				125	return min(newsize, max);
				126	}
				127
/*
 * Return the struct page whose ->lru member is the last entry (head->prev)
 * of the list headed by @head.
 */
#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
				129
				130	/**
Randy Dunlap	bd40cdd	2006-06-25 05:48:08 -0700	[diff] [blame]	131	* read_cache_pages - populate an address space with some pages & start reads against them
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	132	* @mapping: the address_space
				133	* @pages: The address of a list_head which contains the target pages. These
				134	* pages have their ->index populated and are otherwise uninitialised.
				135	* @filler: callback routine for filling a single page.
				136	* @data: private data for the callback routine.
				137	*
				138	* Hides the details of the LRU cache etc from the filesystems.
				139	*/
				140	int read_cache_pages(struct address_space mapping, struct list_head pages,
				141	int (filler)(void , struct page ), void data)
				142	{
				143	struct page *page;
				144	struct pagevec lru_pvec;
				145	int ret = 0;
				146
				147	pagevec_init(&lru_pvec, 0);
				148
				149	while (!list_empty(pages)) {
				150	page = list_to_page(pages);
				151	list_del(&page->lru);
				152	if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
				153	page_cache_release(page);
				154	continue;
				155	}
				156	ret = filler(data, page);
				157	if (!pagevec_add(&lru_pvec, page))
				158	__pagevec_lru_add(&lru_pvec);
				159	if (ret) {
OGAWA Hirofumi	38da288	2006-12-06 20:36:46 -0800	[diff] [blame]	160	put_pages_list(pages);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	161	break;
				162	}
Andrew Morton	8bde37f	2006-12-10 02:19:40 -0800	[diff] [blame]	163	task_io_account_read(PAGE_CACHE_SIZE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	164	}
				165	pagevec_lru_add(&lru_pvec);
				166	return ret;
				167	}
				168
				169	EXPORT_SYMBOL(read_cache_pages);
				170
				171	static int read_pages(struct address_space mapping, struct file filp,
				172	struct list_head *pages, unsigned nr_pages)
				173	{
				174	unsigned page_idx;
				175	struct pagevec lru_pvec;
Zach Brown	994fc28c	2005-12-15 14:28:17 -0800	[diff] [blame]	176	int ret;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	177
				178	if (mapping->a_ops->readpages) {
				179	ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
OGAWA Hirofumi	029e332	2006-11-02 22:07:06 -0800	[diff] [blame]	180	/* Clean up the remaining pages */
				181	put_pages_list(pages);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	182	goto out;
				183	}
				184
				185	pagevec_init(&lru_pvec, 0);
				186	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
				187	struct page *page = list_to_page(pages);
				188	list_del(&page->lru);
				189	if (!add_to_page_cache(page, mapping,
				190	page->index, GFP_KERNEL)) {
Zach Brown	9f1a3cf	2006-06-25 05:46:46 -0700	[diff] [blame]	191	mapping->a_ops->readpage(filp, page);
				192	if (!pagevec_add(&lru_pvec, page))
				193	__pagevec_lru_add(&lru_pvec);
				194	} else
				195	page_cache_release(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	196	}
				197	pagevec_lru_add(&lru_pvec);
Zach Brown	994fc28c	2005-12-15 14:28:17 -0800	[diff] [blame]	198	ret = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	199	out:
				200	return ret;
				201	}
				202
				203	/*
				204	* Readahead design.
				205	*
				206	* The fields in struct file_ra_state represent the most-recently-executed
				207	* readahead attempt:
				208	*
				209	* start: Page index at which we started the readahead
				210	* size: Number of pages in that read
				211	* Together, these form the "current window".
				212	* Together, start and size represent the `readahead window'.
Jan Kara	6ce745e	2007-05-06 14:49:26 -0700	[diff] [blame]	213	* prev_index: The page which the readahead algorithm most-recently inspected.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	214	* It is mainly used to detect sequential file reading.
				215	* If page_cache_readahead sees that it is again being called for
				216	* a page which it just looked at, it can return immediately without
				217	* making any state changes.
Jan Kara	6ce745e	2007-05-06 14:49:26 -0700	[diff] [blame]	218	* offset: Offset in the prev_index where the last read ended - used for
Jan Kara	ec0f163	2007-05-06 14:49:25 -0700	[diff] [blame]	219	* detection of sequential file reading.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	220	* ahead_start,
				221	* ahead_size: Together, these form the "ahead window".
				222	* ra_pages: The externally controlled max readahead for this fd.
				223	*
				224	* When readahead is in the off state (size == 0), readahead is disabled.
Jan Kara	6ce745e	2007-05-06 14:49:26 -0700	[diff] [blame]	225	* In this state, prev_index is used to detect the resumption of sequential I/O.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	226	*
				227	* The readahead code manages two windows - the "current" and the "ahead"
				228	* windows. The intent is that while the application is walking the pages
				229	* in the current window, I/O is underway on the ahead window. When the
				230	* current window is fully traversed, it is replaced by the ahead window
				231	* and the ahead window is invalidated. When this copying happens, the
				232	* new current window's pages are probably still locked. So
				233	* we submit a new batch of I/O immediately, creating a new ahead window.
				234	*
				235	* So:
				236	*
 *   ----|----------------|----------------|-----
 *       ^start           ^start+size
 *                        ^ahead_start     ^ahead_start+ahead_size
 *
 *         ^ When this page is read, we submit I/O for the
 *           ahead window.
				243	*
 * A `readahead hit' occurs when a read request is made against a page which is
 * the next sequential page. Ahead window calculations are done only when it
 * is time to submit a new IO.  The code ramps up the size aggressively at first,
 * but slows down as it approaches max_readahead.
				248	*
 * Any seek/random IO will result in readahead being turned off.  It will resume
 * at the first sequential access.
				251	*
				252	* There is a special-case: if the first page which the application tries to
				253	* read happens to be the first page of the file, it is assumed that a linear
				254	* read is about to happen and the window is immediately set to the initial size
				255	* based on I/O request size and the max_readahead.
				256	*
				257	* This function is to be called for every read request, rather than when
				258	* it is time to perform readahead. It is called only once for the entire I/O
				259	* regardless of size unless readahead is unable to start enough I/O to satisfy
				260	* the request (I/O request > max_readahead).
				261	*/
				262
				263	/*
				264	* do_page_cache_readahead actually reads a chunk of disk. It allocates all
				265	* the pages first, then submits them all for I/O. This avoids the very bad
				266	* behaviour which would occur if page allocations are causing VM writeback.
				267	* We really don't want to intermingle reads and writes like that.
				268	*
				269	* Returns the number of pages requested, or the maximum amount of I/O allowed.
				270	*
				271	* do_page_cache_readahead() returns -1 if it encountered request queue
				272	* congestion.
				273	*/
				274	static int
				275	__do_page_cache_readahead(struct address_space mapping, struct file filp,
Fengguang Wu	46fc3e7	2007-07-19 01:47:57 -0700	[diff] [blame]	276	pgoff_t offset, unsigned long nr_to_read,
				277	unsigned long lookahead_size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	278	{
				279	struct inode *inode = mapping->host;
				280	struct page *page;
				281	unsigned long end_index; /* The last page we want to read */
				282	LIST_HEAD(page_pool);
				283	int page_idx;
				284	int ret = 0;
				285	loff_t isize = i_size_read(inode);
				286
				287	if (isize == 0)
				288	goto out;
				289
Fengguang Wu	46fc3e7	2007-07-19 01:47:57 -0700	[diff] [blame]	290	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	291
				292	/*
				293	* Preallocate as many pages as we will need.
				294	*/
				295	read_lock_irq(&mapping->tree_lock);
				296	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
Andrew Morton	7361f4d	2005-11-07 00:59:28 -0800	[diff] [blame]	297	pgoff_t page_offset = offset + page_idx;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	298
				299	if (page_offset > end_index)
				300	break;
				301
				302	page = radix_tree_lookup(&mapping->page_tree, page_offset);
				303	if (page)
				304	continue;
				305
				306	read_unlock_irq(&mapping->tree_lock);
				307	page = page_cache_alloc_cold(mapping);
				308	read_lock_irq(&mapping->tree_lock);
				309	if (!page)
				310	break;
				311	page->index = page_offset;
				312	list_add(&page->lru, &page_pool);
Fengguang Wu	46fc3e7	2007-07-19 01:47:57 -0700	[diff] [blame]	313	if (page_idx == nr_to_read - lookahead_size)
				314	SetPageReadahead(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	315	ret++;
				316	}
				317	read_unlock_irq(&mapping->tree_lock);
				318
				319	/*
				320	* Now start the IO. We ignore I/O errors - if the page is not
				321	* uptodate then the caller will launch readpage again, and
				322	* will then handle the error.
				323	*/
				324	if (ret)
				325	read_pages(mapping, filp, &page_pool, ret);
				326	BUG_ON(!list_empty(&page_pool));
				327	out:
				328	return ret;
				329	}
				330
				331	/*
				332	* Chunk the readahead into 2 megabyte units, so that we don't pin too much
				333	* memory at once.
				334	*/
				335	int force_page_cache_readahead(struct address_space mapping, struct file filp,
Andrew Morton	7361f4d	2005-11-07 00:59:28 -0800	[diff] [blame]	336	pgoff_t offset, unsigned long nr_to_read)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	337	{
				338	int ret = 0;
				339
				340	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
				341	return -EINVAL;
				342
				343	while (nr_to_read) {
				344	int err;
				345
				346	unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;
				347
				348	if (this_chunk > nr_to_read)
				349	this_chunk = nr_to_read;
				350	err = __do_page_cache_readahead(mapping, filp,
Fengguang Wu	46fc3e7	2007-07-19 01:47:57 -0700	[diff] [blame]	351	offset, this_chunk, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	352	if (err < 0) {
				353	ret = err;
				354	break;
				355	}
				356	ret += err;
				357	offset += this_chunk;
				358	nr_to_read -= this_chunk;
				359	}
				360	return ret;
				361	}
				362
				363	/*
				364	* Check how effective readahead is being. If the amount of started IO is
				365	* less than expected then the file is partly or fully in pagecache and
				366	* readahead isn't helping.
				367	*
				368	*/
				369	static inline int check_ra_success(struct file_ra_state *ra,
				370	unsigned long nr_to_read, unsigned long actual)
				371	{
				372	if (actual == 0) {
				373	ra->cache_hit += nr_to_read;
				374	if (ra->cache_hit >= VM_MAX_CACHE_HIT) {
				375	ra_off(ra);
				376	ra->flags \|= RA_FLAG_INCACHE;
				377	return 0;
				378	}
				379	} else {
				380	ra->cache_hit=0;
				381	}
				382	return 1;
				383	}
				384
				385	/*
				386	* This version skips the IO if the queue is read-congested, and will tell the
				387	* block layer to abandon the readahead if request allocation would block.
				388	*
				389	* force_page_cache_readahead() will ignore queue congestion and will block on
				390	* request queues.
				391	*/
				392	int do_page_cache_readahead(struct address_space mapping, struct file filp,
Andrew Morton	7361f4d	2005-11-07 00:59:28 -0800	[diff] [blame]	393	pgoff_t offset, unsigned long nr_to_read)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	394	{
				395	if (bdi_read_congested(mapping->backing_dev_info))
				396	return -1;
				397
Fengguang Wu	46fc3e7	2007-07-19 01:47:57 -0700	[diff] [blame]	398	return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	399	}
				400
				401	/*
				402	* Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block'
				403	* is set wait till the read completes. Otherwise attempt to read without
				404	* blocking.
Andreas Mohr	d6e05ed	2006-06-26 18:35:02 +0200	[diff] [blame]	405	* Returns 1 meaning 'success' if read is successful without switching off
				406	* readahead mode. Otherwise return failure.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	407	*/
				408	static int
				409	blockable_page_cache_readahead(struct address_space mapping, struct file filp,
Andrew Morton	7361f4d	2005-11-07 00:59:28 -0800	[diff] [blame]	410	pgoff_t offset, unsigned long nr_to_read,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	411	struct file_ra_state *ra, int block)
				412	{
				413	int actual;
				414
				415	if (!block && bdi_read_congested(mapping->backing_dev_info))
				416	return 0;
				417
Fengguang Wu	46fc3e7	2007-07-19 01:47:57 -0700	[diff] [blame]	418	actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	419
				420	return check_ra_success(ra, nr_to_read, actual);
				421	}
				422
				423	static int make_ahead_window(struct address_space mapping, struct file filp,
				424	struct file_ra_state *ra, int force)
				425	{
				426	int block, ret;
				427
				428	ra->ahead_size = get_next_ra_size(ra);
				429	ra->ahead_start = ra->start + ra->size;
				430
Jan Kara	6ce745e	2007-05-06 14:49:26 -0700	[diff] [blame]	431	block = force \|\| (ra->prev_index >= ra->ahead_start);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	432	ret = blockable_page_cache_readahead(mapping, filp,
				433	ra->ahead_start, ra->ahead_size, ra, block);
				434
				435	if (!ret && !force) {
				436	/* A read failure in blocking mode, implies pages are
				437	* all cached. So we can safely assume we have taken
				438	* care of all the pages requested in this call.
				439	* A read failure in non-blocking mode, implies we are
				440	* reading more pages than requested in this call. So
				441	* we safely assume we have taken care of all the pages
				442	* requested in this call.
				443	*
				444	* Just reset the ahead window in case we failed due to
				445	* congestion. The ahead window will any way be closed
				446	* in case we failed due to excessive page cache hits.
				447	*/
Oleg Nesterov	a564da3	2006-03-22 00:08:47 -0800	[diff] [blame]	448	reset_ahead_window(ra);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	449	}
				450
				451	return ret;
				452	}
				453
Andrew Morton	7361f4d	2005-11-07 00:59:28 -0800	[diff] [blame]	454	/**
				455	* page_cache_readahead - generic adaptive readahead
				456	* @mapping: address_space which holds the pagecache and I/O vectors
				457	* @ra: file_ra_state which holds the readahead state
				458	* @filp: passed on to ->readpage() and ->readpages()
				459	* @offset: start offset into @mapping, in PAGE_CACHE_SIZE units
				460	* @req_size: hint: total size of the read which the caller is performing in
				461	* PAGE_CACHE_SIZE units
				462	*
Fengguang Wu	46fc3e7	2007-07-19 01:47:57 -0700	[diff] [blame]	463	* page_cache_readahead() is the main function. It performs the adaptive
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	464	* readahead window size management and submits the readahead I/O.
Andrew Morton	7361f4d	2005-11-07 00:59:28 -0800	[diff] [blame]	465	*
				466	* Note that @filp is purely used for passing on to the ->readpage[s]()
				467	* handler: it may refer to a different file from @mapping (so we may not use
Josef Sipek	e9536ae	2006-12-08 02:37:21 -0800	[diff] [blame]	468	* @filp->f_mapping or @filp->f_path.dentry->d_inode here).
Andrew Morton	7361f4d	2005-11-07 00:59:28 -0800	[diff] [blame]	469	* Also, @ra may not be equal to &@filp->f_ra.
				470	*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	471	*/
				472	unsigned long
				473	page_cache_readahead(struct address_space mapping, struct file_ra_state ra,
Andrew Morton	7361f4d	2005-11-07 00:59:28 -0800	[diff] [blame]	474	struct file *filp, pgoff_t offset, unsigned long req_size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	475	{
				476	unsigned long max, newsize;
				477	int sequential;
				478
				479	/*
				480	* We avoid doing extra work and bogusly perturbing the readahead
				481	* window expansion logic.
				482	*/
Jan Kara	6ce745e	2007-05-06 14:49:26 -0700	[diff] [blame]	483	if (offset == ra->prev_index && --req_size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	484	++offset;
				485
Jan Kara	6ce745e	2007-05-06 14:49:26 -0700	[diff] [blame]	486	/* Note that prev_index == -1 if it is a first read */
				487	sequential = (offset == ra->prev_index + 1);
				488	ra->prev_index = offset;
				489	ra->prev_offset = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	490
				491	max = get_max_readahead(ra);
				492	newsize = min(req_size, max);
				493
				494	/* No readahead or sub-page sized read or file already in cache */
				495	if (newsize == 0 \|\| (ra->flags & RA_FLAG_INCACHE))
				496	goto out;
				497
Jan Kara	6ce745e	2007-05-06 14:49:26 -0700	[diff] [blame]	498	ra->prev_index += newsize - 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	499
				500	/*
				501	* Special case - first read at start of file. We'll assume it's
				502	* a whole-file read and grow the window fast. Or detect first
				503	* sequential access
				504	*/
				505	if (sequential && ra->size == 0) {
				506	ra->size = get_init_ra_size(newsize, max);
				507	ra->start = offset;
				508	if (!blockable_page_cache_readahead(mapping, filp, offset,
				509	ra->size, ra, 1))
				510	goto out;
				511
				512	/*
				513	* If the request size is larger than our max readahead, we
				514	* at least want to be sure that we get 2 IOs in flight and
				515	* we know that we will definitly need the new I/O.
				516	* once we do this, subsequent calls should be able to overlap
				517	* IOs,* thus preventing stalls. so issue the ahead window
				518	* immediately.
				519	*/
				520	if (req_size >= max)
				521	make_ahead_window(mapping, filp, ra, 1);
				522
				523	goto out;
				524	}
				525
				526	/*
				527	* Now handle the random case:
				528	* partial page reads and first access were handled above,
				529	* so this must be the next page otherwise it is random
				530	*/
				531	if (!sequential) {
				532	ra_off(ra);
				533	blockable_page_cache_readahead(mapping, filp, offset,
				534	newsize, ra, 1);
				535	goto out;
				536	}
				537
				538	/*
				539	* If we get here we are doing sequential IO and this was not the first
				540	* occurence (ie we have an existing window)
				541	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	542	if (ra->ahead_start == 0) { /* no ahead window yet */
				543	if (!make_ahead_window(mapping, filp, ra, 0))
Oleg Nesterov	a564da3	2006-03-22 00:08:47 -0800	[diff] [blame]	544	goto recheck;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	545	}
Oleg Nesterov	a564da3	2006-03-22 00:08:47 -0800	[diff] [blame]	546
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	547	/*
				548	* Already have an ahead window, check if we crossed into it.
				549	* If so, shift windows and issue a new ahead window.
				550	* Only return the #pages that are in the current window, so that
				551	* we get called back on the first page of the ahead window which
				552	* will allow us to submit more IO.
				553	*/
Jan Kara	6ce745e	2007-05-06 14:49:26 -0700	[diff] [blame]	554	if (ra->prev_index >= ra->ahead_start) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	555	ra->start = ra->ahead_start;
				556	ra->size = ra->ahead_size;
				557	make_ahead_window(mapping, filp, ra, 0);
Oleg Nesterov	a564da3	2006-03-22 00:08:47 -0800	[diff] [blame]	558	recheck:
Jan Kara	6ce745e	2007-05-06 14:49:26 -0700	[diff] [blame]	559	/* prev_index shouldn't overrun the ahead window */
				560	ra->prev_index = min(ra->prev_index,
Oleg Nesterov	a564da3	2006-03-22 00:08:47 -0800	[diff] [blame]	561	ra->ahead_start + ra->ahead_size - 1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	562	}
				563
				564	out:
Jan Kara	6ce745e	2007-05-06 14:49:26 -0700	[diff] [blame]	565	return ra->prev_index + 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	566	}
Andrew Morton	d8733c2	2006-03-23 03:00:11 -0800	[diff] [blame]	567	EXPORT_SYMBOL_GPL(page_cache_readahead);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	568
				569	/*
				570	* handle_ra_miss() is called when it is known that a page which should have
				571	* been present in the pagecache (we just did some readahead there) was in fact
				572	* not found. This will happen if it was evicted by the VM (readahead
				573	* thrashing)
				574	*
				575	* Turn on the cache miss flag in the RA struct, this will cause the RA code
				576	* to reduce the RA size on the next read.
				577	*/
				578	void handle_ra_miss(struct address_space *mapping,
				579	struct file_ra_state *ra, pgoff_t offset)
				580	{
				581	ra->flags \|= RA_FLAG_MISS;
				582	ra->flags &= ~RA_FLAG_INCACHE;
Steven Pratt	3b30bbd	2005-09-06 15:17:06 -0700	[diff] [blame]	583	ra->cache_hit = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	584	}
				585
				586	/*
				587	* Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
				588	* sensible upper limit.
				589	*/
				590	unsigned long max_sane_readahead(unsigned long nr)
				591	{
Christoph Lameter	05a0416	2007-02-10 01:43:05 -0800	[diff] [blame]	592	return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE)
				593	+ node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	594	}
Fengguang Wu	5ce1110	2007-07-19 01:47:59 -0700	[diff] [blame]	595
				596	/*
				597	* Submit IO for the read-ahead request in file_ra_state.
				598	*/
				599	unsigned long ra_submit(struct file_ra_state *ra,
				600	struct address_space mapping, struct file filp)
				601	{
				602	unsigned long ra_size;
				603	unsigned long la_size;
				604	int actual;
				605
				606	ra_size = ra_readahead_size(ra);
				607	la_size = ra_lookahead_size(ra);
				608	actual = __do_page_cache_readahead(mapping, filp,
				609	ra->ra_index, ra_size, la_size);
				610
				611	return actual;
				612	}
				613	EXPORT_SYMBOL_GPL(ra_submit);
Fengguang Wu	122a21d	2007-07-19 01:48:01 -0700	[diff] [blame]	614
				615	/*
				616	* Get the previous window size, ramp it up, and
				617	* return it as the new window size.
				618	*/
				619	static unsigned long get_next_ra_size2(struct file_ra_state *ra,
				620	unsigned long max)
				621	{
				622	unsigned long cur = ra->readahead_index - ra->ra_index;
				623	unsigned long newsize;
				624
				625	if (cur < max / 16)
				626	newsize = cur * 4;
				627	else
				628	newsize = cur * 2;
				629
				630	return min(newsize, max);
				631	}
				632
				633	/*
				634	* On-demand readahead design.
				635	*
				636	* The fields in struct file_ra_state represent the most-recently-executed
				637	* readahead attempt:
				638	*
 *                        |-------- last readahead window -------->|
 *       |-- application walking here -->|
 * ======#============|==================#=====================|
 *       ^la_index    ^ra_index          ^lookahead_index      ^readahead_index
				643	*
				644	* [ra_index, readahead_index) represents the last readahead window.
				645	*
				646	* [la_index, lookahead_index] is where the application would be walking(in
				647	* the common case of cache-cold sequential reads): the last window was
				648	* established when the application was at la_index, and the next window will
 * be brought in when the application reaches lookahead_index.
				650	*
				651	* To overlap application thinking time and disk I/O time, we do
				652	* `readahead pipelining': Do not wait until the application consumed all
				653	* readahead pages and stalled on the missing page at readahead_index;
				654	* Instead, submit an asynchronous readahead I/O as early as the application
				655	* reads on the page at lookahead_index. Normally lookahead_index will be
				656	* equal to ra_index, for maximum pipelining.
				657	*
				658	* In interleaved sequential reads, concurrent streams on the same fd can
				659	* be invalidating each other's readahead state. So we flag the new readahead
				660	* page at lookahead_index with PG_readahead, and use it as readahead
				661	* indicator. The flag won't be set on already cached pages, to avoid the
				662	* readahead-for-nothing fuss, saving pointless page cache lookups.
				663	*
				664	* prev_index tracks the last visited page in the _previous_ read request.
				665	* It should be maintained by the caller, and will be used for detecting
				666	* small random reads. Note that the readahead algorithm checks loosely
				667	* for sequential patterns. Hence interleaved reads might be served as
				668	* sequential ones.
				669	*
				670	* There is a special-case: if the first page which the application tries to
				671	* read happens to be the first page of the file, it is assumed that a linear
				672	* read is about to happen and the window is immediately set to the initial size
				673	* based on I/O request size and the max_readahead.
				674	*
 * The code ramps up the readahead size aggressively at first, but slows down as
 * it approaches max_readahead.
				677	*/
				678
				679	/*
				680	* A minimal readahead algorithm for trivial sequential/random reads.
				681	*/
				682	static unsigned long
				683	ondemand_readahead(struct address_space *mapping,
				684	struct file_ra_state ra, struct file filp,
				685	struct page *page, pgoff_t offset,
				686	unsigned long req_size)
				687	{
				688	unsigned long max; /* max readahead pages */
				689	pgoff_t ra_index; /* readahead index */
				690	unsigned long ra_size; /* readahead size */
				691	unsigned long la_size; /* lookahead size */
				692	int sequential;
				693
				694	max = ra->ra_pages;
				695	sequential = (offset - ra->prev_index <= 1UL) \|\| (req_size > max);
				696
				697	/*
				698	* Lookahead/readahead hit, assume sequential access.
				699	* Ramp up sizes, and push forward the readahead window.
				700	*/
				701	if (offset && (offset == ra->lookahead_index \|\|
				702	offset == ra->readahead_index)) {
				703	ra_index = ra->readahead_index;
				704	ra_size = get_next_ra_size2(ra, max);
				705	la_size = ra_size;
				706	goto fill_ra;
				707	}
				708
				709	/*
				710	* Standalone, small read.
				711	* Read as is, and do not pollute the readahead state.
				712	*/
				713	if (!page && !sequential) {
				714	return __do_page_cache_readahead(mapping, filp,
				715	offset, req_size, 0);
				716	}
				717
				718	/*
				719	* It may be one of
				720	* - first read on start of file
				721	* - sequential cache miss
				722	* - oversize random read
				723	* Start readahead for it.
				724	*/
				725	ra_index = offset;
				726	ra_size = get_init_ra_size(req_size, max);
				727	la_size = ra_size > req_size ? ra_size - req_size : ra_size;
				728
				729	/*
				730	* Hit on a lookahead page without valid readahead state.
				731	* E.g. interleaved reads.
				732	* Not knowing its readahead pos/size, bet on the minimal possible one.
				733	*/
				734	if (page) {
				735	ra_index++;
				736	ra_size = min(4 * ra_size, max);
				737	}
				738
				739	fill_ra:
				740	ra_set_index(ra, offset, ra_index);
				741	ra_set_size(ra, ra_size, la_size);
				742
				743	return ra_submit(ra, mapping, filp);
				744	}
				745
				746	/**
				747	* page_cache_readahead_ondemand - generic file readahead
				748	* @mapping: address_space which holds the pagecache and I/O vectors
				749	* @ra: file_ra_state which holds the readahead state
				750	* @filp: passed on to ->readpage() and ->readpages()
				751	* @page: the page at @offset, or NULL if non-present
				752	* @offset: start offset into @mapping, in PAGE_CACHE_SIZE units
				753	* @req_size: hint: total size of the read which the caller is performing in
				754	* PAGE_CACHE_SIZE units
				755	*
				756	* page_cache_readahead_ondemand() is the entry point of readahead logic.
				757	* This function should be called when it is time to perform readahead:
				758	* 1) @page == NULL
				759	* A cache miss happened, time for synchronous readahead.
				760	* 2) @page != NULL && PageReadahead(@page)
				761	* A look-ahead hit occured, time for asynchronous readahead.
				762	*/
				763	unsigned long
				764	page_cache_readahead_ondemand(struct address_space *mapping,
				765	struct file_ra_state ra, struct file filp,
				766	struct page *page, pgoff_t offset,
				767	unsigned long req_size)
				768	{
				769	/* no read-ahead */
				770	if (!ra->ra_pages)
				771	return 0;
				772
				773	if (page) {
				774	ClearPageReadahead(page);
				775
				776	/*
				777	* Defer asynchronous read-ahead on IO congestion.
				778	*/
				779	if (bdi_read_congested(mapping->backing_dev_info))
				780	return 0;
				781	}
				782
				783	/* do read-ahead */
				784	return ondemand_readahead(mapping, ra, filp, page,
				785	offset, req_size);
				786	}
				787	EXPORT_SYMBOL_GPL(page_cache_readahead_ondemand);