/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for
 * more details.
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/io.h>
#include <linux/vmalloc.h>
#include <linux/smp.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/homecache.h>

#define K(x) ((x) << (PAGE_SHIFT-10))
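
/*
 * For illustration (a sketch, not part of the original source): K()
 * converts a page count to kilobytes by shifting.  Assuming 4 kB pages
 * (PAGE_SHIFT == 12), K(x) == x << 2, so K(3) == 12: three pages is 12 kB.
 */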

/*
 * The normal show_free_areas() is too verbose on Tile, with dozens
 * of processors and often four NUMA zones each with high and lowmem.
 */
void show_mem(void)
{
        struct zone *zone;

        pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu"
               " free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu"
               " pagecache:%lu swap:%lu\n",
               (global_page_state(NR_ACTIVE_ANON) +
                global_page_state(NR_ACTIVE_FILE)),
               (global_page_state(NR_INACTIVE_ANON) +
                global_page_state(NR_INACTIVE_FILE)),
               global_page_state(NR_FILE_DIRTY),
               global_page_state(NR_WRITEBACK),
               global_page_state(NR_UNSTABLE_NFS),
               global_page_state(NR_FREE_PAGES),
               (global_page_state(NR_SLAB_RECLAIMABLE) +
                global_page_state(NR_SLAB_UNRECLAIMABLE)),
               global_page_state(NR_FILE_MAPPED),
               global_page_state(NR_PAGETABLE),
               global_page_state(NR_BOUNCE),
               global_page_state(NR_FILE_PAGES),
               nr_swap_pages);

        for_each_zone(zone) {
                unsigned long flags, order, total = 0;
                long largest_order = -1;

                if (!populated_zone(zone))
                        continue;

                spin_lock_irqsave(&zone->lock, flags);
                for (order = 0; order < MAX_ORDER; order++) {
                        int nr = zone->free_area[order].nr_free;
                        total += nr << order;
                        if (nr)
                                largest_order = order;
                }
                spin_unlock_irqrestore(&zone->lock, flags);
                /* largest_order stays -1 if the zone has no free pages. */
                pr_err("Node %d %7s: %lukB (largest %lukB)\n",
                       zone_to_nid(zone), zone->name, K(total),
                       largest_order >= 0 ? K(1UL) << largest_order : 0);
        }
}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = swapper_pg_dir + pgd_index(vaddr);
        if (pgd_none(*pgd)) {
                BUG();
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                BUG();
                return;
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                BUG();
                return;
        }
        pte = pte_offset_kernel(pmd, vaddr);
        /* <pfn,flags> stored as-is, to permit clearing entries */
        set_pte(pte, pfn_pte(pfn, flags));

        /*
         * It's enough to flush this one mapping.
         * This appears conservative since it is only called
         * from __set_fixmap.
         */
        local_flush_tlb_page(NULL, vaddr, PAGE_SIZE);
}

void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                BUG();
                return;
        }
        set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
}
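
/*
 * Illustrative use (a hedged sketch; the index below follows the generic
 * fixmap conventions rather than anything defined in this file): a caller
 * maps a physical address at a compile-time-fixed virtual slot and reads
 * it back:
 *
 *      __set_fixmap(FIX_KMAP_BEGIN, phys_addr, PAGE_KERNEL);
 *      val = *(volatile u32 *)fix_to_virt(FIX_KMAP_BEGIN);
 *
 * Passing a zero pfn/pgprot through the same path is what clears the
 * slot again.
 */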

#if defined(CONFIG_HIGHPTE)
pte_t *_pte_offset_map(pmd_t *dir, unsigned long address)
{
        /* Mask to the offset within the page; "+" binds tighter than "&". */
        pte_t *pte = kmap_atomic(pmd_page(*dir)) +
                ((pmd_ptfn(*dir) << HV_LOG2_PAGE_TABLE_ALIGN) & ~PAGE_MASK);
        return &pte[pte_index(address)];
}
#endif
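
/*
 * Usage sketch (assumed, based on the generic pte_offset_map()/pte_unmap()
 * contract rather than anything in this file): callers bracket access to a
 * highmem page table like
 *
 *      pte_t *pte = pte_offset_map(pmd, addr);
 *      pte_t entry = *pte;
 *      pte_unmap(pte);
 *
 * so the kmap_atomic() taken above is dropped by the kunmap_atomic()
 * hidden inside pte_unmap().
 */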

/*
 * List of all pgd's needed so it can invalidate entries in both cached
 * and uncached pgd's. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * The locking scheme was chosen on the basis of manfred's
 * recommendations and having no core impact whatsoever.
 * -- wli
 */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);

static inline void pgd_list_add(pgd_t *pgd)
{
        list_add(pgd_to_list(pgd), &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
        list_del(pgd_to_list(pgd));
}

#define KERNEL_PGD_INDEX_START pgd_index(PAGE_OFFSET)
#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_INDEX_START)
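
/*
 * Worked example (hypothetical numbers, for illustration only): with
 * PAGE_OFFSET == 0xC0000000 and PTRS_PER_PGD == 1024 covering 4 GB, each
 * pgd entry spans 4 MB, so KERNEL_PGD_INDEX_START == 0xC0000000 / 4 MB
 * == 768 and KERNEL_PGD_PTRS == 256: the top quarter of every pgd is
 * kernel space, cloned from swapper_pg_dir by pgd_ctor() below.
 */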

static void pgd_ctor(pgd_t *pgd)
{
        unsigned long flags;

        memset(pgd, 0, KERNEL_PGD_INDEX_START*sizeof(pgd_t));
        spin_lock_irqsave(&pgd_lock, flags);

#ifndef __tilegx__
        /*
         * Check that the user interrupt vector has no L2.
         * It never should for the swapper, and new page tables
         * should always start with an empty user interrupt vector.
         */
        BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);
#endif

        clone_pgd_range(pgd + KERNEL_PGD_INDEX_START,
                        swapper_pg_dir + KERNEL_PGD_INDEX_START,
                        KERNEL_PGD_PTRS);

        pgd_list_add(pgd);
        spin_unlock_irqrestore(&pgd_lock, flags);
}

static void pgd_dtor(pgd_t *pgd)
{
        unsigned long flags; /* can be called from interrupt context */

        spin_lock_irqsave(&pgd_lock, flags);
        pgd_list_del(pgd);
        spin_unlock_irqrestore(&pgd_lock, flags);
}

pgd_t *pgd_alloc(struct mm_struct *mm)
{
        pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
        if (pgd)
                pgd_ctor(pgd);
        return pgd;
}

void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        pgd_dtor(pgd);
        kmem_cache_free(pgd_cache, pgd);
}


#define L2_USER_PGTABLE_PAGES (1 << L2_USER_PGTABLE_ORDER)
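
/*
 * For example (assuming L2_USER_PGTABLE_ORDER == 1, a hypothetical value
 * here): each user L2 page table occupies 1 << 1 == 2 contiguous pages,
 * which is why the alloc/free paths below pass L2_USER_PGTABLE_ORDER
 * rather than using single-page helpers.
 */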

struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
        gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO|__GFP_COMP;
        struct page *p;

#ifdef CONFIG_HIGHPTE
        flags |= __GFP_HIGHMEM;
#endif

        p = alloc_pages(flags, L2_USER_PGTABLE_ORDER);
        if (p == NULL)
                return NULL;

        pgtable_page_ctor(p);
        return p;
}

/*
 * Free page immediately (used in __pte_alloc if we raced with another
 * process).  We have to correct whatever pte_alloc_one() did before
 * returning the pages to the allocator.
 */
void pte_free(struct mm_struct *mm, struct page *p)
{
        pgtable_page_dtor(p);
        __free_pages(p, L2_USER_PGTABLE_ORDER);
}

void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
                    unsigned long address)
{
        int i;

        pgtable_page_dtor(pte);
        for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i)
                tlb_remove_page(tlb, pte + i);
}

#ifndef __tilegx__

/*
 * FIXME: needs to be atomic vs hypervisor writes.  For now we make the
 * window of vulnerability a bit smaller by doing an unlocked 8-bit update.
 */
int ptep_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *ptep)
{
#if HV_PTE_INDEX_ACCESSED < 8 || HV_PTE_INDEX_ACCESSED >= 16
# error Code assumes HV_PTE "accessed" bit in second byte
#endif
        u8 *tmp = (u8 *)ptep;
        u8 second_byte = tmp[1];
        if (!(second_byte & (1 << (HV_PTE_INDEX_ACCESSED - 8))))
                return 0;
        tmp[1] = second_byte & ~(1 << (HV_PTE_INDEX_ACCESSED - 8));
        return 1;
}
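
/*
 * Bit arithmetic, spelled out (assuming HV_PTE_INDEX_ACCESSED == 8, a
 * hypothetical value that satisfies the #error check above): the accessed
 * bit then sits at bit 8 - 8 == 0 of byte 1 of the PTE, so the mask is
 * 1 << 0 == 0x01 and the read-modify-write above touches exactly one
 * byte, shrinking (but not closing) the race window the FIXME describes.
 */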

/*
 * This implementation is atomic vs hypervisor writes, since the hypervisor
 * always writes the low word (where "accessed" and "dirty" are) and this
 * routine only writes the high word.
 */
void ptep_set_wrprotect(struct mm_struct *mm,
                        unsigned long addr, pte_t *ptep)
{
#if HV_PTE_INDEX_WRITABLE < 32
# error Code assumes HV_PTE "writable" bit in high word
#endif
        u32 *tmp = (u32 *)ptep;
        tmp[1] = tmp[1] & ~(1 << (HV_PTE_INDEX_WRITABLE - 32));
}

#endif

pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;

        if (pgd_addr_invalid(addr))
                return NULL;

        pgd = mm ? pgd_offset(mm, addr) : swapper_pg_dir + pgd_index(addr);
        pud = pud_offset(pgd, addr);
        if (!pud_present(*pud))
                return NULL;
        pmd = pmd_offset(pud, addr);
        if (pmd_huge_page(*pmd))
                return (pte_t *)pmd;
        if (!pmd_present(*pmd))
                return NULL;
        return pte_offset_kernel(pmd, addr);
}
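
/*
 * Usage sketch (illustrative only): passing a NULL mm walks the kernel's
 * swapper_pg_dir, so a caller can inspect the mapping of a kernel virtual
 * address like
 *
 *      pte_t *ptep = virt_to_pte(NULL, (unsigned long)addr);
 *      if (ptep && pte_present(*ptep))
 *              pfn = pte_pfn(*ptep);
 *
 * Note that the huge-page case returns the pmd entry itself, cast to
 * pte_t *.
 */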

pgprot_t set_remote_cache_cpu(pgprot_t prot, int cpu)
{
        unsigned int width = smp_width;
        int x = cpu % width;
        int y = cpu / width;
        BUG_ON(y >= smp_height);
        BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);
        BUG_ON(cpu < 0 || cpu >= NR_CPUS);
        BUG_ON(!cpu_is_valid_lotar(cpu));
        return hv_pte_set_lotar(prot, HV_XY_TO_LOTAR(x, y));
}
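
/*
 * Worked example (hypothetical mesh size): on an 8x8 grid (smp_width == 8),
 * cpu 21 sits at x == 21 % 8 == 5, y == 21 / 8 == 2, and
 * get_remote_cache_cpu() below inverts this as 5 + 2 * 8 == 21.
 */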

int get_remote_cache_cpu(pgprot_t prot)
{
        HV_LOTAR lotar = hv_pte_get_lotar(prot);
        int x = HV_LOTAR_X(lotar);
        int y = HV_LOTAR_Y(lotar);
        BUG_ON(hv_pte_get_mode(prot) != HV_PTE_MODE_CACHE_TILE_L3);
        return x + y * smp_width;
}

void set_pte_order(pte_t *ptep, pte_t pte, int order)
{
        unsigned long pfn = pte_pfn(pte);
        struct page *page = pfn_to_page(pfn);

        /* Update the home of a PTE if necessary */
        pte = pte_set_home(pte, page_home(page));

#ifdef __tilegx__
        *ptep = pte;
#else
        /*
         * When setting a PTE, write the high bits first, then write
         * the low bits.  This sets the "present" bit only after the
         * other bits are in place.  If a particular PTE update
         * involves transitioning from one valid PTE to another, it
         * may be necessary to call set_pte_order() more than once,
         * transitioning via a suitable intermediate state.
         * Note that this sequence also means that if we are transitioning
         * from any migrating PTE to a non-migrating one, we will not
         * see a half-updated PTE with the migrating bit off.
         */
#if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
# error Must write the present and migrating bits last
#endif
        ((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
        barrier();
        ((u32 *)ptep)[0] = (u32)(pte_val(pte));
#endif
}

/* Can this mm load a PTE with cached_priority set? */
static inline int mm_is_priority_cached(struct mm_struct *mm)
{
        return mm->context.priority_cached;
}

/*
 * Add a priority mapping to an mm_context and
 * notify the hypervisor if this is the first one.
 */
void start_mm_caching(struct mm_struct *mm)
{
        if (!mm_is_priority_cached(mm)) {
                mm->context.priority_cached = -1U;
                hv_set_caching(-1U);
        }
}

/*
 * Validate and return the priority_cached flag.  We know if it's zero
 * that we don't need to scan, since we immediately set it non-zero
 * when we first consider a MAP_CACHE_PRIORITY mapping.
 *
 * We only _try_ to acquire mmap_sem; if we can't, then since we're in
 * an interrupt context (servicing switch_mm) we don't worry about it
 * and don't unset the "priority_cached" field.  Presumably we'll come
 * back later with more luck and clear the value then; for now we just
 * keep the cache marked for priority.
 */
static unsigned int update_priority_cached(struct mm_struct *mm)
{
        if (mm->context.priority_cached && down_write_trylock(&mm->mmap_sem)) {
                struct vm_area_struct *vm;
                for (vm = mm->mmap; vm; vm = vm->vm_next) {
                        if (hv_pte_get_cached_priority(vm->vm_page_prot))
                                break;
                }
                if (vm == NULL)
                        mm->context.priority_cached = 0;
                up_write(&mm->mmap_sem);
        }
        return mm->context.priority_cached;
}

/* Set caching correctly for an mm that we are switching to. */
void check_mm_caching(struct mm_struct *prev, struct mm_struct *next)
{
        if (!mm_is_priority_cached(next)) {
                /*
                 * If the new mm doesn't use priority caching, just see if we
                 * need the hv_set_caching(), or can assume it's already zero.
                 */
                if (mm_is_priority_cached(prev))
                        hv_set_caching(0);
        } else {
                hv_set_caching(update_priority_cached(next));
        }
}

#if CHIP_HAS_MMIO()

/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
                           pgprot_t home)
{
        void *addr;
        struct vm_struct *area;
        unsigned long offset, last_addr;
        pgprot_t pgprot;

        /* Don't allow wraparound or zero size */
        last_addr = phys_addr + size - 1;
        if (!size || last_addr < phys_addr)
                return NULL;

        /* Create a read/write, MMIO VA mapping homed at the requested shim. */
        pgprot = PAGE_KERNEL;
        pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
        pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));

        /* Mappings have to be page-aligned. */
        offset = phys_addr & ~PAGE_MASK;
        phys_addr &= PAGE_MASK;
        size = PAGE_ALIGN(last_addr + 1) - phys_addr;

        /* Ok, go for it. */
        area = get_vm_area(size, VM_IOREMAP /* | other flags? */);
        if (!area)
                return NULL;
        area->phys_addr = phys_addr;
        addr = area->addr;
        if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
                               phys_addr, pgprot)) {
                remove_vm_area((void *)(PAGE_MASK & (unsigned long)addr));
                return NULL;
        }
        return (__force void __iomem *) (offset + (char *)addr);
}
EXPORT_SYMBOL(ioremap_prot);
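
/*
 * Usage sketch (hedged; the base, size, and register offset below are made
 * up for illustration): a driver maps a device's MMIO window, accesses it
 * through the usual accessors, and unmaps when done:
 *
 *      void __iomem *regs = ioremap_prot(dev_base, dev_size, home_prot);
 *      if (regs) {
 *              u32 status = readl(regs + 0x10);
 *              iounmap(regs);
 *      }
 *
 * Note that ioremap_prot() returns the mapped VA plus the sub-page offset
 * of phys_addr, so callers need not page-align their base address.
 */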

/* Map a PCI MMIO bus address into VA space. */
void __iomem *ioremap(resource_size_t phys_addr, unsigned long size)
{
        panic("ioremap for PCI MMIO is not supported");
}
EXPORT_SYMBOL(ioremap);

/* Unmap an MMIO VA mapping. */
void iounmap(volatile void __iomem *addr_in)
{
        volatile void __iomem *addr = (volatile void __iomem *)
                (PAGE_MASK & (unsigned long __force)addr_in);
#if 1
        vunmap((void * __force)addr);
#else
        /*
         * x86 uses this complicated flow instead of vunmap().  Is
         * there any particular reason we should do the same?
         */
        struct vm_struct *p, *o;

        /*
         * Use the vm area unlocked, assuming the caller ensures there
         * isn't another iounmap for the same address in parallel.
         * Reuse of the virtual address is prevented by leaving it in
         * the global lists until we're done with it.  cpa takes care
         * of the direct mappings.
         */
        read_lock(&vmlist_lock);
        for (p = vmlist; p; p = p->next) {
                if (p->addr == addr)
                        break;
        }
        read_unlock(&vmlist_lock);

        if (!p) {
                pr_err("iounmap: bad address %p\n", addr);
                dump_stack();
                return;
        }

        /* Finally remove it */
        o = remove_vm_area((void *)addr);
        BUG_ON(p != o || o == NULL);
        kfree(p);
#endif
}
EXPORT_SYMBOL(iounmap);

#endif /* CHIP_HAS_MMIO() */