parisc: Fix some PTE/TLB race conditions and optimize __flush_tlb_range based on timing results
The increased use of pdtlb/pitlb instructions seemed to increase the
frequency of random segmentation faults building packages. Further, we
had a number of cases where TLB inserts would repeatedly fail and all
forward progress would stop. The Haskell ghc package caused a lot of
trouble in this area. The final indication of a race in pte handling was
this syslog entry on sibaris (C8000):
swap_free: Unused swap offset entry 00000004
BUG: Bad page map in process mysqld pte:00000100 pmd:019bbec5
addr:00000000ec464000 vm_flags:00100073 anon_vma:0000000221023828 mapping: (null) index:ec464
CPU: 1 PID: 9176 Comm: mysqld Not tainted 4.0.0-2-parisc64-smp #1 Debian 4.0.5-1
Backtrace:
[<0000000040173eb0>] show_stack+0x20/0x38
[<0000000040444424>] dump_stack+0x9c/0x110
[<00000000402a0d38>] print_bad_pte+0x1a8/0x278
[<00000000402a28b8>] unmap_single_vma+0x3d8/0x770
[<00000000402a4090>] zap_page_range+0xf0/0x198
[<00000000402ba2a4>] SyS_madvise+0x404/0x8c0
Note that the pte value is 0 except for the accessed bit 0x100. This bit
shouldn't be set without the present bit.
It should be noted that the madvise system call is probably a trigger for many
of the random segmentation faults.
In looking at the kernel code, I found the following problems:
1) The pte_clear define didn't take TLB lock when clearing a pte.
2) We didn't test pte present bit inside lock in exception support.
3) The pte and tlb locks needed to merged in order to ensure consistency
between page table and TLB. This also has the effect of serializing TLB
broadcasts on SMP systems.
The attached change implements the above and a few other tweaks to try
to improve performance. Based on the timing code, TLB purges are very
slow (e.g., ~ 209 cycles per page on rp3440). Thus, I think it
beneficial to test the split_tlb variable to avoid duplicate purges.
Probably, all PA 2.0 machines have combined TLBs.
I dropped using __flush_tlb_range in flush_tlb_mm as I realized all
applications and most threads have a stack size that is too large to
make this useful. I added some comments to this effect.
Since implementing 1 through 3, I haven't had any random segmentation
faults on mx3210 (rp3440) in about one week of building code and running
as a Debian buildd.
Signed-off-by: John David Anglin <dave.anglin@bell.net>
Cc: stable@vger.kernel.org # v3.18+
Signed-off-by: Helge Deller <deller@gmx.de>
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index 0a18375..f93c4a4 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -16,7 +16,7 @@
#include <asm/processor.h>
#include <asm/cache.h>
-extern spinlock_t pa_dbit_lock;
+extern spinlock_t pa_tlb_lock;
/*
* kern_addr_valid(ADDR) tests if ADDR is pointing to valid kernel
@@ -33,6 +33,19 @@
*/
#define kern_addr_valid(addr) (1)
+/* Purge data and instruction TLB entries. Must be called holding
+ * the pa_tlb_lock. The TLB purge instructions are slow on SMP
+ * machines since the purge must be broadcast to all CPUs.
+ */
+
+static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr)
+{
+ mtsp(mm->context, 1);
+ pdtlb(addr);
+ if (unlikely(split_tlb))
+ pitlb(addr);
+}
+
/* Certain architectures need to do special things when PTEs
* within a page table are directly modified. Thus, the following
* hook is made available.
@@ -42,15 +55,20 @@
*(pteptr) = (pteval); \
} while(0)
-extern void purge_tlb_entries(struct mm_struct *, unsigned long);
+#define pte_inserted(x) \
+ ((pte_val(x) & (_PAGE_PRESENT|_PAGE_ACCESSED)) \
+ == (_PAGE_PRESENT|_PAGE_ACCESSED))
-#define set_pte_at(mm, addr, ptep, pteval) \
- do { \
+#define set_pte_at(mm, addr, ptep, pteval) \
+ do { \
+ pte_t old_pte; \
unsigned long flags; \
- spin_lock_irqsave(&pa_dbit_lock, flags); \
- set_pte(ptep, pteval); \
- purge_tlb_entries(mm, addr); \
- spin_unlock_irqrestore(&pa_dbit_lock, flags); \
+ spin_lock_irqsave(&pa_tlb_lock, flags); \
+ old_pte = *ptep; \
+ set_pte(ptep, pteval); \
+ if (pte_inserted(old_pte)) \
+ purge_tlb_entries(mm, addr); \
+ spin_unlock_irqrestore(&pa_tlb_lock, flags); \
} while (0)
#endif /* !__ASSEMBLY__ */
@@ -268,7 +286,7 @@
#define pte_none(x) (pte_val(x) == 0)
#define pte_present(x) (pte_val(x) & _PAGE_PRESENT)
-#define pte_clear(mm,addr,xp) do { pte_val(*(xp)) = 0; } while (0)
+#define pte_clear(mm, addr, xp) set_pte_at(mm, addr, xp, __pte(0))
#define pmd_flag(x) (pmd_val(x) & PxD_FLAG_MASK)
#define pmd_address(x) ((unsigned long)(pmd_val(x) &~ PxD_FLAG_MASK) << PxD_VALUE_SHIFT)
@@ -435,15 +453,15 @@
if (!pte_young(*ptep))
return 0;
- spin_lock_irqsave(&pa_dbit_lock, flags);
+ spin_lock_irqsave(&pa_tlb_lock, flags);
pte = *ptep;
if (!pte_young(pte)) {
- spin_unlock_irqrestore(&pa_dbit_lock, flags);
+ spin_unlock_irqrestore(&pa_tlb_lock, flags);
return 0;
}
set_pte(ptep, pte_mkold(pte));
purge_tlb_entries(vma->vm_mm, addr);
- spin_unlock_irqrestore(&pa_dbit_lock, flags);
+ spin_unlock_irqrestore(&pa_tlb_lock, flags);
return 1;
}
@@ -453,11 +471,12 @@
pte_t old_pte;
unsigned long flags;
- spin_lock_irqsave(&pa_dbit_lock, flags);
+ spin_lock_irqsave(&pa_tlb_lock, flags);
old_pte = *ptep;
- pte_clear(mm,addr,ptep);
- purge_tlb_entries(mm, addr);
- spin_unlock_irqrestore(&pa_dbit_lock, flags);
+ set_pte(ptep, __pte(0));
+ if (pte_inserted(old_pte))
+ purge_tlb_entries(mm, addr);
+ spin_unlock_irqrestore(&pa_tlb_lock, flags);
return old_pte;
}
@@ -465,10 +484,10 @@
static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
unsigned long flags;
- spin_lock_irqsave(&pa_dbit_lock, flags);
+ spin_lock_irqsave(&pa_tlb_lock, flags);
set_pte(ptep, pte_wrprotect(*ptep));
purge_tlb_entries(mm, addr);
- spin_unlock_irqrestore(&pa_dbit_lock, flags);
+ spin_unlock_irqrestore(&pa_tlb_lock, flags);
}
#define pte_same(A,B) (pte_val(A) == pte_val(B))