csky: Add flush_icache_mm to defer flush icache all

Some CPUs don't support the icache.va instruction for maintaining the
icache of all SMP cores. Using icache.all + IPI costs a lot of
performance, and using a deferred mechanism reduces the number of calls
to the icache_flush_all functions.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h
index a737027..d3e0420 100644
--- a/arch/csky/abiv1/inc/abi/cacheflush.h
+++ b/arch/csky/abiv1/inc/abi/cacheflush.h
@@ -48,6 +48,8 @@ extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, u
 
 #define flush_icache_page(vma, page)		do {} while (0);
 #define flush_icache_range(start, end)		cache_wbinv_range(start, end)
+#define flush_icache_mm_range(mm, start, end)	cache_wbinv_range(start, end)
+#define flush_icache_deferred(mm)		do {} while (0) /* abiv1 never defers */
 
 #define copy_from_user_page(vma, page, vaddr, dst, src, len) \
 do { \
diff --git a/arch/csky/abiv2/cacheflush.c b/arch/csky/abiv2/cacheflush.c
index ba46995..790f1eb 100644
--- a/arch/csky/abiv2/cacheflush.c
+++ b/arch/csky/abiv2/cacheflush.c
@@ -28,3 +28,58 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 
 	kunmap_atomic((void *) addr);
 }
+
+void flush_icache_deferred(struct mm_struct *mm)
+{
+	unsigned int cpu = smp_processor_id();
+	cpumask_t *mask = &mm->context.icache_stale_mask;
+
+	if (cpumask_test_cpu(cpu, mask)) {
+		cpumask_clear_cpu(cpu, mask);
+		/*
+		 * Ensure the remote hart's writes are visible to this hart.
+		 * Pairs with flush_icache_mm_range(); TODO confirm its barrier.
+		 */
+		smp_mb();
+		local_icache_inv_all(NULL);
+	}
+}
+
+void flush_icache_mm_range(struct mm_struct *mm,
+		unsigned long start, unsigned long end)
+{
+	unsigned int cpu;
+	cpumask_t others, *mask;
+
+	preempt_disable();
+
+#ifdef CONFIG_CPU_HAS_ICACHE_INS
+	if (mm == current->mm) {
+		icache_inv_range(start, end);
+		preempt_enable();
+		return;
+	}
+#endif /* CONFIG_CPU_HAS_ICACHE_INS */
+
+	/* Mark every hart's icache as needing a flush for this MM. */
+	mask = &mm->context.icache_stale_mask;
+	cpumask_setall(mask);
+
+	/* Flush this hart's I$ now, and mark it as flushed. */
+	cpu = smp_processor_id();
+	cpumask_clear_cpu(cpu, mask);
+	local_icache_inv_all(NULL);
+
+	/*
+	 * Invalidate the I$ of the other harts in mm_cpumask(mm) right away,
+	 * then clear the whole stale mask (not only the harts just flushed).
+	 */
+	cpumask_andnot(&others, mm_cpumask(mm), cpumask_of(cpu));
+
+	if (mm != current->active_mm || !cpumask_empty(&others)) {
+		on_each_cpu_mask(&others, local_icache_inv_all, NULL, 1);
+		cpumask_clear(mask);
+	}
+
+	preempt_enable();
+}
diff --git a/arch/csky/abiv2/inc/abi/cacheflush.h b/arch/csky/abiv2/inc/abi/cacheflush.h
index 28b7c32..a565e00 100644
--- a/arch/csky/abiv2/inc/abi/cacheflush.h
+++ b/arch/csky/abiv2/inc/abi/cacheflush.h
@@ -31,15 +31,23 @@ static inline void flush_dcache_page(struct page *page)
 
 #define flush_icache_range(start, end)		cache_wbinv_range(start, end)
 
+void flush_icache_mm_range(struct mm_struct *mm,
+			unsigned long start, unsigned long end);
+void flush_icache_deferred(struct mm_struct *mm);
+
 #define flush_cache_vmap(start, end)		do { } while (0)
 #define flush_cache_vunmap(start, end)		do { } while (0)
 
 #define copy_to_user_page(vma, page, vaddr, dst, src, len) \
 do { \
 	memcpy(dst, src, len); \
-	if (vma->vm_flags & VM_EXEC) \
-		cache_wbinv_range((unsigned long)dst, \
-				  (unsigned long)dst + len); \
+	if ((vma)->vm_flags & VM_EXEC) { /* D$ writeback, then I$ flush */ \
+		dcache_wb_range((unsigned long)(dst), \
+				(unsigned long)(dst) + (len)); \
+		flush_icache_mm_range(current->mm, \
+				(unsigned long)(dst), \
+				(unsigned long)(dst) + (len)); \
+	} \
 } while (0)
 #define copy_from_user_page(vma, page, vaddr, dst, src, len) \
 	memcpy(dst, src, len)
diff --git a/arch/csky/include/asm/cacheflush.h b/arch/csky/include/asm/cacheflush.h
index a96da67..f0b8f25 100644
--- a/arch/csky/include/asm/cacheflush.h
+++ b/arch/csky/include/asm/cacheflush.h
@@ -4,6 +4,7 @@
 #ifndef __ASM_CSKY_CACHEFLUSH_H
 #define __ASM_CSKY_CACHEFLUSH_H
 
+#include <linux/mm.h>
 #include <abi/cacheflush.h>
 
 #endif /* __ASM_CSKY_CACHEFLUSH_H */
diff --git a/arch/csky/include/asm/mmu.h b/arch/csky/include/asm/mmu.h
index b382a14..26fbb1d 100644
--- a/arch/csky/include/asm/mmu.h
+++ b/arch/csky/include/asm/mmu.h
@@ -7,6 +7,7 @@
 typedef struct {
 	atomic64_t	asid;
 	void *vdso;
+	cpumask_t	icache_stale_mask; /* CPUs owing an I$ flush for this mm */
 } mm_context_t;
 
 #endif /* __ASM_CSKY_MMU_H */
diff --git a/arch/csky/include/asm/mmu_context.h b/arch/csky/include/asm/mmu_context.h
index 0285b0a..abdf1f1 100644
--- a/arch/csky/include/asm/mmu_context.h
+++ b/arch/csky/include/asm/mmu_context.h
@@ -43,5 +43,7 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
 
 	TLBMISS_HANDLER_SETUP_PGD(next->pgd);
 	write_mmu_entryhi(next->context.asid.counter);
+
+	flush_icache_deferred(next);
 }
 #endif /* __ASM_CSKY_MMU_CONTEXT_H */
diff --git a/arch/csky/mm/syscache.c b/arch/csky/mm/syscache.c
index c4645e4..ffade2f 100644
--- a/arch/csky/mm/syscache.c
+++ b/arch/csky/mm/syscache.c
@@ -3,7 +3,7 @@
 
 #include <linux/syscalls.h>
 #include <asm/page.h>
-#include <asm/cache.h>
+#include <asm/cacheflush.h>
 #include <asm/cachectl.h>
 
 SYSCALL_DEFINE3(cacheflush,
@@ -13,17 +13,14 @@ SYSCALL_DEFINE3(cacheflush,
 {
 	switch (cache) {
 	case ICACHE:
-		icache_inv_range((unsigned long)addr,
-				 (unsigned long)addr + bytes);
-		break;
+	case BCACHE:
+		flush_icache_mm_range(current->mm,
+				(unsigned long)addr,
+				(unsigned long)addr + bytes);
 	case DCACHE:
 		dcache_wb_range((unsigned long)addr,
 				(unsigned long)addr + bytes);
 		break;
-	case BCACHE:
-		cache_wbinv_range((unsigned long)addr,
-				  (unsigned long)addr + bytes);
-		break;
 	default:
 		return -EINVAL;
 	}