Hibernation: Fix mark_nosave_pages()

There is a problem in the hibernation code that triggers on some NUMA
systems on which pfn_valid() returns 'true' for some PFNs that don't
belong to any zone.  Namely, there is a BUG_ON() in
memory_bm_find_bit() that triggers for PFNs not belonging to any
zone and passing the pfn_valid() test.  On the affected systems it
triggers when we mark PFNs reported by the platform as not saveable,
because the PFNs in question belong to a region mapped directly using
iorepam() (i.e. the ACPI data area) and they pass the pfn_valid()
test.

Modify memory_bm_find_bit() so that it returns an error if given PFN
doesn't belong to any zone instead of crashing the kernel and ignore
the result returned by it in mark_nosave_pages(), while marking the
"nosave" memory regions.

This doesn't affect the hibernation functionality, as we won't touch
the PFNs in question anyway.

http://bugzilla.kernel.org/show_bug.cgi?id=9966 .

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Len Brown <len.brown@intel.com>
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 72a020c..5f91a07 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -447,7 +447,7 @@
  *	of @bm->cur_zone_bm are updated.
  */
 
-static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
+static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
 				void **addr, unsigned int *bit_nr)
 {
 	struct zone_bitmap *zone_bm;
@@ -461,7 +461,8 @@
 		while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
 			zone_bm = zone_bm->next;
 
-			BUG_ON(!zone_bm);
+			if (!zone_bm)
+				return -EFAULT;
 		}
 		bm->cur.zone_bm = zone_bm;
 	}
@@ -479,23 +480,40 @@
 	pfn -= bb->start_pfn;
 	*bit_nr = pfn % BM_BITS_PER_CHUNK;
 	*addr = bb->data + pfn / BM_BITS_PER_CHUNK;
+	return 0;
 }
 
 static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
+	int error;
 
-	memory_bm_find_bit(bm, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	BUG_ON(error);
 	set_bit(bit, addr);
 }
 
+static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
+{
+	void *addr;
+	unsigned int bit;
+	int error;
+
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	if (!error)
+		set_bit(bit, addr);
+	return error;
+}
+
 static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
+	int error;
 
-	memory_bm_find_bit(bm, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	BUG_ON(error);
 	clear_bit(bit, addr);
 }
 
@@ -503,8 +521,10 @@
 {
 	void *addr;
 	unsigned int bit;
+	int error;
 
-	memory_bm_find_bit(bm, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	BUG_ON(error);
 	return test_bit(bit, addr);
 }
 
@@ -709,8 +729,15 @@
 				region->end_pfn << PAGE_SHIFT);
 
 		for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
-			if (pfn_valid(pfn))
-				memory_bm_set_bit(bm, pfn);
+			if (pfn_valid(pfn)) {
+				/*
+				 * It is safe to ignore the result of
+				 * mem_bm_set_bit_check() here, since we won't
+				 * touch the PFNs for which the error is
+				 * returned anyway.
+				 */
+				mem_bm_set_bit_check(bm, pfn);
+			}
 	}
 }