iommu/arm-smmu: don't bother truncating the s1 output size to VA_BITS

In order for nested translation to work correctly, we need to ensure
that the maximum output address size from stage-1 is <= the maximum
supported input address size to stage-2. The latter is currently defined
by VA_BITS, since we make use of the CPU page table functions for
allocating out tables and so the driver currently enforces this
restriction by truncating the stage-1 output size during probe.

In reality, this doesn't make a lot of sense; the guest OS is responsible
for managing the stage-1 page tables, so we actually just need to ensure
that the ID registers of the virtual SMMU interface only advertise the
supported stage-2 input size.

This patch fixes the problem by treating the stage-1 and stage-2 input
address sizes separately.

Reported-by: Tirumalesh Chalamarla <tchalamarla@cavium.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index 482cc9e..27fd3d7 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -24,7 +24,7 @@
  *	- v7/v8 long-descriptor format
  *	- Non-secure access to the SMMU
  *	- 4k and 64k pages, with contiguous pte hints.
- *	- Up to 42-bit addressing (dependent on VA_BITS)
+ *	- Up to 48-bit addressing (dependent on VA_BITS)
  *	- Context fault reporting
  */
 
@@ -375,8 +375,9 @@
 	u32				num_mapping_groups;
 	DECLARE_BITMAP(smr_map, ARM_SMMU_MAX_SMRS);
 
-	unsigned long			input_size;
+	unsigned long			s1_input_size;
 	unsigned long			s1_output_size;
+	unsigned long			s2_input_size;
 	unsigned long			s2_output_size;
 
 	u32				num_global_irqs;
@@ -762,7 +763,7 @@
 			       gr1_base + ARM_SMMU_GR1_CBA2R(cfg->cbndx));
 
 		/* TTBCR2 */
-		switch (smmu->input_size) {
+		switch (smmu->s1_input_size) {
 		case 32:
 			reg = (TTBCR2_ADDR_32 << TTBCR2_SEP_SHIFT);
 			break;
@@ -831,7 +832,7 @@
 			reg = TTBCR_TG0_64K;
 
 		if (!stage1) {
-			reg |= (64 - smmu->s1_output_size) << TTBCR_T0SZ_SHIFT;
+			reg |= (64 - smmu->s2_input_size) << TTBCR_T0SZ_SHIFT;
 
 			switch (smmu->s2_output_size) {
 			case 32:
@@ -854,7 +855,7 @@
 				break;
 			}
 		} else {
-			reg |= (64 - smmu->input_size) << TTBCR_T0SZ_SHIFT;
+			reg |= (64 - smmu->s1_input_size) << TTBCR_T0SZ_SHIFT;
 		}
 	} else {
 		reg = 0;
@@ -1454,9 +1455,11 @@
 
 	if (cfg->cbar == CBAR_TYPE_S2_TRANS) {
 		stage = 2;
+		input_mask = (1ULL << smmu->s2_input_size) - 1;
 		output_mask = (1ULL << smmu->s2_output_size) - 1;
 	} else {
 		stage = 1;
+		input_mask = (1ULL << smmu->s1_input_size) - 1;
 		output_mask = (1ULL << smmu->s1_output_size) - 1;
 	}
 
@@ -1466,7 +1469,6 @@
 	if (size & ~PAGE_MASK)
 		return -EINVAL;
 
-	input_mask = (1ULL << smmu->input_size) - 1;
 	if ((phys_addr_t)iova & ~input_mask)
 		return -ERANGE;
 
@@ -1838,28 +1840,21 @@
 	/* ID2 */
 	id = readl_relaxed(gr0_base + ARM_SMMU_GR0_ID2);
 	size = arm_smmu_id_size_to_bits((id >> ID2_IAS_SHIFT) & ID2_IAS_MASK);
+	smmu->s1_output_size = min_t(unsigned long, PHYS_MASK_SHIFT, size);
 
-	/*
-	 * Stage-1 output limited by stage-2 input size due to pgd
-	 * allocation (PTRS_PER_PGD).
-	 */
-	if (smmu->features & ARM_SMMU_FEAT_TRANS_NESTED) {
+	/* Stage-2 input size limited due to pgd allocation (PTRS_PER_PGD) */
 #ifdef CONFIG_64BIT
-		smmu->s1_output_size = min_t(unsigned long, VA_BITS, size);
+	smmu->s2_input_size = min_t(unsigned long, VA_BITS, size);
 #else
-		smmu->s1_output_size = min(32UL, size);
+	smmu->s2_input_size = min(32UL, size);
 #endif
-	} else {
-		smmu->s1_output_size = min_t(unsigned long, PHYS_MASK_SHIFT,
-					     size);
-	}
 
 	/* The stage-2 output mask is also applied for bypass */
 	size = arm_smmu_id_size_to_bits((id >> ID2_OAS_SHIFT) & ID2_OAS_MASK);
 	smmu->s2_output_size = min_t(unsigned long, PHYS_MASK_SHIFT, size);
 
 	if (smmu->version == 1) {
-		smmu->input_size = 32;
+		smmu->s1_input_size = 32;
 	} else {
 #ifdef CONFIG_64BIT
 		size = (id >> ID2_UBS_SHIFT) & ID2_UBS_MASK;
@@ -1867,7 +1862,7 @@
 #else
 		size = 32;
 #endif
-		smmu->input_size = size;
+		smmu->s1_input_size = size;
 
 		if ((PAGE_SIZE == SZ_4K && !(id & ID2_PTFS_4K)) ||
 		    (PAGE_SIZE == SZ_64K && !(id & ID2_PTFS_64K)) ||
@@ -1878,10 +1873,14 @@
 		}
 	}
 
-	dev_notice(smmu->dev,
-		   "\t%lu-bit VA, %lu-bit IPA, %lu-bit PA\n",
-		   smmu->input_size, smmu->s1_output_size,
-		   smmu->s2_output_size);
+	if (smmu->features & ARM_SMMU_FEAT_TRANS_S1)
+		dev_notice(smmu->dev, "\tStage-1: %lu-bit VA -> %lu-bit IPA\n",
+			   smmu->s1_input_size, smmu->s1_output_size);
+
+	if (smmu->features & ARM_SMMU_FEAT_TRANS_S2)
+		dev_notice(smmu->dev, "\tStage-2: %lu-bit IPA -> %lu-bit PA\n",
+			   smmu->s2_input_size, smmu->s2_output_size);
+
 	return 0;
 }