drm/vc4: Add support for async pageflips.

An async pageflip stores the modeset to be done and executes it once
the BOs are ready to be displayed.  This gets us about 3x performance
in full screen rendering with pageflipping.

Signed-off-by: Eric Anholt <eric@anholt.net>
diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c
index cdd8b10..db32c373 100644
--- a/drivers/gpu/drm/vc4/vc4_plane.c
+++ b/drivers/gpu/drm/vc4/vc4_plane.c
@@ -29,6 +29,14 @@
 	u32 *dlist;
 	u32 dlist_size; /* Number of dwords in allocated for the display list */
 	u32 dlist_count; /* Number of used dwords in the display list. */
+
+	/* Offset in the dlist to pointer word 0. */
+	u32 pw0_offset;
+
+	/* Pointer to where the plane's dlist was last stored in the
+	 * hardware at vc4_crtc_atomic_flush() time.
+	 */
+	u32 *hw_dlist;
 };
 
 static inline struct vc4_plane_state *
@@ -197,6 +205,8 @@
 	/* Position Word 3: Context.  Written by the HVS. */
 	vc4_dlist_write(vc4_state, 0xc0c0c0c0);
 
+	vc4_state->pw0_offset = vc4_state->dlist_count;
+
 	/* Pointer Word 0: RGB / Y Pointer */
 	vc4_dlist_write(vc4_state, bo->paddr + offset);
 
@@ -248,6 +258,8 @@
 	struct vc4_plane_state *vc4_state = to_vc4_plane_state(plane->state);
 	int i;
 
+	vc4_state->hw_dlist = dlist;
+
 	/* Can't memcpy_toio() because it needs to be 32-bit writes. */
 	for (i = 0; i < vc4_state->dlist_count; i++)
 		writel(vc4_state->dlist[i], &dlist[i]);
@@ -262,6 +274,34 @@
 	return vc4_state->dlist_count;
 }
 
+/* Updates the plane to immediately (well, once the FIFO needs
+ * refilling) scan out from a new framebuffer, using fb's buffer 0.
+ */
+void vc4_plane_async_set_fb(struct drm_plane *plane, struct drm_framebuffer *fb)
+{
+	struct vc4_plane_state *vc4_state = to_vc4_plane_state(plane->state);
+	struct drm_gem_cma_object *bo = drm_fb_cma_get_gem_obj(fb, 0);
+	uint32_t addr;
+
+	/* We're skipping the address adjustment for negative origin,
+	 * because this is only called on the primary plane.
+	 */
+	WARN_ON_ONCE(plane->state->crtc_x < 0 || plane->state->crtc_y < 0);
+	addr = bo->paddr + fb->offsets[0];
+
+	/* Write the new address into pointer word 0 of the hardware
+	 * dlist immediately.  The scanout will start from this address
+	 * as soon as the FIFO needs to refill with pixels.
+	 */
+	writel(addr, &vc4_state->hw_dlist[vc4_state->pw0_offset]);
+
+	/* Also update the CPU-side dlist copy, so that any later
+	 * atomic updates that don't do a new modeset on our plane
+	 * also use our updated address.
+	 */
+	vc4_state->dlist[vc4_state->pw0_offset] = addr;
+}
+
 static const struct drm_plane_helper_funcs vc4_plane_helper_funcs = {
 	.prepare_fb = NULL,
 	.cleanup_fb = NULL,