userfaultfd: change the read API to return a uffd_msg
I had requests to return the full address (not the page aligned one) to
userland.
It's not entirely clear how the page offset could be relevant because
userfaults aren't like SIGBUS, which can sigjump to a different place and
actually skip resolving the fault depending on a page offset. There's
currently no real way to skip the fault, especially because after a
UFFDIO_COPY|ZEROPAGE the fault is optimized to be retried within the
kernel without having to return to userland first (not even self-modifying
code replacing the .text that touched the faulting address would prevent
the fault from being repeated). Userland cannot skip repeating the fault,
even more so if the fault was triggered by a KVM secondary page fault or any
get_user_pages or any copy-user inside some syscall which will return to
kernel code. The second time FAULT_FLAG_RETRY_NOWAIT won't be set, leading
to a SIGBUS being raised because the userfault can't wait if it cannot
release the mmap_sem first (and FAULT_FLAG_RETRY_NOWAIT is required for
that).
Still, returning a proper structure to userland during the read() on the
uffd allows the current UFFD_API to be used for the future non-cooperative
extensions too, and it looks cleaner as well. Once we get additional
fields there's no point in returning the fault address page-aligned anymore
in order to reuse the bits below PAGE_SHIFT.
The only downside is that the read() syscall will read 32 bytes instead of
8 bytes, but that's not going to be measurable overhead.
The total number of new events that can be added, or of new future bits
for already shipped events, is limited to 64 by the features field of the
uffdio_api structure. If more are needed, a bump of UFFD_API will be
required.
[akpm@linux-foundation.org: use __packed]
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Sanidhya Kashyap <sanidhya.gatech@gmail.com>
Cc: zhang.zhanghailiang@huawei.com
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Feiner <pfeiner@google.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Huangpeng (Peter)" <peter.huangpeng@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 0756d97..1f2ddaa 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -50,7 +50,7 @@
};
struct userfaultfd_wait_queue {
- unsigned long address;
+ struct uffd_msg msg;
wait_queue_t wq;
bool pending;
struct userfaultfd_ctx *ctx;
@@ -77,7 +77,8 @@
/* len == 0 means wake all */
start = range->start;
len = range->len;
- if (len && (start > uwq->address || start + len <= uwq->address))
+ if (len && (start > uwq->msg.arg.pagefault.address ||
+ start + len <= uwq->msg.arg.pagefault.address))
goto out;
ret = wake_up_state(wq->private, mode);
if (ret)
@@ -135,28 +136,43 @@
}
}
-static inline unsigned long userfault_address(unsigned long address,
- unsigned int flags,
- unsigned long reason)
+static inline void msg_init(struct uffd_msg *msg)
{
- BUILD_BUG_ON(PAGE_SHIFT < UFFD_BITS);
- address &= PAGE_MASK;
+ BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
+ /*
+ * Must use memset to zero out the paddings or kernel data is
+ * leaked to userland.
+ */
+ memset(msg, 0, sizeof(struct uffd_msg));
+}
+
+static inline struct uffd_msg userfault_msg(unsigned long address,
+ unsigned int flags,
+ unsigned long reason)
+{
+ struct uffd_msg msg;
+ msg_init(&msg);
+ msg.event = UFFD_EVENT_PAGEFAULT;
+ msg.arg.pagefault.address = address;
if (flags & FAULT_FLAG_WRITE)
/*
- * Encode "write" fault information in the LSB of the
- * address read by userland, without depending on
- * FAULT_FLAG_WRITE kernel internal value.
+ * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the
+ * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
+ * was not set in a UFFD_EVENT_PAGEFAULT, it means it
+ * was a read fault, otherwise if set it means it's
+ * a write fault.
*/
- address |= UFFD_BIT_WRITE;
+ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
if (reason & VM_UFFD_WP)
/*
- * Encode "reason" fault information as bit number 1
- * in the address read by userland. If bit number 1 is
- * clear it means the reason is a VM_FAULT_MISSING
- * fault.
+ * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
+ * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
+ * not set in a UFFD_EVENT_PAGEFAULT, it means it was
+ * a missing fault, otherwise if set it means it's a
+ * write protect fault.
*/
- address |= UFFD_BIT_WP;
- return address;
+ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
+ return msg;
}
/*
@@ -242,7 +258,7 @@
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current;
- uwq.address = userfault_address(address, flags, reason);
+ uwq.msg = userfault_msg(address, flags, reason);
uwq.pending = true;
uwq.ctx = ctx;
@@ -398,7 +414,7 @@
}
static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
- __u64 *addr)
+ struct uffd_msg *msg)
{
ssize_t ret;
DECLARE_WAITQUEUE(wait, current);
@@ -416,8 +432,8 @@
* disappear from under us.
*/
uwq->pending = false;
- /* careful to always initialize addr if ret == 0 */
- *addr = uwq->address;
+ /* careful to always initialize msg if ret == 0 */
+ *msg = uwq->msg;
spin_unlock(&ctx->fault_wqh.lock);
ret = 0;
break;
@@ -447,8 +463,7 @@
{
struct userfaultfd_ctx *ctx = file->private_data;
ssize_t _ret, ret = 0;
- /* careful to always initialize addr if ret == 0 */
- __u64 uninitialized_var(addr);
+ struct uffd_msg msg;
int no_wait = file->f_flags & O_NONBLOCK;
if (ctx->state == UFFD_STATE_WAIT_API)
@@ -456,16 +471,16 @@
BUG_ON(ctx->state != UFFD_STATE_RUNNING);
for (;;) {
- if (count < sizeof(addr))
+ if (count < sizeof(msg))
return ret ? ret : -EINVAL;
- _ret = userfaultfd_ctx_read(ctx, no_wait, &addr);
+ _ret = userfaultfd_ctx_read(ctx, no_wait, &msg);
if (_ret < 0)
return ret ? ret : _ret;
- if (put_user(addr, (__u64 __user *) buf))
+ if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
return ret ? ret : -EFAULT;
- ret += sizeof(addr);
- buf += sizeof(addr);
- count -= sizeof(addr);
+ ret += sizeof(msg);
+ buf += sizeof(msg);
+ count -= sizeof(msg);
/*
* Allow to read more than one fault at time but only
* block if waiting for the very first one.
@@ -873,17 +888,15 @@
if (ctx->state != UFFD_STATE_WAIT_API)
goto out;
ret = -EFAULT;
- if (copy_from_user(&uffdio_api, buf, sizeof(__u64)))
+ if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
goto out;
- if (uffdio_api.api != UFFD_API) {
- /* careful not to leak info, we only read the first 8 bytes */
+ if (uffdio_api.api != UFFD_API || uffdio_api.features) {
memset(&uffdio_api, 0, sizeof(uffdio_api));
if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
goto out;
ret = -EINVAL;
goto out;
}
- /* careful not to leak info, we only read the first 8 bytes */
uffdio_api.features = UFFD_API_FEATURES;
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;