Rewrite thread root flip synchronization.

Mark a pending "flip function" with a `ThreadFlag` for faster
JNI transitions.
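
A minimal sketch of the idea (hypothetical names, not the actual
ART types): with the pending flip function recorded as a bit in the
same atomic word as the thread state, the JNI transition fast path
can reject all slow cases with a single relaxed load and mask test
instead of an extra load of a separate flip-function pointer.

    #include <atomic>
    #include <cstdint>

    enum class Flag : uint32_t {
      kSuspendRequest      = 1u << 0,
      kCheckpointRequest   = 1u << 1,
      kPendingFlipFunction = 1u << 2,  // flip function installed, not yet run
    };

    class StateAndFlagsWord {
     public:
      bool AnyFlagSet(uint32_t mask) const {
        return (word_.load(std::memory_order_relaxed) & mask) != 0u;
      }
      void SetFlag(Flag f) {  // installer side, e.g. the GC
        word_.fetch_or(static_cast<uint32_t>(f), std::memory_order_seq_cst);
      }
     private:
      std::atomic<uint32_t> word_{0u};
    };

    // Fast path on return from native code: one relaxed load, one mask test.
    inline bool NeedsSlowPath(const StateAndFlagsWord& w) {
      constexpr uint32_t kSlowPathMask =
          static_cast<uint32_t>(Flag::kSuspendRequest) |
          static_cast<uint32_t>(Flag::kCheckpointRequest) |
          static_cast<uint32_t>(Flag::kPendingFlipFunction);
      return w.AnyFlagSet(kSlowPathMask);
    }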

Use two more `ThreadFlag`s to fix potential race conditions.
Some checkpoints previously ran the flip function on behalf of
a suspended thread because they relied on the thread roots
being flipped, but doing that without any synchronization
meant that two such checkpoints could race and one could
execute its own code while the other was still running the
flip function. Other checkpoints that performed a stack walk
did not run the flip function at all, so they could have seen
from-space references. We now check for a pending or running
flip function at the start of `Thread::RunCheckpointFunction()`
and proceed only after it has completed; holding the mutator
lock for the duration of the whole function prevents a new
flip function from being installed until the checkpoint
finishes.
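
A simplified model of that checkpoint-side handshake, using a
mutex and condition variable rather than the lock-free `ThreadFlag`
protocol in the diff below (hypothetical names, illustration only):

    #include <condition_variable>
    #include <mutex>

    struct FlipState {
      std::mutex lock;
      std::condition_variable done_cond;
      bool pending = false;  // flip function installed, not yet claimed
      bool running = false;  // some thread is executing it right now
    };

    // Called at the start of a checkpoint targeting a suspended thread.
    void EnsureFlipFunctionRan(FlipState& s, void (*flip)()) {
      std::unique_lock<std::mutex> lk(s.lock);
      if (s.pending) {
        // Claim the flip function so no other checkpoint runs it too.
        s.pending = false;
        s.running = true;
        lk.unlock();
        flip();  // flip the thread roots on the suspended thread's behalf
        lk.lock();
        s.running = false;
        s.done_cond.notify_all();
      } else {
        // Another thread claimed it (or none was installed); wait it out.
        s.done_cond.wait(lk, [&s] { return !s.running; });
      }
      // Only now may the checkpoint payload run; in the real code, holding
      // the mutator lock across the whole function keeps a new flip
      // function from being installed until the checkpoint finishes.
    }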

Golem results for art-opt-cc (higher is better):
linux-ia32                       before after
NativeDowncallStaticNormal       46.581 46.813 (+0.4980%)
NativeDowncallStaticNormal6      42.247 42.268 (+0.0497%)
NativeDowncallStaticNormalRefs6  40.918 41.355 (+1.068%)
NativeDowncallVirtualNormal      46.292 46.361 (+0.1496%)
NativeDowncallVirtualNormal6     41.791 41.791 (0%)
NativeDowncallVirtualNormalRefs6 40.500 40.500 (0%)
linux-x64                        before after
NativeDowncallStaticNormal       44.169 43.956 (-0.4815%)
NativeDowncallStaticNormal6      43.198 43.198 (0%)
NativeDowncallStaticNormalRefs6  38.481 38.481 (0%)
NativeDowncallVirtualNormal      43.672 43.672 (0%)
NativeDowncallVirtualNormal6     42.247 42.268 (+0.0497%)
NativeDowncallVirtualNormalRefs6 41.355 41.355 (0%)
linux-armv7                      before after
NativeDowncallStaticNormal       9.9701 10.443 (+4.739%)
NativeDowncallStaticNormal6      9.2457 9.6525 (+4.400%)
NativeDowncallStaticNormalRefs6  8.3868 8.7209 (+3.984%)
NativeDowncallVirtualNormal      9.8377 10.304 (+4.742%)
NativeDowncallVirtualNormal6     9.3596 9.7752 (+4.440%)
NativeDowncallVirtualNormalRefs6 8.4367 8.7719 (+3.973%)
linux-armv8                      before after
NativeDowncallStaticNormal       9.8571 10.685 (+8.397%)
NativeDowncallStaticNormal6      9.4905 10.249 (+7.991%)
NativeDowncallStaticNormalRefs6  8.6705 9.3000 (+7.261%)
NativeDowncallVirtualNormal      9.3183 10.053 (+7.881%)
NativeDowncallVirtualNormal6     9.2638 9.9850 (+7.786%)
NativeDowncallVirtualNormalRefs6 8.2967 8.8714 (+6.926%)
(The x86 and x86-64 differences seem to be lost in noise.)

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Bug: 172332525
Change-Id: I9c2227142010f7fe6ecf07e92273bc65d728c5c6
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index 67e2e6a..960a870 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -49,7 +49,7 @@
 inline void Thread::CheckSuspend() {
   DCHECK_EQ(Thread::Current(), this);
   while (true) {
-    StateAndFlags state_and_flags(tls32_.state_and_flags.load(std::memory_order_relaxed));
+    StateAndFlags state_and_flags = GetStateAndFlags(std::memory_order_relaxed);
     if (LIKELY(!state_and_flags.IsAnyOfFlagsSet(SuspendOrCheckpointRequestFlags()))) {
       break;
     } else if (state_and_flags.IsFlagSet(ThreadFlag::kCheckpointRequest)) {
@@ -113,11 +113,10 @@
   }
 
   while (true) {
-    StateAndFlags old_state_and_flags(tls32_.state_and_flags.load(std::memory_order_relaxed));
+    StateAndFlags old_state_and_flags = GetStateAndFlags(std::memory_order_relaxed);
     CHECK_NE(old_state_and_flags.GetState(), ThreadState::kRunnable)
         << new_state << " " << *this << " " << *Thread::Current();
-    StateAndFlags new_state_and_flags = old_state_and_flags;
-    new_state_and_flags.SetState(new_state);
+    StateAndFlags new_state_and_flags = old_state_and_flags.WithState(new_state);
     bool done =
         tls32_.state_and_flags.CompareAndSetWeakRelaxed(old_state_and_flags.GetValue(),
                                                         new_state_and_flags.GetValue());
@@ -191,7 +190,7 @@
 inline void Thread::TransitionToSuspendedAndRunCheckpoints(ThreadState new_state) {
   DCHECK_NE(new_state, ThreadState::kRunnable);
   while (true) {
-    StateAndFlags old_state_and_flags(tls32_.state_and_flags.load(std::memory_order_relaxed));
+    StateAndFlags old_state_and_flags = GetStateAndFlags(std::memory_order_relaxed);
     DCHECK_EQ(old_state_and_flags.GetState(), ThreadState::kRunnable);
     if (UNLIKELY(old_state_and_flags.IsFlagSet(ThreadFlag::kCheckpointRequest))) {
       RunCheckpointFunction();
@@ -204,8 +203,7 @@
     // Change the state but keep the current flags (kCheckpointRequest is clear).
     DCHECK(!old_state_and_flags.IsFlagSet(ThreadFlag::kCheckpointRequest));
     DCHECK(!old_state_and_flags.IsFlagSet(ThreadFlag::kEmptyCheckpointRequest));
-    StateAndFlags new_state_and_flags = old_state_and_flags;
-    new_state_and_flags.SetState(new_state);
+    StateAndFlags new_state_and_flags = old_state_and_flags.WithState(new_state);
 
     // CAS the value, ensuring that prior memory operations are visible to any thread
     // that observes that we are suspended.
@@ -220,7 +218,7 @@
 
 inline void Thread::PassActiveSuspendBarriers() {
   while (true) {
-    StateAndFlags state_and_flags(tls32_.state_and_flags.load(std::memory_order_relaxed));
+    StateAndFlags state_and_flags = GetStateAndFlags(std::memory_order_relaxed);
     if (LIKELY(!state_and_flags.IsFlagSet(ThreadFlag::kCheckpointRequest) &&
                !state_and_flags.IsFlagSet(ThreadFlag::kEmptyCheckpointRequest) &&
                !state_and_flags.IsFlagSet(ThreadFlag::kActiveSuspendBarrier))) {
@@ -253,7 +251,7 @@
 }
 
 inline ThreadState Thread::TransitionFromSuspendedToRunnable() {
-  StateAndFlags old_state_and_flags(tls32_.state_and_flags.load(std::memory_order_relaxed));
+  StateAndFlags old_state_and_flags = GetStateAndFlags(std::memory_order_relaxed);
   ThreadState old_state = old_state_and_flags.GetState();
   DCHECK_NE(old_state, ThreadState::kRunnable);
   while (true) {
@@ -261,11 +259,12 @@
     // Optimize for the return from native code case - this is the fast path.
     // Atomically change from suspended to runnable if no suspend request pending.
     constexpr uint32_t kCheckedFlags =
-        SuspendOrCheckpointRequestFlags() | enum_cast<uint32_t>(ThreadFlag::kActiveSuspendBarrier);
+        SuspendOrCheckpointRequestFlags() |
+        enum_cast<uint32_t>(ThreadFlag::kActiveSuspendBarrier) |
+        FlipFunctionFlags();
     if (LIKELY(!old_state_and_flags.IsAnyOfFlagsSet(kCheckedFlags))) {
       // CAS the value with a memory barrier.
-      StateAndFlags new_state_and_flags = old_state_and_flags;
-      new_state_and_flags.SetState(ThreadState::kRunnable);
+      StateAndFlags new_state_and_flags = old_state_and_flags.WithState(ThreadState::kRunnable);
       if (LIKELY(tls32_.state_and_flags.CompareAndSetWeakAcquire(old_state_and_flags.GetValue(),
                                                                  new_state_and_flags.GetValue()))) {
         // Mark the acquisition of a share of the mutator lock.
@@ -276,15 +275,13 @@
       PassActiveSuspendBarriers(this);
     } else if (UNLIKELY(old_state_and_flags.IsFlagSet(ThreadFlag::kCheckpointRequest) ||
                         old_state_and_flags.IsFlagSet(ThreadFlag::kEmptyCheckpointRequest))) {
-      // Impossible
-      StateAndFlags flags = old_state_and_flags;
+      // Checkpoint flags should not be set while in suspended state.
       static_assert(static_cast<std::underlying_type_t<ThreadState>>(ThreadState::kRunnable) == 0u);
-      flags.SetState(ThreadState::kRunnable);  // Note: Keeping unused bits.
-      LOG(FATAL) << "Transitioning to runnable with checkpoint flag, "
-                 << " flags=" << flags.GetValue()  // State set to kRunnable = 0.
+      LOG(FATAL) << "Transitioning to Runnable with checkpoint flag,"
+                 // Note: Keeping unused flags. If they are set, it points to memory corruption.
+                 << " flags=" << old_state_and_flags.WithState(ThreadState::kRunnable).GetValue()
                  << " state=" << old_state_and_flags.GetState();
-    } else {
-      DCHECK(old_state_and_flags.IsFlagSet(ThreadFlag::kSuspendRequest));
+    } else if (old_state_and_flags.IsFlagSet(ThreadFlag::kSuspendRequest)) {
       // Wait while our suspend count is non-zero.
 
       // We pass null to the MutexLock as we may be in a situation where the
@@ -299,26 +296,44 @@
       MutexLock mu(thread_to_pass, *Locks::thread_suspend_count_lock_);
       ScopedTransitioningToRunnable scoped_transitioning_to_runnable(this);
       // Reload state and flags after locking the mutex.
-      old_state_and_flags.SetValue(tls32_.state_and_flags.load(std::memory_order_relaxed));
+      old_state_and_flags = GetStateAndFlags(std::memory_order_relaxed);
       DCHECK_EQ(old_state, old_state_and_flags.GetState());
       while (old_state_and_flags.IsFlagSet(ThreadFlag::kSuspendRequest)) {
         // Re-check when Thread::resume_cond_ is notified.
         Thread::resume_cond_->Wait(thread_to_pass);
         // Reload state and flags after waiting.
-        old_state_and_flags.SetValue(tls32_.state_and_flags.load(std::memory_order_relaxed));
+        old_state_and_flags = GetStateAndFlags(std::memory_order_relaxed);
         DCHECK_EQ(old_state, old_state_and_flags.GetState());
       }
       DCHECK_EQ(GetSuspendCount(), 0);
+    } else if (UNLIKELY(old_state_and_flags.IsFlagSet(ThreadFlag::kRunningFlipFunction)) ||
+               UNLIKELY(old_state_and_flags.IsFlagSet(ThreadFlag::kWaitingForFlipFunction))) {
+      // The thread should be suspended while another thread is running the flip function.
+      static_assert(static_cast<std::underlying_type_t<ThreadState>>(ThreadState::kRunnable) == 0u);
+      LOG(FATAL) << "Transitioning to Runnable while another thread is running the flip function,"
+                 // Note: Keeping unused flags. If they are set, it points to memory corruption.
+                 << " flags=" << old_state_and_flags.WithState(ThreadState::kRunnable).GetValue()
+                 << " state=" << old_state_and_flags.GetState();
+    } else {
+      DCHECK(old_state_and_flags.IsFlagSet(ThreadFlag::kPendingFlipFunction));
+      // CAS the value with a memory barrier.
+      // Do not set `ThreadFlag::kRunningFlipFunction` as no other thread can run
+      // the flip function for a thread that is not suspended.
+      StateAndFlags new_state_and_flags = old_state_and_flags.WithState(ThreadState::kRunnable)
+          .WithoutFlag(ThreadFlag::kPendingFlipFunction);
+      if (LIKELY(tls32_.state_and_flags.CompareAndSetWeakAcquire(old_state_and_flags.GetValue(),
+                                                                 new_state_and_flags.GetValue()))) {
+        // Mark the acquisition of a share of the mutator lock.
+        GetMutatorLock()->TransitionFromSuspendedToRunnable(this);
+        // Run the flip function.
+        RunFlipFunction(this, /*notify=*/ false);
+        break;
+      }
     }
     // Reload state and flags.
-    old_state_and_flags.SetValue(tls32_.state_and_flags.load(std::memory_order_relaxed));
+    old_state_and_flags = GetStateAndFlags(std::memory_order_relaxed);
     DCHECK_EQ(old_state, old_state_and_flags.GetState());
   }
-  // Run the flip function, if set.
-  Closure* flip_func = GetFlipFunction();
-  if (flip_func != nullptr) {
-    flip_func->Run(this);
-  }
   return static_cast<ThreadState>(old_state);
 }