Fix CC collector thread flip and JNI critical section deadlocks.

There's a bug in the synchronization between the CC collector's thread
flip and JNI critical sections where it incorrectly attempts to make
the synchronization scheme to be writer (GC) preference to avoid
starvation of GC in the presence of frequent JNI critical section
enter/exit. This could cause a deadlock between them if a thread
enters a nested JNI critical section after a thread flip occurs. This
is reproduced in the added test.

The fix is to use a thread local disable counter in addition to the
global counter to detect a nested enter by the same thread and avoid
waiting if nested.

Bug: 19235243
Bug: 12687968
Change-Id: Idf7720a6906c9ea508219935af3727f76680d2d8
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 3c9312f..a656fb8 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -845,6 +845,13 @@
 void Heap::IncrementDisableThreadFlip(Thread* self) {
   // Supposed to be called by mutators. If thread_flip_running_ is true, block. Otherwise, go ahead.
   CHECK(kUseReadBarrier);
+  bool is_nested = self->GetDisableThreadFlipCount() > 0;
+  self->IncrementDisableThreadFlipCount();
+  if (is_nested) {
+    // If this is a nested JNI critical section enter, we don't need to wait or increment the global
+    // counter. The global counter is incremented only once for a thread for the outermost enter.
+    return;
+  }
   ScopedThreadStateChange tsc(self, kWaitingForGcThreadFlip);
   MutexLock mu(self, *thread_flip_lock_);
   bool has_waited = false;
@@ -867,10 +874,20 @@
   // Supposed to be called by mutators. Decrement disable_thread_flip_count_ and potentially wake up
   // the GC waiting before doing a thread flip.
   CHECK(kUseReadBarrier);
+  self->DecrementDisableThreadFlipCount();
+  bool is_outermost = self->GetDisableThreadFlipCount() == 0;
+  if (!is_outermost) {
+    // If this is not an outermost JNI critical exit, we don't need to decrement the global counter.
+    // The global counter is decremented only once for a thread for the outermost exit.
+    return;
+  }
   MutexLock mu(self, *thread_flip_lock_);
   CHECK_GT(disable_thread_flip_count_, 0U);
   --disable_thread_flip_count_;
-  thread_flip_cond_->Broadcast(self);
+  if (disable_thread_flip_count_ == 0) {
+    // Potentially notify the GC thread blocking to begin a thread flip.
+    thread_flip_cond_->Broadcast(self);
+  }
 }
 
 void Heap::ThreadFlipBegin(Thread* self) {
@@ -882,7 +899,8 @@
   bool has_waited = false;
   uint64_t wait_start = NanoTime();
   CHECK(!thread_flip_running_);
-  // Set this to true before waiting so that a new mutator entering a JNI critical won't starve GC.
+  // Set this to true before waiting so that frequent JNI critical enter/exits won't starve
+  // GC. This like a writer preference of a reader-writer lock.
   thread_flip_running_ = true;
   while (disable_thread_flip_count_ > 0) {
     has_waited = true;
@@ -904,6 +922,7 @@
   MutexLock mu(self, *thread_flip_lock_);
   CHECK(thread_flip_running_);
   thread_flip_running_ = false;
+  // Potentially notify mutator threads blocking to enter a JNI critical section.
   thread_flip_cond_->Broadcast(self);
 }
 
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index c02e2d3..a181e23 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -1113,6 +1113,8 @@
   // Used to synchronize between JNI critical calls and the thread flip of the CC collector.
   Mutex* thread_flip_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
   std::unique_ptr<ConditionVariable> thread_flip_cond_ GUARDED_BY(thread_flip_lock_);
+  // This counter keeps track of how many threads are currently in a JNI critical section. This is
+  // incremented once per thread even with nested enters.
   size_t disable_thread_flip_count_ GUARDED_BY(thread_flip_lock_);
   bool thread_flip_running_ GUARDED_BY(thread_flip_lock_);
 
diff --git a/runtime/thread.h b/runtime/thread.h
index 2726e91..97c47e1 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -852,6 +852,22 @@
     tls32_.weak_ref_access_enabled = enabled;
   }
 
+  uint32_t GetDisableThreadFlipCount() const {
+    CHECK(kUseReadBarrier);
+    return tls32_.disable_thread_flip_count;
+  }
+
+  void IncrementDisableThreadFlipCount() {
+    CHECK(kUseReadBarrier);
+    ++tls32_.disable_thread_flip_count;
+  }
+
+  void DecrementDisableThreadFlipCount() {
+    CHECK(kUseReadBarrier);
+    DCHECK_GT(tls32_.disable_thread_flip_count, 0U);
+    --tls32_.disable_thread_flip_count;
+  }
+
   // Activates single step control for debugging. The thread takes the
   // ownership of the given SingleStepControl*. It is deleted by a call
   // to DeactivateSingleStepControl or upon thread destruction.
@@ -1214,7 +1230,8 @@
       daemon(is_daemon), throwing_OutOfMemoryError(false), no_thread_suspension(0),
       thread_exit_check_count(0), handling_signal_(false),
       suspended_at_suspend_check(false), ready_for_debug_invoke(false),
-      debug_method_entry_(false), is_gc_marking(false), weak_ref_access_enabled(true) {
+      debug_method_entry_(false), is_gc_marking(false), weak_ref_access_enabled(true),
+      disable_thread_flip_count(0) {
     }
 
     union StateAndFlags state_and_flags;
@@ -1281,6 +1298,11 @@
     // pause, this is not an issue.) Other collectors use Runtime::DisallowNewSystemWeaks() and
     // ReferenceProcessor::EnableSlowPath().
     bool32_t weak_ref_access_enabled;
+
+    // A thread local version of Heap::disable_thread_flip_count_. This keeps track of how many
+    // levels of (nested) JNI critical sections the thread is in and is used to detect a nested JNI
+    // critical section enter.
+    uint32_t disable_thread_flip_count;
   } tls32_;
 
   struct PACKED(8) tls_64bit_sized_values {