RosAlloc thread-local allocation path without a CAS.

Speedup on N4:
MemAllocTest 3044 -> 2396 (~21% reduction)
BinaryTrees  4101 -> 2929 (~29% reduction)
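
The change works by shifting the heavyweight bookkeeping from the per-object
fast path to the per-run slow path. Below is a minimal sketch of that
accounting scheme, assuming a simplified model of RosAlloc: the counter names
(num_bytes_allocated_, num_bytes_freed_revoke_) match this patch, but
ThreadLocalRun, BulkCountRun, AllocFromRun and RevokeRun are hypothetical
stand-ins for the real RosAlloc run machinery. When a thread claims a
thread-local run, all of the run's bytes are counted as allocated with one
atomic add (the bytes_tl_bulk_allocated value threaded through TryToAllocate),
so each object allocation from that run is a plain pointer bump with no CAS.
When the run is revoked, the bytes that were bulk-counted but are not actually
in use are accumulated into num_bytes_freed_revoke_ and subtracted from
num_bytes_allocated_ at the next GC by RecordFreeRevoke().

    #include <atomic>
    #include <cstddef>
    #include <cstdint>

    // Global counters, named as in the patch.
    std::atomic<size_t> num_bytes_allocated_{0};
    std::atomic<size_t> num_bytes_freed_revoke_{0};

    // Hypothetical, simplified stand-in for a RosAlloc thread-local run.
    struct ThreadLocalRun {
      uint8_t* pos;         // next free byte; thread-local, no synchronization
      uint8_t* end;         // one past the last byte of the run
      size_t bracket_size;  // fixed allocation size served by this run
    };

    // Slow path, taken once per run: count every byte of the run as allocated
    // up front with a single atomic add (the bytes_tl_bulk_allocated amount).
    void BulkCountRun(ThreadLocalRun* run) {
      size_t run_bytes = static_cast<size_t>(run->end - run->pos);
      num_bytes_allocated_.fetch_add(run_bytes, std::memory_order_seq_cst);
    }

    // Fast path: a plain pointer bump on thread-local state, no CAS needed.
    void* AllocFromRun(ThreadLocalRun* run) {
      if (run->pos + run->bracket_size > run->end) {
        return nullptr;  // Run exhausted; the slow path claims a new run.
      }
      void* obj = run->pos;
      run->pos += run->bracket_size;
      return obj;
    }

    // Revoke: bytes that were bulk-counted but never handed out are recorded,
    // so RecordFreeRevoke() can later subtract them from num_bytes_allocated_.
    void RevokeRun(ThreadLocalRun* run) {
      size_t unused = static_cast<size_t>(run->end - run->pos);
      if (unused > 0) {
        num_bytes_freed_revoke_.fetch_add(unused, std::memory_order_seq_cst);
      }
      run->pos = run->end;  // The run is no longer thread-local after this.
    }

In this sketch the single fetch_add per run replaces a CAS per object on the
fast path, at the cost of num_bytes_allocated_ temporarily over-counting;
RecordFreeRevoke() reconciles that over-count when thread-local runs are
revoked (e.g. at GC time), which is what the heap.cc changes below wire up.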

Bug: 9986565
Change-Id: Ia1d1a37b9e001f903c3c056e8ec68fc8c623a78b
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 7534515..a41d65c 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -156,6 +156,7 @@
       total_objects_freed_ever_(0),
       num_bytes_allocated_(0),
       native_bytes_allocated_(0),
+      num_bytes_freed_revoke_(0),
       verify_missing_card_marks_(false),
       verify_system_weaks_(false),
       verify_pre_gc_heap_(verify_pre_gc_heap),
@@ -1344,6 +1345,19 @@
   }
 }
 
+void Heap::RecordFreeRevoke() {
+  // Subtract num_bytes_freed_revoke_ from num_bytes_allocated_ to cancel out the
+  // ahead-of-time, bulk counting of bytes allocated in rosalloc thread-local buffers.
+  // If there's a concurrent revoke, it is OK not to reset num_bytes_freed_revoke_
+  // exactly to zero, since the remainder will be subtracted at the next GC.
+  size_t bytes_freed = num_bytes_freed_revoke_.LoadSequentiallyConsistent();
+  CHECK_GE(num_bytes_freed_revoke_.FetchAndSubSequentiallyConsistent(bytes_freed),
+           bytes_freed) << "num_bytes_freed_revoke_ underflow";
+  CHECK_GE(num_bytes_allocated_.FetchAndSubSequentiallyConsistent(bytes_freed),
+           bytes_freed) << "num_bytes_allocated_ underflow";
+  GetCurrentGcIteration()->SetFreedRevoke(bytes_freed);
+}
+
 space::RosAllocSpace* Heap::GetRosAllocSpace(gc::allocator::RosAlloc* rosalloc) const {
   for (const auto& space : continuous_spaces_) {
     if (space->AsContinuousSpace()->IsRosAllocSpace()) {
@@ -1358,6 +1372,7 @@
 mirror::Object* Heap::AllocateInternalWithGc(Thread* self, AllocatorType allocator,
                                              size_t alloc_size, size_t* bytes_allocated,
                                              size_t* usable_size,
+                                             size_t* bytes_tl_bulk_allocated,
                                              mirror::Class** klass) {
   bool was_default_allocator = allocator == GetCurrentAllocator();
   // Make sure there is no pending exception since we may need to throw an OOME.
@@ -1377,7 +1392,7 @@
     }
     // A GC was in progress and we blocked, retry allocation now that memory has been freed.
     mirror::Object* ptr = TryToAllocate<true, false>(self, allocator, alloc_size, bytes_allocated,
-                                                     usable_size);
+                                                     usable_size, bytes_tl_bulk_allocated);
     if (ptr != nullptr) {
       return ptr;
     }
@@ -1391,7 +1406,7 @@
   }
   if (gc_ran) {
     mirror::Object* ptr = TryToAllocate<true, false>(self, allocator, alloc_size, bytes_allocated,
-                                                     usable_size);
+                                                     usable_size, bytes_tl_bulk_allocated);
     if (ptr != nullptr) {
       return ptr;
     }
@@ -1411,7 +1426,7 @@
     if (plan_gc_ran) {
       // Did we free sufficient memory for the allocation to succeed?
       mirror::Object* ptr = TryToAllocate<true, false>(self, allocator, alloc_size, bytes_allocated,
-                                                       usable_size);
+                                                       usable_size, bytes_tl_bulk_allocated);
       if (ptr != nullptr) {
         return ptr;
       }
@@ -1420,7 +1435,7 @@
   // Allocations have failed after GCs;  this is an exceptional state.
   // Try harder, growing the heap if necessary.
   mirror::Object* ptr = TryToAllocate<true, true>(self, allocator, alloc_size, bytes_allocated,
-                                                  usable_size);
+                                                  usable_size, bytes_tl_bulk_allocated);
   if (ptr != nullptr) {
     return ptr;
   }
@@ -1437,7 +1452,8 @@
   if (was_default_allocator && allocator != GetCurrentAllocator()) {
     return nullptr;
   }
-  ptr = TryToAllocate<true, true>(self, allocator, alloc_size, bytes_allocated, usable_size);
+  ptr = TryToAllocate<true, true>(self, allocator, alloc_size, bytes_allocated, usable_size,
+                                  bytes_tl_bulk_allocated);
   if (ptr == nullptr) {
     const uint64_t current_time = NanoTime();
     switch (allocator) {
@@ -1453,7 +1469,7 @@
             case HomogeneousSpaceCompactResult::kSuccess:
               // If the allocation succeeded, we delayed an oom.
               ptr = TryToAllocate<true, true>(self, allocator, alloc_size, bytes_allocated,
-                                              usable_size);
+                                              usable_size, bytes_tl_bulk_allocated);
               if (ptr != nullptr) {
                 count_delayed_oom_++;
               }
@@ -1498,7 +1514,7 @@
           } else {
             LOG(WARNING) << "Disabled moving GC due to the non moving space being full";
             ptr = TryToAllocate<true, true>(self, allocator, alloc_size, bytes_allocated,
-                                            usable_size);
+                                            usable_size, bytes_tl_bulk_allocated);
           }
         }
         break;
@@ -1984,8 +2000,8 @@
     if (it == bins_.end()) {
       // No available space in the bins, place it in the target space instead (grows the zygote
       // space).
-      size_t bytes_allocated;
-      forward_address = to_space_->Alloc(self_, object_size, &bytes_allocated, nullptr);
+      size_t bytes_allocated, dummy;
+      forward_address = to_space_->Alloc(self_, object_size, &bytes_allocated, nullptr, &dummy);
       if (to_space_live_bitmap_ != nullptr) {
         to_space_live_bitmap_->Set(forward_address);
       } else {
@@ -3084,7 +3100,8 @@
     SetIdealFootprint(target_size);
     if (IsGcConcurrent()) {
       const uint64_t freed_bytes = current_gc_iteration_.GetFreedBytes() +
-          current_gc_iteration_.GetFreedLargeObjectBytes();
+          current_gc_iteration_.GetFreedLargeObjectBytes() +
+          current_gc_iteration_.GetFreedRevokeBytes();
       // Bytes allocated will shrink by freed_bytes after the GC runs, so if we want to figure out
       // how many bytes were allocated during the GC we need to add freed_bytes back on.
       CHECK_GE(bytes_allocated + freed_bytes, bytes_allocated_before_gc);
@@ -3290,31 +3307,43 @@
 
 void Heap::RevokeThreadLocalBuffers(Thread* thread) {
   if (rosalloc_space_ != nullptr) {
-    rosalloc_space_->RevokeThreadLocalBuffers(thread);
+    size_t freed_bytes_revoke = rosalloc_space_->RevokeThreadLocalBuffers(thread);
+    if (freed_bytes_revoke > 0U) {
+      num_bytes_freed_revoke_.FetchAndAddSequentiallyConsistent(freed_bytes_revoke);
+      CHECK_GE(num_bytes_allocated_.LoadRelaxed(), num_bytes_freed_revoke_.LoadRelaxed());
+    }
   }
   if (bump_pointer_space_ != nullptr) {
-    bump_pointer_space_->RevokeThreadLocalBuffers(thread);
+    CHECK_EQ(bump_pointer_space_->RevokeThreadLocalBuffers(thread), 0U);
   }
   if (region_space_ != nullptr) {
-    region_space_->RevokeThreadLocalBuffers(thread);
+    CHECK_EQ(region_space_->RevokeThreadLocalBuffers(thread), 0U);
   }
 }
 
 void Heap::RevokeRosAllocThreadLocalBuffers(Thread* thread) {
   if (rosalloc_space_ != nullptr) {
-    rosalloc_space_->RevokeThreadLocalBuffers(thread);
+    size_t freed_bytes_revoke = rosalloc_space_->RevokeThreadLocalBuffers(thread);
+    if (freed_bytes_revoke > 0U) {
+      num_bytes_freed_revoke_.FetchAndAddSequentiallyConsistent(freed_bytes_revoke);
+      CHECK_GE(num_bytes_allocated_.LoadRelaxed(), num_bytes_freed_revoke_.LoadRelaxed());
+    }
   }
 }
 
 void Heap::RevokeAllThreadLocalBuffers() {
   if (rosalloc_space_ != nullptr) {
-    rosalloc_space_->RevokeAllThreadLocalBuffers();
+    size_t freed_bytes_revoke = rosalloc_space_->RevokeAllThreadLocalBuffers();
+    if (freed_bytes_revoke > 0U) {
+      num_bytes_freed_revoke_.FetchAndAddSequentiallyConsistent(freed_bytes_revoke);
+      CHECK_GE(num_bytes_allocated_.LoadRelaxed(), num_bytes_freed_revoke_.LoadRelaxed());
+    }
   }
   if (bump_pointer_space_ != nullptr) {
-    bump_pointer_space_->RevokeAllThreadLocalBuffers();
+    CHECK_EQ(bump_pointer_space_->RevokeAllThreadLocalBuffers(), 0U);
   }
   if (region_space_ != nullptr) {
-    region_space_->RevokeAllThreadLocalBuffers();
+    CHECK_EQ(region_space_->RevokeAllThreadLocalBuffers(), 0U);
   }
 }