Use partial TLAB regions
Instead of handing each thread a whole 256K region as its TLAB up front,
keep the 256K region but hand it out to the thread in 16K increments.
This fixes pathological multithreaded-allocation cases that caused
frequent GCs: each thread reserving 256K at a time would often bump the
allocated-bytes counter past the GC start threshold even though most of
the reservation was still unused. Now threads only bump the counter
every 16K.
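
To make the counter effect concrete, below is a minimal standalone sketch
(not ART code; the thread count, per-thread usage, and step sizes are
illustrative assumptions) of how much the GC-trigger counter overstates real
usage when whole 256K reservations are counted up front versus 16K increments:

#include <cstddef>
#include <cstdio>

int main() {
  constexpr std::size_t kKB = 1024;
  constexpr std::size_t kThreads = 16;
  constexpr std::size_t kUsedPerThread = 8 * kKB;  // bytes each thread has actually touched

  // Counter value right after every thread takes its first TLAB reservation.
  constexpr std::size_t counted_256k = kThreads * 256 * kKB;  // old: whole region counted at once
  constexpr std::size_t counted_16k = kThreads * 16 * kKB;    // new: only the 16K increment counted
  constexpr std::size_t really_used = kThreads * kUsedPerThread;

  std::printf("really used:             %zu KB\n", really_used / kKB);
  std::printf("counted with 256K steps: %zu KB\n", counted_256k / kKB);
  std::printf("counted with 16K steps:  %zu KB\n", counted_16k / kKB);
  // A concurrent-GC start threshold compared against the counted value fires far
  // earlier under the 256K scheme even though the live data is identical.
  return 0;
}
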
System-wide results (average of 5 samples on N6P):
Total GC time 60s after starting shell: 45s -> 24s
Average .Heap PSS 60s after starting shell: 57900k -> 58682k
BinaryTrees gets around 5% slower, but the numbers are noisy.
Boot time: 13.302 -> 12.899 (average of 100 runs)
Bug: 35872915
Bug: 36216292
Test: test-art-host
(cherry picked from commit bf48003fa32d2845f2213c0ba31af6677715662d)
Change-Id: I5ab22420124eeadc0a53519c70112274101dfb39
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index 4a2e34f..6d271ed 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -98,7 +98,7 @@
ADD_TEST_EQ(THREAD_LOCAL_END_OFFSET,
art::Thread::ThreadLocalEndOffset<POINTER_SIZE>().Int32Value())
// Offset of field Thread::tlsPtr_.thread_local_objects.
-#define THREAD_LOCAL_OBJECTS_OFFSET (THREAD_LOCAL_END_OFFSET + __SIZEOF_POINTER__)
+#define THREAD_LOCAL_OBJECTS_OFFSET (THREAD_LOCAL_END_OFFSET + 2 * __SIZEOF_POINTER__)
ADD_TEST_EQ(THREAD_LOCAL_OBJECTS_OFFSET,
art::Thread::ThreadLocalObjectsOffset<POINTER_SIZE>().Int32Value())
diff --git a/runtime/gc/allocator_type.h b/runtime/gc/allocator_type.h
index 185a9b7..2f1f577 100644
--- a/runtime/gc/allocator_type.h
+++ b/runtime/gc/allocator_type.h
@@ -35,6 +35,10 @@
};
std::ostream& operator<<(std::ostream& os, const AllocatorType& rhs);
+inline constexpr bool IsTLABAllocator(AllocatorType allocator) {
+ return allocator == kAllocatorTypeTLAB || allocator == kAllocatorTypeRegionTLAB;
+}
+
} // namespace gc
} // namespace art
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 394e541..a50d125 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -77,12 +77,11 @@
size_t bytes_allocated;
size_t usable_size;
size_t new_num_bytes_allocated = 0;
- if (allocator == kAllocatorTypeTLAB || allocator == kAllocatorTypeRegionTLAB) {
+ if (IsTLABAllocator(allocator)) {
byte_count = RoundUp(byte_count, space::BumpPointerSpace::kAlignment);
}
// If we have a thread local allocation we don't need to update bytes allocated.
- if ((allocator == kAllocatorTypeTLAB || allocator == kAllocatorTypeRegionTLAB) &&
- byte_count <= self->TlabSize()) {
+ if (IsTLABAllocator(allocator) && byte_count <= self->TlabSize()) {
obj = self->AllocTlab(byte_count);
DCHECK(obj != nullptr) << "AllocTlab can't fail";
obj->SetClass(klass);
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 4a25610..28dd627 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -135,6 +135,13 @@
static const char* kRegionSpaceName = "main space (region space)";
+// If true, we log all GCs in both the foreground and background. Used for debugging.
+static constexpr bool kLogAllGCs = false;
+
+// How much we grow the TLAB if we can do it.
+static constexpr size_t kPartialTlabSize = 16 * KB;
+static constexpr bool kUsePartialTlabs = true;
+
#if defined(__LP64__) || !defined(ADDRESS_SANITIZER)
// 300 MB (0x12c00000) - (default non-moving space capacity).
static uint8_t* const kPreferredAllocSpaceBegin =
@@ -2762,7 +2769,7 @@
const std::vector<uint64_t>& pause_times = GetCurrentGcIteration()->GetPauseTimes();
// Print the GC if it is an explicit GC (e.g. Runtime.gc()) or a slow GC
// (mutator time blocked >= long_pause_log_threshold_).
- bool log_gc = gc_cause == kGcCauseExplicit;
+ bool log_gc = kLogAllGCs || gc_cause == kGcCauseExplicit;
if (!log_gc && CareAboutPauseTimes()) {
// GC for alloc pauses the allocating thread, so consider it as a pause.
log_gc = duration > long_gc_log_threshold_ ||
@@ -4182,7 +4189,21 @@
size_t* usable_size,
size_t* bytes_tl_bulk_allocated) {
const AllocatorType allocator_type = GetCurrentAllocator();
- if (allocator_type == kAllocatorTypeTLAB) {
+ if (kUsePartialTlabs && alloc_size <= self->TlabRemainingCapacity()) {
+ DCHECK_GT(alloc_size, self->TlabSize());
+ // There is enough space if we grow the TLAB. Let's do that. This increases the
+ // TLAB bytes.
+ const size_t min_expand_size = alloc_size - self->TlabSize();
+ const size_t expand_bytes = std::max(
+ min_expand_size,
+ std::min(self->TlabRemainingCapacity() - self->TlabSize(), kPartialTlabSize));
+ if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, expand_bytes, grow))) {
+ return nullptr;
+ }
+ *bytes_tl_bulk_allocated = expand_bytes;
+ self->ExpandTlab(expand_bytes);
+ DCHECK_LE(alloc_size, self->TlabSize());
+ } else if (allocator_type == kAllocatorTypeTLAB) {
DCHECK(bump_pointer_space_ != nullptr);
const size_t new_tlab_size = alloc_size + kDefaultTLABSize;
if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, new_tlab_size, grow))) {
@@ -4202,15 +4223,18 @@
if (LIKELY(!IsOutOfMemoryOnAllocation(allocator_type,
space::RegionSpace::kRegionSize,
grow))) {
+ const size_t new_tlab_size = kUsePartialTlabs
+ ? std::max(alloc_size, kPartialTlabSize)
+ : gc::space::RegionSpace::kRegionSize;
// Try to allocate a tlab.
- if (!region_space_->AllocNewTlab(self)) {
+ if (!region_space_->AllocNewTlab(self, new_tlab_size)) {
// Failed to allocate a tlab. Try non-tlab.
return region_space_->AllocNonvirtual<false>(alloc_size,
bytes_allocated,
usable_size,
bytes_tl_bulk_allocated);
}
- *bytes_tl_bulk_allocated = space::RegionSpace::kRegionSize;
+ *bytes_tl_bulk_allocated = new_tlab_size;
// Fall-through to using the TLAB below.
} else {
// Check OOME for a non-tlab allocation.
diff --git a/runtime/gc/space/bump_pointer_space.cc b/runtime/gc/space/bump_pointer_space.cc
index 1303d77..426b332 100644
--- a/runtime/gc/space/bump_pointer_space.cc
+++ b/runtime/gc/space/bump_pointer_space.cc
@@ -249,7 +249,7 @@
void BumpPointerSpace::RevokeThreadLocalBuffersLocked(Thread* thread) {
objects_allocated_.FetchAndAddSequentiallyConsistent(thread->GetThreadLocalObjectsAllocated());
bytes_allocated_.FetchAndAddSequentiallyConsistent(thread->GetThreadLocalBytesAllocated());
- thread->SetTlab(nullptr, nullptr);
+ thread->SetTlab(nullptr, nullptr, nullptr);
}
bool BumpPointerSpace::AllocNewTlab(Thread* self, size_t bytes) {
@@ -259,7 +259,7 @@
if (start == nullptr) {
return false;
}
- self->SetTlab(start, start + bytes);
+ self->SetTlab(start, start + bytes, start + bytes);
return true;
}
diff --git a/runtime/gc/space/region_space-inl.h b/runtime/gc/space/region_space-inl.h
index 5809027..3910a03 100644
--- a/runtime/gc/space/region_space-inl.h
+++ b/runtime/gc/space/region_space-inl.h
@@ -18,6 +18,7 @@
#define ART_RUNTIME_GC_SPACE_REGION_SPACE_INL_H_
#include "region_space.h"
+#include "thread-inl.h"
namespace art {
namespace gc {
@@ -335,6 +336,28 @@
return nullptr;
}
+inline size_t RegionSpace::Region::BytesAllocated() const {
+ if (IsLarge()) {
+ DCHECK_LT(begin_ + kRegionSize, Top());
+ return static_cast<size_t>(Top() - begin_);
+ } else if (IsLargeTail()) {
+ DCHECK_EQ(begin_, Top());
+ return 0;
+ } else {
+ DCHECK(IsAllocated()) << static_cast<uint>(state_);
+ DCHECK_LE(begin_, Top());
+ size_t bytes;
+ if (is_a_tlab_) {
+ bytes = thread_->GetThreadLocalBytesAllocated();
+ } else {
+ bytes = static_cast<size_t>(Top() - begin_);
+ }
+ DCHECK_LE(bytes, kRegionSize);
+ return bytes;
+ }
+}
+
+
} // namespace space
} // namespace gc
} // namespace art
diff --git a/runtime/gc/space/region_space.cc b/runtime/gc/space/region_space.cc
index 1ad4843..09b4a3a 100644
--- a/runtime/gc/space/region_space.cc
+++ b/runtime/gc/space/region_space.cc
@@ -427,7 +427,7 @@
r->objects_allocated_.FetchAndAddSequentiallyConsistent(1);
}
-bool RegionSpace::AllocNewTlab(Thread* self) {
+bool RegionSpace::AllocNewTlab(Thread* self, size_t min_bytes) {
MutexLock mu(self, region_lock_);
RevokeThreadLocalBuffersLocked(self);
// Retain sufficient free regions for full evacuation.
@@ -443,7 +443,7 @@
r->SetTop(r->End());
r->is_a_tlab_ = true;
r->thread_ = self;
- self->SetTlab(r->Begin(), r->End());
+ self->SetTlab(r->Begin(), r->Begin() + min_bytes, r->End());
return true;
}
}
@@ -463,13 +463,13 @@
DCHECK_ALIGNED(tlab_start, kRegionSize);
Region* r = RefToRegionLocked(reinterpret_cast<mirror::Object*>(tlab_start));
DCHECK(r->IsAllocated());
- DCHECK_EQ(thread->GetThreadLocalBytesAllocated(), kRegionSize);
+ DCHECK_LE(thread->GetThreadLocalBytesAllocated(), kRegionSize);
r->RecordThreadLocalAllocations(thread->GetThreadLocalObjectsAllocated(),
thread->GetThreadLocalBytesAllocated());
r->is_a_tlab_ = false;
r->thread_ = nullptr;
}
- thread->SetTlab(nullptr, nullptr);
+ thread->SetTlab(nullptr, nullptr, nullptr);
}
size_t RegionSpace::RevokeAllThreadLocalBuffers() {
diff --git a/runtime/gc/space/region_space.h b/runtime/gc/space/region_space.h
index 2537929..80eecca 100644
--- a/runtime/gc/space/region_space.h
+++ b/runtime/gc/space/region_space.h
@@ -234,7 +234,7 @@
}
void RecordAlloc(mirror::Object* ref) REQUIRES(!region_lock_);
- bool AllocNewTlab(Thread* self) REQUIRES(!region_lock_);
+ bool AllocNewTlab(Thread* self, size_t min_bytes) REQUIRES(!region_lock_);
uint32_t Time() {
return time_;
@@ -417,21 +417,7 @@
return live_bytes_;
}
- size_t BytesAllocated() const {
- if (IsLarge()) {
- DCHECK_LT(begin_ + kRegionSize, Top());
- return static_cast<size_t>(Top() - begin_);
- } else if (IsLargeTail()) {
- DCHECK_EQ(begin_, Top());
- return 0;
- } else {
- DCHECK(IsAllocated()) << static_cast<uint>(state_);
- DCHECK_LE(begin_, Top());
- size_t bytes = static_cast<size_t>(Top() - begin_);
- DCHECK_LE(bytes, kRegionSize);
- return bytes;
- }
- }
+ size_t BytesAllocated() const;
size_t ObjectsAllocated() const {
if (IsLarge()) {
@@ -476,7 +462,7 @@
DCHECK_EQ(Top(), end_);
objects_allocated_.StoreRelaxed(num_objects);
top_.StoreRelaxed(begin_ + num_bytes);
- DCHECK_EQ(Top(), end_);
+ DCHECK_LE(Top(), end_);
}
private:
diff --git a/runtime/oat.h b/runtime/oat.h
index 58ea91b..05706252 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,7 +32,7 @@
class PACKED(4) OatHeader {
public:
static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
- static constexpr uint8_t kOatVersion[] = { '1', '1', '8', '\0' }; // ARM64 Read barriers thunks.
+ static constexpr uint8_t kOatVersion[] = { '1', '1', '9', '\0' }; // Add thread_local_limit.
static constexpr const char* kImageLocationKey = "image-location";
static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index 02a1e4d..aa769fa 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -303,10 +303,6 @@
}
}
-inline size_t Thread::TlabSize() const {
- return tlsPtr_.thread_local_end - tlsPtr_.thread_local_pos;
-}
-
inline mirror::Object* Thread::AllocTlab(size_t bytes) {
DCHECK_GE(TlabSize(), bytes);
++tlsPtr_.thread_local_objects;
diff --git a/runtime/thread.cc b/runtime/thread.cc
index a8a03c7..f887aaa 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -3468,11 +3468,13 @@
}
}
-void Thread::SetTlab(uint8_t* start, uint8_t* end) {
+void Thread::SetTlab(uint8_t* start, uint8_t* end, uint8_t* limit) {
DCHECK_LE(start, end);
+ DCHECK_LE(end, limit);
tlsPtr_.thread_local_start = start;
tlsPtr_.thread_local_pos = tlsPtr_.thread_local_start;
tlsPtr_.thread_local_end = end;
+ tlsPtr_.thread_local_limit = limit;
tlsPtr_.thread_local_objects = 0;
}
diff --git a/runtime/thread.h b/runtime/thread.h
index 4d5d644..5251012 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -1035,10 +1035,24 @@
void ResetQuickAllocEntryPointsForThread(bool is_marking);
// Returns the remaining space in the TLAB.
- size_t TlabSize() const;
+ size_t TlabSize() const {
+ return tlsPtr_.thread_local_end - tlsPtr_.thread_local_pos;
+ }
+
+ // Returns the remaining space in the TLAB if we were to expand it to maximum capacity.
+ size_t TlabRemainingCapacity() const {
+ return tlsPtr_.thread_local_limit - tlsPtr_.thread_local_pos;
+ }
+
+ // Expand the TLAB by a fixed number of bytes. There must be enough capacity to do so.
+ void ExpandTlab(size_t bytes) {
+ tlsPtr_.thread_local_end += bytes;
+ DCHECK_LE(tlsPtr_.thread_local_end, tlsPtr_.thread_local_limit);
+ }
+
// Doesn't check that there is room.
mirror::Object* AllocTlab(size_t bytes);
- void SetTlab(uint8_t* start, uint8_t* end);
+ void SetTlab(uint8_t* start, uint8_t* end, uint8_t* limit);
bool HasTlab() const;
uint8_t* GetTlabStart() {
return tlsPtr_.thread_local_start;
@@ -1451,6 +1465,7 @@
frame_id_to_shadow_frame(nullptr), name(nullptr), pthread_self(0),
last_no_thread_suspension_cause(nullptr), checkpoint_function(nullptr),
thread_local_start(nullptr), thread_local_pos(nullptr), thread_local_end(nullptr),
+ thread_local_limit(nullptr),
thread_local_objects(0), mterp_current_ibase(nullptr), mterp_default_ibase(nullptr),
mterp_alt_ibase(nullptr), thread_local_alloc_stack_top(nullptr),
thread_local_alloc_stack_end(nullptr),
@@ -1577,6 +1592,10 @@
uint8_t* thread_local_pos;
uint8_t* thread_local_end;
+ // Thread local limit is how much we can expand the thread local buffer to; it is greater than
+ // or equal to thread_local_end.
+ uint8_t* thread_local_limit;
+
size_t thread_local_objects;
// Entrypoint function pointers.