Thread local bump pointer allocator.

Added a thread local allocator to the heap. Each thread has three
pointers which describe its thread local buffer: start, cur, and
end. When the remaining space in the thread local buffer isn't large
enough for an allocation, the allocator requests a new thread
local buffer from the bump pointer allocator.
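
A minimal sketch of the per-thread fast path, assuming raw byte pointers
on Thread for start/cur/end; TLABSize and AllocTLAB match the names used
in the diff below, but the field layout and the SetTLAB helper are
assumptions, not the actual ART code:

  #include <cstddef>
  #include <cstdint>

  class Thread {
   public:
    // Remaining space in the current thread local buffer.
    size_t TLABSize() const { return static_cast<size_t>(tlab_end_ - tlab_cur_); }

    // Bump allocation inside the buffer; the caller has already checked
    // that TLABSize() >= bytes.
    void* AllocTLAB(size_t bytes) {
      uint8_t* result = tlab_cur_;
      tlab_cur_ += bytes;
      return result;
    }

    // Installed when the bump pointer space hands out a new buffer
    // (AllocNewTLAB in the diff below).
    void SetTLAB(uint8_t* start, uint8_t* end) {
      tlab_start_ = start;
      tlab_cur_ = start;
      tlab_end_ = end;
    }

   private:
    uint8_t* tlab_start_ = nullptr;  // First byte of the thread local buffer.
    uint8_t* tlab_cur_ = nullptr;    // Next free byte (the bump pointer).
    uint8_t* tlab_end_ = nullptr;    // One past the last usable byte.
  };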

The bump pointer space had to be modified to accommodate thread
local buffers. These buffers are called "blocks"; a block is a
buffer which contains a set of adjacent objects. Blocks aren't
necessarily full and may have wasted memory towards the end. Each
block has an 8 byte header which specifies its size and is required
for traversing bump pointer spaces.
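
Roughly, the layout looks like the sketch below; BlockHeader and NextBlock
are hypothetical names, only the 8 byte size field and the back-to-back
packing of blocks come from the description above:

  #include <cstdint>

  // Hypothetical 8 byte header preceding each block's objects.
  struct BlockHeader {
    uint64_t size_;  // Bytes of object data following the header (assumed layout).
  };
  static_assert(sizeof(BlockHeader) == 8, "traversal relies on an 8 byte header");

  // Skip from one block header to the next while walking the space.
  inline BlockHeader* NextBlock(BlockHeader* header) {
    return reinterpret_cast<BlockHeader*>(
        reinterpret_cast<uint8_t*>(header) + sizeof(BlockHeader) + header->size_);
  }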

Memory usage is in between the plain bump pointer allocator and
ROSAlloc, since madvising the unused tail of each block limits wasted
RAM to an average of half a page per block (only the partially used
page at the end of a block stays resident).

Added a runtime option, -XX:UseTLAB, which specifies whether or
not to use the thread local allocator. It's a no-op if the garbage
collector is not the semispace collector.
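
How the option is wired up isn't shown in this diff; a hedged sketch of
the intent, with simplified stand-in enums (only kAllocatorTypeTLAB
appears in the diff, the other identifiers are assumptions):

  // Simplified stand-ins for the real ART enums.
  enum AllocatorType { kAllocatorTypeBumpPointer, kAllocatorTypeTLAB, kAllocatorTypeRosAlloc };
  enum CollectorType { kCollectorTypeSS, kCollectorTypeMS, kCollectorTypeCMS };

  // With the semispace (SS) collector, -XX:UseTLAB switches the default allocator from the
  // plain bump pointer allocator to the thread local one; with any other collector the flag
  // is ignored, matching the "no-op" behaviour described above.
  AllocatorType ChooseAllocator(bool use_tlab, CollectorType collector) {
    if (collector != kCollectorTypeSS) {
      return kAllocatorTypeRosAlloc;  // Placeholder for whatever that collector normally uses.
    }
    return use_tlab ? kAllocatorTypeTLAB : kAllocatorTypeBumpPointer;
  }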

TODO: Smarter block accounting, so that we don't have to read objects
one by one until we either hit the end of the block or see
GetClass() == null, which signifies that the block isn't 100% full.
This would provide a slight speedup to BumpPointerSpace::Walk.
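
The traversal the TODO refers to looks roughly like the per-block loop
below; Object here is a stand-in for mirror::Object (GetClass and SizeOf
are real methods on it, the rest of the scaffolding is assumed):

  #include <cstddef>
  #include <cstdint>

  // Stand-in for mirror::Object, just enough to express the loop.
  struct Object {
    Object* GetClass() const;  // Null in the never-filled tail of a partly full block.
    size_t SizeOf() const;     // Object size in bytes, already rounded to the space's alignment.
  };

  // Visit objects until the end of the block or the first null class pointer.
  template <typename Visitor>
  void WalkBlock(uint8_t* pos, uint8_t* block_end, const Visitor& visitor) {
    while (pos < block_end) {
      Object* obj = reinterpret_cast<Object*>(pos);
      if (obj->GetClass() == nullptr) {
        break;  // The rest of the block was never allocated.
      }
      visitor(obj);
      pos += obj->SizeOf();
    }
  }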

Timings: -XX:HeapMinFree=4m -XX:HeapMaxFree=8m -Xmx48m
ritzperf memalloc:
Dalvik -Xgc:concurrent: 11678
Dalvik -Xgc:noconcurrent: 6697
-Xgc:MS: 5978
-Xgc:SS: 4271
-Xgc:CMS: 4150
-Xgc:SS -XX:UseTLAB: 3255

Bug: 9986565
Bug: 12042213

Change-Id: Ib7e1d4b199a8199f3b1de94b0a7b6e1730689cad
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 99f084a..9fb5760 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -32,7 +32,7 @@
 namespace art {
 namespace gc {
 
-template <bool kInstrumented, typename PreFenceVisitor>
+template <bool kInstrumented, bool kCheckLargeObject, typename PreFenceVisitor>
 inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self, mirror::Class* klass,
                                                       size_t byte_count, AllocatorType allocator,
                                                       const PreFenceVisitor& pre_fence_visitor) {
@@ -43,13 +43,13 @@
   self->AssertThreadSuspensionIsAllowable();
  // Need to check that we aren't the large object allocator since the large object allocation
  // code path calls this function. If we didn't check we would have an infinite loop.
-  if (allocator != kAllocatorTypeLOS && UNLIKELY(ShouldAllocLargeObject(klass, byte_count))) {
+  if (kCheckLargeObject && UNLIKELY(ShouldAllocLargeObject(klass, byte_count))) {
     return AllocLargeObject<kInstrumented, PreFenceVisitor>(self, klass, byte_count,
                                                             pre_fence_visitor);
   }
   mirror::Object* obj;
-  size_t bytes_allocated;
   AllocationTimer alloc_timer(this, &obj);
+  size_t bytes_allocated;
   obj = TryToAllocate<kInstrumented, false>(self, allocator, byte_count, &bytes_allocated);
   if (UNLIKELY(obj == nullptr)) {
     obj = AllocateInternalWithGc(self, allocator, byte_count, &bytes_allocated, &klass);
@@ -89,7 +89,11 @@
   } else {
     DCHECK(!Dbg::IsAllocTrackingEnabled());
   }
-  if (concurrent_gc_) {
+  // concurrent_gc_ isn't known at compile time, so for the BumpPointer and TLAB allocators we
+  // avoid checking it at all: AllocatorMayHaveConcurrentGC(allocator) is false for them, which
+  // lets the entire if statement be optimized out. For the other allocators it is also a
+  // constant, since the allocator_type should be constant propagated.
+  if (AllocatorMayHaveConcurrentGC(allocator) && concurrent_gc_) {
     CheckConcurrentGC(self, new_num_bytes_allocated, obj);
   }
   if (kIsDebugBuild) {
@@ -105,15 +109,15 @@
 inline mirror::Object* Heap::AllocLargeObject(Thread* self, mirror::Class* klass,
                                               size_t byte_count,
                                               const PreFenceVisitor& pre_fence_visitor) {
-  return AllocObjectWithAllocator<kInstrumented, PreFenceVisitor>(self, klass, byte_count,
-                                                                  kAllocatorTypeLOS,
-                                                                  pre_fence_visitor);
+  return AllocObjectWithAllocator<kInstrumented, false, PreFenceVisitor>(self, klass, byte_count,
+                                                                         kAllocatorTypeLOS,
+                                                                         pre_fence_visitor);
 }
 
 template <const bool kInstrumented, const bool kGrow>
 inline mirror::Object* Heap::TryToAllocate(Thread* self, AllocatorType allocator_type,
                                            size_t alloc_size, size_t* bytes_allocated) {
-  if (UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(alloc_size))) {
+  if (UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type, alloc_size))) {
     return nullptr;
   }
   if (kInstrumented) {
@@ -153,6 +157,21 @@
       DCHECK(ret == nullptr || large_object_space_->Contains(ret));
       break;
     }
+    case kAllocatorTypeTLAB: {
+      alloc_size = RoundUp(alloc_size, space::BumpPointerSpace::kAlignment);
+      if (UNLIKELY(self->TLABSize() < alloc_size)) {
+        // Try allocating a new thread local buffer; if the allocation fails, the space must be
+        // full so return nullptr.
+        if (!bump_pointer_space_->AllocNewTLAB(self, alloc_size + kDefaultTLABSize)) {
+          return nullptr;
+        }
+      }
+      // The allocation can't fail.
+      ret = self->AllocTLAB(alloc_size);
+      DCHECK(ret != nullptr);
+      *bytes_allocated = alloc_size;
+      break;
+    }
     default: {
       LOG(FATAL) << "Invalid allocator type";
       ret = nullptr;
@@ -194,14 +213,14 @@
   return byte_count >= kLargeObjectThreshold && have_zygote_space_ && c->IsPrimitiveArray();
 }
 
-template <const bool kGrow>
-inline bool Heap::IsOutOfMemoryOnAllocation(size_t alloc_size) {
+template <bool kGrow>
+inline bool Heap::IsOutOfMemoryOnAllocation(AllocatorType allocator_type, size_t alloc_size) {
   size_t new_footprint = num_bytes_allocated_ + alloc_size;
   if (UNLIKELY(new_footprint > max_allowed_footprint_)) {
     if (UNLIKELY(new_footprint > growth_limit_)) {
       return true;
     }
-    if (!concurrent_gc_) {
+    if (!AllocatorMayHaveConcurrentGC(allocator_type) || !concurrent_gc_) {
       if (!kGrow) {
         return true;
       }