Improve allocation speed.

- Improves the Ritz MemAllocTest benchmark result by ~500 ms (or ~5%) on Nexus 4.
- Move the memset() call that zeroes the allocated memory out of the lock region (sketched below).
- De-virtualize/inline the allocation call chains into Heap::AllocObject() (see the sketch after the diff).
- Replace Heap::measure_allocation_time_ with the static const kMeasureAllocationTime.
- Guard the VerifyObject() call with a kDesiredHeapVerification > kNoHeapVerification check.
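
The memset() change itself lands in DlMallocSpace and is not shown in the
heap.cc diff below. A minimal sketch of the idea, assuming a locked dlmalloc
helper along the lines of AllocWithoutGrowthLocked():

  // Illustrative sketch only, not the exact DlMallocSpace code.
  mirror::Object* DlMallocSpace::Alloc(Thread* self, size_t num_bytes,
                                       size_t* bytes_allocated) {
    mirror::Object* result;
    {
      MutexLock mu(self, lock_);  // Critical section covers dlmalloc only.
      result = AllocWithoutGrowthLocked(num_bytes, bytes_allocated);
    }
    if (result != NULL) {
      // Zeroing happens after the lock is released, so a large allocation
      // no longer stalls every other allocating thread.
      memset(result, 0, num_bytes);
    }
    return result;
  }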

Bug: 9986565
Change-Id: Ib70b6d051a80ec329788b30256565561f031da2a
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index d27290b..9a9e00c 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -21,6 +21,7 @@
 
 #include <limits>
 #include <vector>
+#include <valgrind.h>
 
 #include "base/stl_util.h"
 #include "common_throws.h"
@@ -34,6 +35,7 @@
 #include "gc/collector/mark_sweep-inl.h"
 #include "gc/collector/partial_mark_sweep.h"
 #include "gc/collector/sticky_mark_sweep.h"
+#include "gc/space/dlmalloc_space-inl.h"
 #include "gc/space/image_space.h"
 #include "gc/space/large_object_space.h"
 #include "gc/space/space-inl.h"
@@ -66,6 +68,8 @@
 // Minimum amount of remaining bytes before a concurrent GC is triggered.
 static const size_t kMinConcurrentRemainingBytes = 128 * KB;
 const double Heap::kDefaultTargetUtilization = 0.5;
+// If true, measure the total allocation time.
+static const bool kMeasureAllocationTime = false;
 
 Heap::Heap(size_t initial_size, size_t growth_limit, size_t min_free, size_t max_free,
            double target_utilization, size_t capacity,
@@ -118,9 +122,9 @@
       max_free_(max_free),
       target_utilization_(target_utilization),
       total_wait_time_(0),
-      measure_allocation_time_(false),
       total_allocation_time_(0),
-      verify_object_mode_(kHeapVerificationNotPermitted) {
+      verify_object_mode_(kHeapVerificationNotPermitted),
+      running_on_valgrind_(RUNNING_ON_VALGRIND) {
   if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) {
     LOG(INFO) << "Heap() entering";
   }
@@ -347,7 +351,7 @@
   }
   os << "Total number of allocations: " << total_objects_allocated << "\n";
   os << "Total bytes allocated " << PrettySize(total_bytes_allocated) << "\n";
-  if (measure_allocation_time_) {
+  if (kMeasureAllocationTime) {
     os << "Total time spent allocating: " << PrettyDuration(allocation_time) << "\n";
     os << "Mean allocation time: " << PrettyDuration(allocation_time / total_objects_allocated)
        << "\n";
@@ -445,8 +449,9 @@
 
   mirror::Object* obj = NULL;
   size_t size = 0;
+  size_t bytes_allocated = 0;
   uint64_t allocation_start = 0;
-  if (UNLIKELY(measure_allocation_time_)) {
+  if (UNLIKELY(kMeasureAllocationTime)) {
     allocation_start = NanoTime() / kTimeAdjust;
   }
 
@@ -458,18 +463,20 @@
       byte_count >= large_object_threshold_ && have_zygote_space_ && c->IsPrimitiveArray();
   if (UNLIKELY(large_object_allocation)) {
     size = RoundUp(byte_count, kPageSize);
-    obj = Allocate(self, large_object_space_, size);
+    obj = Allocate(self, large_object_space_, size, &bytes_allocated);
+    DCHECK(obj == NULL || size == bytes_allocated);
     // Make sure that our large object didn't get placed anywhere within the space interval or else
     // it breaks the immune range.
     DCHECK(obj == NULL ||
            reinterpret_cast<byte*>(obj) < continuous_spaces_.front()->Begin() ||
            reinterpret_cast<byte*>(obj) >= continuous_spaces_.back()->End());
   } else {
-    obj = Allocate(self, alloc_space_, byte_count);
+    obj = Allocate(self, alloc_space_, byte_count, &bytes_allocated);
+    DCHECK(obj == NULL || byte_count <= bytes_allocated);
+    size = bytes_allocated;
 
     // Ensure that we did not allocate into a zygote space.
     DCHECK(obj == NULL || !have_zygote_space_ || !FindSpaceFromObject(obj, false)->IsZygoteSpace());
-    size = alloc_space_->AllocationSize(obj);
   }
 
   if (LIKELY(obj != NULL)) {
@@ -487,9 +494,11 @@
       SirtRef<mirror::Object> ref(self, obj);
       RequestConcurrentGC(self);
     }
-    VerifyObject(obj);
+    if (kDesiredHeapVerification > kNoHeapVerification) {
+      VerifyObject(obj);
+    }
 
-    if (UNLIKELY(measure_allocation_time_)) {
+    if (UNLIKELY(kMeasureAllocationTime)) {
       total_allocation_time_.fetch_add(NanoTime() / kTimeAdjust - allocation_start);
     }
 
@@ -645,7 +654,7 @@
   GetLiveBitmap()->Walk(Heap::VerificationCallback, this);
 }
 
-void Heap::RecordAllocation(size_t size, mirror::Object* obj) {
+inline void Heap::RecordAllocation(size_t size, mirror::Object* obj) {
   DCHECK(obj != NULL);
   DCHECK_GT(size, 0u);
   num_bytes_allocated_.fetch_add(size);
@@ -684,37 +693,55 @@
   }
 }
 
-mirror::Object* Heap::TryToAllocate(Thread* self, space::AllocSpace* space, size_t alloc_size,
-                                    bool grow) {
-  // Should we try to use a CAS here and fix up num_bytes_allocated_ later with AllocationSize?
-  if (num_bytes_allocated_ + alloc_size > max_allowed_footprint_) {
-    // max_allowed_footprint_ <= growth_limit_ so it is safe to check in here.
-    if (num_bytes_allocated_ + alloc_size > growth_limit_) {
-      // Completely out of memory.
-      return NULL;
-    }
-  }
-
-  return space->Alloc(self, alloc_size);
+inline bool Heap::IsOutOfMemoryOnAllocation(size_t alloc_size) {
+  return num_bytes_allocated_ + alloc_size > growth_limit_;
 }
 
-mirror::Object* Heap::Allocate(Thread* self, space::AllocSpace* space, size_t alloc_size) {
+inline mirror::Object* Heap::TryToAllocate(Thread* self, space::AllocSpace* space, size_t alloc_size,
+                                           bool grow, size_t* bytes_allocated) {
+  if (IsOutOfMemoryOnAllocation(alloc_size)) {
+    return NULL;
+  }
+  return space->Alloc(self, alloc_size, bytes_allocated);
+}
+
+// DlMallocSpace-specific version.
+inline mirror::Object* Heap::TryToAllocate(Thread* self, space::DlMallocSpace* space, size_t alloc_size,
+                                           bool grow, size_t* bytes_allocated) {
+  if (IsOutOfMemoryOnAllocation(alloc_size)) {
+    return NULL;
+  }
+  if (!running_on_valgrind_) {
+    return space->AllocNonvirtual(self, alloc_size, bytes_allocated);
+  } else {
+    return space->Alloc(self, alloc_size, bytes_allocated);
+  }
+}
+
+template <class T>
+inline mirror::Object* Heap::Allocate(Thread* self, T* space, size_t alloc_size, size_t* bytes_allocated) {
   // Since allocation can cause a GC which will need to SuspendAll, make sure all allocations are
   // done in the runnable state where suspension is expected.
   DCHECK_EQ(self->GetState(), kRunnable);
   self->AssertThreadSuspensionIsAllowable();
 
-  mirror::Object* ptr = TryToAllocate(self, space, alloc_size, false);
+  mirror::Object* ptr = TryToAllocate(self, space, alloc_size, false, bytes_allocated);
   if (ptr != NULL) {
     return ptr;
   }
+  return AllocateInternalWithGc(self, space, alloc_size, bytes_allocated);
+}
+
+mirror::Object* Heap::AllocateInternalWithGc(Thread* self, space::AllocSpace* space, size_t alloc_size,
+                                             size_t* bytes_allocated) {
+  mirror::Object* ptr;
 
   // The allocation failed. If the GC is running, block until it completes, and then retry the
   // allocation.
   collector::GcType last_gc = WaitForConcurrentGcToComplete(self);
   if (last_gc != collector::kGcTypeNone) {
     // A GC was in progress and we blocked, retry allocation now that memory has been freed.
-    ptr = TryToAllocate(self, space, alloc_size, false);
+    ptr = TryToAllocate(self, space, alloc_size, false, bytes_allocated);
     if (ptr != NULL) {
       return ptr;
     }
@@ -749,7 +776,7 @@
       i = static_cast<size_t>(gc_type_ran);
 
       // Did we free sufficient memory for the allocation to succeed?
-      ptr = TryToAllocate(self, space, alloc_size, false);
+      ptr = TryToAllocate(self, space, alloc_size, false, bytes_allocated);
       if (ptr != NULL) {
         return ptr;
       }
@@ -758,7 +785,7 @@
 
   // Allocations have failed after GCs;  this is an exceptional state.
   // Try harder, growing the heap if necessary.
-  ptr = TryToAllocate(self, space, alloc_size, true);
+  ptr = TryToAllocate(self, space, alloc_size, true, bytes_allocated);
   if (ptr != NULL) {
     return ptr;
   }
@@ -773,7 +800,7 @@
 
   // We don't need a WaitForConcurrentGcToComplete here either.
   CollectGarbageInternal(collector::kGcTypeFull, kGcCauseForAlloc, true);
-  return TryToAllocate(self, space, alloc_size, true);
+  return TryToAllocate(self, space, alloc_size, true, bytes_allocated);
 }
 
 void Heap::SetTargetHeapUtilization(float target) {
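
Note on the de-virtualization: making the space type a template parameter of
Heap::Allocate() lets overload resolution pick the DlMallocSpace-specific
TryToAllocate() at compile time, so the hot path reaches AllocNonvirtual()
without a vtable dispatch. A standalone sketch of the pattern (illustrative
names only, not the ART API):

  #include <cstddef>
  #include <cstdlib>

  struct AllocSpace {
    virtual ~AllocSpace() {}
    virtual void* Alloc(size_t num_bytes) = 0;  // Opaque to inlining.
  };

  struct DlSpace : public AllocSpace {
    virtual void* Alloc(size_t num_bytes) { return AllocNonvirtual(num_bytes); }
    void* AllocNonvirtual(size_t num_bytes) { return malloc(num_bytes); }
  };

  // Generic version: must dispatch through the vtable.
  inline void* TryToAllocate(AllocSpace* space, size_t num_bytes) {
    return space->Alloc(num_bytes);
  }

  // DlSpace-specific version: direct, inlinable call.
  inline void* TryToAllocate(DlSpace* space, size_t num_bytes) {
    return space->AllocNonvirtual(num_bytes);
  }

  template <class T>
  inline void* Allocate(T* space, size_t num_bytes) {
    // T is known statically, so the matching overload is selected at compile
    // time; no virtual call when T is DlSpace.
    return TryToAllocate(space, num_bytes);
  }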