Remove statistics lock to improve performance.

Removes the statistics lock to improve performance. We now use Android atomic operations to avoid write race conditions on the allocation counters.

A few other fixes are bundled with this change.

DeltaBlue time is now down to ~17s.

Change-Id: Ib80ea66c5362903bf637a32eeb8140941457fb7f
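
A minimal standalone sketch of the counter pattern this change adopts.
std::atomic stands in here for cutils' android_atomic_add, the counter
names mirror the heap fields touched in the diff below, and main() is
illustrative only:

  #include <atomic>
  #include <cstdint>
  #include <cstdio>

  // Before this change every update took statistics_lock_; now the
  // counters are plain 32-bit words updated with atomic adds.
  static std::atomic<int32_t> num_bytes_allocated_(0);
  static std::atomic<int32_t> num_objects_allocated_(0);

  static void RecordAllocation(int32_t size) {
    // Equivalent to android_atomic_add(size, &num_bytes_allocated_).
    num_bytes_allocated_.fetch_add(size, std::memory_order_relaxed);
    num_objects_allocated_.fetch_add(1, std::memory_order_relaxed);
  }

  static void RecordFree(int32_t freed_objects, int32_t freed_bytes) {
    // Decrement by adding a negated value, as the diff does with
    // -static_cast<int32_t>(freed_bytes). The diff guards its
    // reinterpret_casts with COMPILE_ASSERT(sizeof(size_t) ==
    // sizeof(int32_t), ...); static_assert is the modern equivalent,
    // true on the 32-bit targets this code was written for.
    num_objects_allocated_.fetch_add(-freed_objects, std::memory_order_relaxed);
    num_bytes_allocated_.fetch_add(-freed_bytes, std::memory_order_relaxed);
  }

  int main() {
    RecordAllocation(64);
    RecordAllocation(32);
    RecordFree(/*freed_objects=*/1, /*freed_bytes=*/32);
    // Prints "64 bytes in 1 objects".
    std::printf("%d bytes in %d objects\n",
                num_bytes_allocated_.load(), num_objects_allocated_.load());
    return 0;
  }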
diff --git a/src/heap.cc b/src/heap.cc
index 2bf1372..3d84451 100644
--- a/src/heap.cc
+++ b/src/heap.cc
@@ -22,6 +22,7 @@
 #include <limits>
 #include <vector>
 
+#include "atomic.h"
 #include "card_table.h"
 #include "debugger.h"
 #include "heap_bitmap.h"
@@ -258,7 +259,6 @@
   // It's still too early to take a lock because there are no threads yet,
   // but we can create the heap lock now. We don't create it earlier to
   // make it clear that you can't use locks during heap initialization.
-  statistics_lock_ = new Mutex("statistics lock");
   gc_complete_lock_ =  new Mutex("GC complete lock");
   gc_complete_cond_.reset(new ConditionVariable("GC complete condition variable"));
 
@@ -318,7 +318,6 @@
   // all daemon threads are suspended, and we also know that the threads list have been deleted, so
   // those threads can't resume. We're the only running thread, and we can do whatever we like...
   STLDeleteElements(&spaces_);
-  delete statistics_lock_;
   delete gc_complete_lock_;
 
 }
@@ -377,11 +376,7 @@
     if (Dbg::IsAllocTrackingEnabled()) {
       Dbg::RecordAllocation(c, byte_count);
     }
-    bool request_concurrent_gc;
-    {
-      MutexLock mu(*statistics_lock_);
-      request_concurrent_gc = num_bytes_allocated_ >= concurrent_start_bytes_;
-    }
+    const bool request_concurrent_gc = num_bytes_allocated_ >= concurrent_start_bytes_;
     if (request_concurrent_gc) {
       // The SirtRef is necessary since the calls in RequestConcurrentGC are a safepoint.
       SirtRef<Object> ref(obj);
@@ -498,11 +493,12 @@
 
 void Heap::RecordAllocation(AllocSpace* space, const Object* obj) {
   {
-    MutexLock mu(*statistics_lock_);
     size_t size = space->AllocationSize(obj);
     DCHECK_GT(size, 0u);
-    num_bytes_allocated_ += size;
-    num_objects_allocated_ += 1;
+    COMPILE_ASSERT(sizeof(size_t) == sizeof(int32_t),
+                   int32_t_must_be_same_size_as_size_t_for_used_atomic_operations);
+    android_atomic_add(size, reinterpret_cast<volatile int32_t*>(&num_bytes_allocated_));
+    android_atomic_add(1, reinterpret_cast<volatile int32_t*>(&num_objects_allocated_));
 
     if (Runtime::Current()->HasStatsEnabled()) {
       RuntimeStats* global_stats = Runtime::Current()->GetStats();
@@ -525,13 +521,15 @@
 }
 
 void Heap::RecordFree(size_t freed_objects, size_t freed_bytes) {
-  MutexLock mu(*statistics_lock_);
-
+  COMPILE_ASSERT(sizeof(size_t) == sizeof(int32_t),
+                 int32_t_must_be_same_size_as_size_t_for_used_atomic_operations);
   DCHECK_LE(freed_objects, num_objects_allocated_);
-  num_objects_allocated_ -= freed_objects;
+  android_atomic_add(-static_cast<int32_t>(freed_objects),
+                     reinterpret_cast<volatile int32_t*>(&num_objects_allocated_));
 
   DCHECK_LE(freed_bytes, num_bytes_allocated_);
-  num_bytes_allocated_ -= freed_bytes;
+  android_atomic_add(-static_cast<int32_t>(freed_bytes),
+                     reinterpret_cast<volatile int32_t*>(&num_bytes_allocated_));
 
   if (Runtime::Current()->HasStatsEnabled()) {
     RuntimeStats* global_stats = Runtime::Current()->GetStats();
@@ -672,7 +670,6 @@
 }
 
 int64_t Heap::GetFreeMemory() {
-  MutexLock mu(*statistics_lock_);
   return GetMaxMemory() - num_bytes_allocated_;
 }
 
@@ -832,17 +829,11 @@
     sticky_gc_count_ = 0;
   }
 
-  uint64_t start_time = NanoTime();
-  if (true || concurrent_gc_) {
+  if (concurrent_gc_) {
     CollectGarbageConcurrentMarkSweepPlan(gc_type, clear_soft_references);
   } else {
     CollectGarbageMarkSweepPlan(gc_type, clear_soft_references);
   }
-  const uint64_t gc_duration = NanoTime() - start_time;
-  // For particularly slow GCs lets print out another warning.
-  if (gc_duration > MsToNs(100)) {
-    LOG(WARNING) << "Slow GC took " << PrettyDuration(gc_duration);
-  }
 
   gc_complete_lock_->AssertNotHeld();
   MutexLock mu(*gc_complete_lock_);
@@ -1025,10 +1016,12 @@
   // If the GC was slow, then print timings in the log.
   uint64_t duration = (NanoTime() - start_time) / 1000 * 1000;
   if (duration > MsToNs(50)) {
-    MutexLock mu(*statistics_lock_);
+    const size_t percent_free = GetPercentFree();
+    const size_t num_bytes_allocated = num_bytes_allocated_;
+    const size_t total_memory = GetTotalMemory();
     LOG(INFO) << (gc_type == GC_PARTIAL ? "Partial " : (gc_type == GC_STICKY ? "Sticky " : ""))
-              << "GC freed " << PrettySize(bytes_freed) << ", " << GetPercentFree() << "% free, "
-              << PrettySize(num_bytes_allocated_) << "/" << PrettySize(GetTotalMemory()) << ", "
+              << "GC freed " << PrettySize(bytes_freed) << ", " << percent_free << "% free, "
+              << PrettySize(num_bytes_allocated) << "/" << PrettySize(total_memory) << ", "
               << "paused " << PrettyDuration(duration);
   }
 
@@ -1250,13 +1243,16 @@
   // If the GC was slow, then print timings in the log.
   uint64_t pause_roots = (root_end - root_begin) / 1000 * 1000;
   uint64_t pause_dirty = (dirty_end - dirty_begin) / 1000 * 1000;
+  uint64_t duration = (NanoTime() - root_begin) / 1000 * 1000;
   if (pause_roots > MsToNs(5) || pause_dirty > MsToNs(5)) {
-    MutexLock mu(*statistics_lock_);
+    const size_t percent_free = GetPercentFree();
+    const size_t num_bytes_allocated = num_bytes_allocated_;
+    const size_t total_memory = GetTotalMemory();
     LOG(INFO) << (gc_type == GC_PARTIAL ? "Partial " : (gc_type == GC_STICKY ? "Sticky " : ""))
-              << "Concurrent GC freed " << PrettySize(bytes_freed) << ", " << GetPercentFree()
-              << "% free, " << PrettySize(num_bytes_allocated_) << "/"
-              << PrettySize(GetTotalMemory()) << ", " << "paused " << PrettyDuration(pause_roots)
-              << "+" << PrettyDuration(pause_dirty);
+              << "Concurrent GC freed " << PrettySize(bytes_freed) << ", " << percent_free
+              << "% free, " << PrettySize(num_bytes_allocated) << "/"
+              << PrettySize(total_memory) << ", " << "paused " << PrettyDuration(pause_roots)
+              << "+" << PrettyDuration(pause_dirty) << " total " << PrettyDuration(duration);
   }
 
   if (VLOG_IS_ON(heap)) {
@@ -1296,7 +1292,6 @@
 }
 
 void Heap::DumpForSigQuit(std::ostream& os) {
-  MutexLock mu(*statistics_lock_);
   os << "Heap: " << GetPercentFree() << "% free, "
      << PrettySize(num_bytes_allocated_) << "/" << PrettySize(GetTotalMemory())
      << "; " << num_objects_allocated_ << " objects\n";
@@ -1329,7 +1324,6 @@
   size_t target_size;
   bool use_footprint_limit = false;
   {
-    MutexLock mu(*statistics_lock_);
     // We know what our utilization is at this moment.
     // This doesn't actually resize any memory. It just lets the heap grow more when necessary.
     target_size = num_bytes_allocated_ / Heap::GetTargetHeapUtilization();
@@ -1352,7 +1346,6 @@
 
   if (use_footprint_limit) {
     size_t foot_print_limit = alloc_space_->GetFootprintLimit();
-    MutexLock mu(*statistics_lock_);
     concurrent_start_bytes_ = foot_print_limit - concurrent_start_size_;
   }
   SetIdealFootprint(target_size);
@@ -1447,22 +1440,18 @@
 }
 
 size_t Heap::GetBytesAllocated() const {
-  MutexLock mu(*statistics_lock_);
   return num_bytes_allocated_;
 }
 
 size_t Heap::GetObjectsAllocated() const {
-  MutexLock mu(*statistics_lock_);
   return num_objects_allocated_;
 }
 
 size_t Heap::GetConcurrentStartSize() const {
-  MutexLock mu(*statistics_lock_);
   return concurrent_start_size_;
 }
 
 size_t Heap::GetConcurrentMinFree() const {
-  MutexLock mu(*statistics_lock_);
   return concurrent_min_free_;
 }
 
@@ -1530,7 +1519,6 @@
   // not how much use we're making of those pages.
   uint64_t ms_time = NsToMs(NanoTime());
   {
-    MutexLock mu(*statistics_lock_);
     float utilization = static_cast<float>(num_bytes_allocated_) / alloc_space_->Size();
     if ((utilization > 0.75f) || ((ms_time - last_trim_time_) < 2 * 1000)) {
       // Don't bother trimming the heap if it's more than 75% utilized, or if a
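
The slow-GC logging hunks above replace the lock with a snapshot: the
racy counter is read exactly once into a local, so every part of the
log line reflects the same value even though no lock is held. A minimal
sketch of that pattern, again with std::atomic standing in for the raw
volatile word and GetTotalMemory() as a placeholder:

  #include <atomic>
  #include <cstdint>
  #include <cstdio>

  static std::atomic<int32_t> num_bytes_allocated_(1 << 20);

  static int32_t GetTotalMemory() { return 4 << 20; }  // placeholder

  static void LogGcStats() {
    // One load; reading num_bytes_allocated_ twice could yield two
    // different values mid-message.
    const int32_t num_bytes_allocated = num_bytes_allocated_.load();
    const int32_t total_memory = GetTotalMemory();
    std::printf("GC: %d/%d bytes, %d%% free\n",
                num_bytes_allocated, total_memory,
                100 - (100 * num_bytes_allocated) / total_memory);
  }

  int main() {
    LogGcStats();  // prints "GC: 1048576/4194304 bytes, 75% free"
    return 0;
  }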