Allow mixing of thread offsets between 32-bit and 64-bit architectures.

Begin a fuller implementation of x86-64 REX prefixes.
Does not yet implement 64-bit thread offset support for the JNI compiler.
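
To illustrate the offset translation (not part of the patch): with a 32-bit
build describing a 64-bit target, ThreadOffsetFromTlsPtr() computes
scale = 8 / 4 = 2 and shrink = 1, so a field at host offset 8 within
tls_ptr_sized_values maps to OFFSETOF_MEMBER(Thread, tlsPtr_) + 16 in the
64-bit Thread layout. Compilers select the target layout through the new
template argument, roughly:

  ThreadOffset<8> self_offset_64 = Thread::SelfOffset<8>();  // offset for a 64-bit target
  ThreadOffset<4> self_offset_32 = Thread::SelfOffset<4>();  // offset for a 32-bit target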

Change-Id: If9af2f08a1833c21ddb4b4077f9b03add1a05147
diff --git a/runtime/thread.h b/runtime/thread.h
index 63d22c5..59fe724 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -92,7 +92,7 @@
   kCheckpointRequest = 2  // Request that the thread do some checkpoint work and then continue.
 };
 
-class PACKED(4) Thread {
+class Thread {
  public:
   // Space to throw a StackOverflowError in.
   // TODO: shrink reserved space, in particular for 64bit.
@@ -145,7 +145,8 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Translates 172 to pAllocArrayFromCode and so on.
-  static void DumpThreadOffset(std::ostream& os, uint32_t offset, size_t size_of_pointers);
+  template<size_t size_of_pointers>
+  static void DumpThreadOffset(std::ostream& os, uint32_t offset);
 
   // Dumps a one-line summary of thread state (used for operator<<).
   void ShortDump(std::ostream& os) const;
@@ -162,32 +163,24 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   ThreadState GetState() const {
-    DCHECK(state_and_flags_.as_struct.state >= kTerminated && state_and_flags_.as_struct.state <= kSuspended);
-    return static_cast<ThreadState>(state_and_flags_.as_struct.state);
-  }
-
-  // This function can be used to make sure a thread's state is valid.
-  void CheckState(int id) const {
-    if (state_and_flags_.as_struct.state >= kTerminated && state_and_flags_.as_struct.state <= kSuspended) {
-      return;
-    }
-    LOG(INFO) << "Thread " << this << " state is invalid: " << state_and_flags_.as_struct.state << " id=" << id;
-    CHECK(false);
+    DCHECK_GE(tls32_.state_and_flags.as_struct.state, kTerminated);
+    DCHECK_LE(tls32_.state_and_flags.as_struct.state, kSuspended);
+    return static_cast<ThreadState>(tls32_.state_and_flags.as_struct.state);
   }
 
   ThreadState SetState(ThreadState new_state);
 
   int GetSuspendCount() const EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_suspend_count_lock_) {
-    return suspend_count_;
+    return tls32_.suspend_count;
   }
 
   int GetDebugSuspendCount() const EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_suspend_count_lock_) {
-    return debug_suspend_count_;
+    return tls32_.debug_suspend_count;
   }
 
   bool IsSuspended() const {
     union StateAndFlags state_and_flags;
-    state_and_flags.as_int = state_and_flags_.as_int;
+    state_and_flags.as_int = tls32_.state_and_flags.as_int;
     return state_and_flags.as_struct.state != kRunnable &&
         (state_and_flags.as_struct.flags & kSuspendRequest) != 0;
   }
@@ -221,9 +214,9 @@
   const char* StartAssertNoThreadSuspension(const char* cause) {
     if (kIsDebugBuild) {
       CHECK(cause != NULL);
-      const char* previous_cause = last_no_thread_suspension_cause_;
-      no_thread_suspension_++;
-      last_no_thread_suspension_cause_ = cause;
+      const char* previous_cause = tlsPtr_.last_no_thread_suspension_cause;
+      tls32_.no_thread_suspension++;
+      tlsPtr_.last_no_thread_suspension_cause = cause;
       return previous_cause;
     } else {
       return nullptr;
@@ -233,20 +226,20 @@
   // End region where no thread suspension is expected.
   void EndAssertNoThreadSuspension(const char* old_cause) {
     if (kIsDebugBuild) {
-      CHECK(old_cause != NULL || no_thread_suspension_ == 1);
-      CHECK_GT(no_thread_suspension_, 0U);
-      no_thread_suspension_--;
-      last_no_thread_suspension_cause_ = old_cause;
+      CHECK(old_cause != nullptr || tls32_.no_thread_suspension == 1);
+      CHECK_GT(tls32_.no_thread_suspension, 0U);
+      tls32_.no_thread_suspension--;
+      tlsPtr_.last_no_thread_suspension_cause = old_cause;
     }
   }
 
   void AssertThreadSuspensionIsAllowable(bool check_locks = true) const;
 
   bool IsDaemon() const {
-    return daemon_;
+    return tls32_.daemon;
   }
 
-  bool HoldsLock(mirror::Object*) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  bool HoldsLock(mirror::Object*) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   /*
    * Changes the priority of this thread to match that of the java.lang.Thread object.
@@ -265,11 +258,11 @@
   static int GetNativePriority();
 
   uint32_t GetThreadId() const {
-    return thin_lock_thread_id_;
+    return tls32_.thin_lock_thread_id;
   }
 
   pid_t GetTid() const {
-    return tid_;
+    return tls32_.tid;
   }
 
   // Returns the java.lang.Thread's name, or NULL if this Thread* doesn't have a peer.
@@ -287,30 +280,30 @@
   uint64_t GetCpuMicroTime() const;
 
   mirror::Object* GetPeer() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    CHECK(jpeer_ == NULL);
-    return opeer_;
+    CHECK(tlsPtr_.jpeer == nullptr);
+    return tlsPtr_.opeer;
   }
 
   bool HasPeer() const {
-    return jpeer_ != NULL || opeer_ != NULL;
+    return tlsPtr_.jpeer != nullptr || tlsPtr_.opeer != nullptr;
   }
 
   RuntimeStats* GetStats() {
-    return &stats_;
+    return &tls64_.stats;
   }
 
   bool IsStillStarting() const;
 
   bool IsExceptionPending() const {
-    return exception_ != NULL;
+    return tlsPtr_.exception != nullptr;
   }
 
   mirror::Throwable* GetException(ThrowLocation* throw_location) const
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    if (throw_location != NULL) {
-      *throw_location = throw_location_;
+    if (throw_location != nullptr) {
+      *throw_location = tlsPtr_.throw_location;
     }
-    return exception_;
+    return tlsPtr_.exception;
   }
 
   void AssertNoPendingException() const;
@@ -320,13 +313,13 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     CHECK(new_exception != NULL);
     // TODO: DCHECK(!IsExceptionPending());
-    exception_ = new_exception;
-    throw_location_ = throw_location;
+    tlsPtr_.exception = new_exception;
+    tlsPtr_.throw_location = throw_location;
   }
 
   void ClearException() {
-    exception_ = NULL;
-    throw_location_.Clear();
+    tlsPtr_.exception = nullptr;
+    tlsPtr_.throw_location.Clear();
   }
 
   // Find catch block and perform long jump to appropriate exception handle
@@ -334,8 +327,8 @@
 
   Context* GetLongJumpContext();
   void ReleaseLongJumpContext(Context* context) {
-    DCHECK(long_jump_context_ == NULL);
-    long_jump_context_ = context;
+    DCHECK(tlsPtr_.long_jump_context == nullptr);
+    tlsPtr_.long_jump_context = context;
   }
 
   mirror::ArtMethod* GetCurrentMethod(uint32_t* dex_pc) const
@@ -344,16 +337,17 @@
   ThrowLocation GetCurrentLocationForThrow() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void SetTopOfStack(mirror::ArtMethod** top_method, uintptr_t pc) {
-    managed_stack_.SetTopQuickFrame(top_method);
-    managed_stack_.SetTopQuickFramePc(pc);
+    tlsPtr_.managed_stack.SetTopQuickFrame(top_method);
+    tlsPtr_.managed_stack.SetTopQuickFramePc(pc);
   }
 
   void SetTopOfShadowStack(ShadowFrame* top) {
-    managed_stack_.SetTopShadowFrame(top);
+    tlsPtr_.managed_stack.SetTopShadowFrame(top);
   }
 
   bool HasManagedStack() const {
-    return managed_stack_.GetTopQuickFrame() != NULL || managed_stack_.GetTopShadowFrame() != NULL;
+    return (tlsPtr_.managed_stack.GetTopQuickFrame() != nullptr) ||
+        (tlsPtr_.managed_stack.GetTopShadowFrame() != nullptr);
   }
 
   // If 'msg' is NULL, no detail message is set.
@@ -387,21 +381,65 @@
 
   // JNI methods
   JNIEnvExt* GetJniEnv() const {
-    return jni_env_;
+    return tlsPtr_.jni_env;
   }
 
   // Convert a jobject into a Object*
   mirror::Object* DecodeJObject(jobject obj) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  mirror::Object* GetMonitorEnterObject() const {
+    return tlsPtr_.monitor_enter_object;
+  }
+
+  void SetMonitorEnterObject(mirror::Object* obj) {
+    tlsPtr_.monitor_enter_object = obj;
+  }
+
   // Implements java.lang.Thread.interrupted.
-  bool Interrupted();
+  bool Interrupted() LOCKS_EXCLUDED(wait_mutex_);
   // Implements java.lang.Thread.isInterrupted.
-  bool IsInterrupted();
-  void Interrupt();
-  void Notify();
+  bool IsInterrupted() LOCKS_EXCLUDED(wait_mutex_);
+  bool IsInterruptedLocked() EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_) {
+    return interrupted_;
+  }
+  void Interrupt(Thread* self) LOCKS_EXCLUDED(wait_mutex_);
+  void SetInterruptedLocked(bool i) EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_) {
+    interrupted_ = i;
+  }
+  void Notify() LOCKS_EXCLUDED(wait_mutex_);
+
+ private:
+  void NotifyLocked(Thread* self) EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_);
+
+ public:
+  Mutex* GetWaitMutex() const LOCK_RETURNED(wait_mutex_) {
+    return wait_mutex_;
+  }
+
+  ConditionVariable* GetWaitConditionVariable() const EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_) {
+    return wait_cond_;
+  }
+
+  Monitor* GetWaitMonitor() const EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_) {
+    return wait_monitor_;
+  }
+
+  void SetWaitMonitor(Monitor* mon) EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_) {
+    wait_monitor_ = mon;
+  }
+
+
+  // Waiter linked-list support.
+  Thread* GetWaitNext() const {
+    return tlsPtr_.wait_next;
+  }
+
+  void SetWaitNext(Thread* next) {
+    tlsPtr_.wait_next = next;
+  }
 
   mirror::ClassLoader* GetClassLoaderOverride() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    return class_loader_override_;
+    return tlsPtr_.class_loader_override;
   }
 
   void SetClassLoaderOverride(mirror::ClassLoader* class_loader_override)
@@ -428,41 +466,99 @@
   // Offsets of various members of native Thread class, used by compiled code.
   //
 
-  static ThreadOffset SelfOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, self_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThinLockIdOffset() {
+    return ThreadOffset<pointer_size>(
+        OFFSETOF_MEMBER(Thread, tls32_) +
+        OFFSETOF_MEMBER(tls_32bit_sized_values, thin_lock_thread_id));
   }
 
-  static ThreadOffset ExceptionOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, exception_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThreadFlagsOffset() {
+    return ThreadOffset<pointer_size>(
+        OFFSETOF_MEMBER(Thread, tls32_) +
+        OFFSETOF_MEMBER(tls_32bit_sized_values, state_and_flags));
   }
 
-  static ThreadOffset PeerOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, opeer_));
+ private:
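+  // Converts an offset within tls_ptr_sized_values, measured with this build's pointer size,
+  // into the corresponding offset for a target whose pointers are pointer_size bytes, by scaling
+  // the offset by the ratio of the two pointer sizes.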
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThreadOffsetFromTlsPtr(size_t tls_ptr_offset) {
+    size_t base = OFFSETOF_MEMBER(Thread, tlsPtr_);
+    size_t scale;
+    size_t shrink;
+    if (pointer_size == sizeof(void*)) {
+      scale = 1;
+      shrink = 1;
+    } else if (pointer_size > sizeof(void*)) {
+      scale = pointer_size / sizeof(void*);
+      shrink = 1;
+    } else {
+      DCHECK_GT(sizeof(void*), pointer_size);
+      scale = 1;
+      shrink = sizeof(void*) / pointer_size;
+    }
+    return ThreadOffset<pointer_size>(base + ((tls_ptr_offset * scale) / shrink));
   }
 
-  static ThreadOffset ThinLockIdOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, thin_lock_thread_id_));
+ public:
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> QuickEntryPointOffset(size_t quick_entrypoint_offset) {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, quick_entrypoints) + quick_entrypoint_offset);
   }
 
-  static ThreadOffset CardTableOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, card_table_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> InterpreterEntryPointOffset(size_t interp_entrypoint_offset) {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, interpreter_entrypoints) + interp_entrypoint_offset);
   }
 
-  static ThreadOffset ThreadFlagsOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, state_and_flags_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> JniEntryPointOffset(size_t jni_entrypoint_offset) {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, jni_entrypoints) + jni_entrypoint_offset);
   }
 
-  static ThreadOffset ThreadSuspendTriggerOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, suspend_trigger_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> PortableEntryPointOffset(size_t port_entrypoint_offset) {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, portable_entrypoints) + port_entrypoint_offset);
+  }
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> SelfOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, self));
+  }
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ExceptionOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, exception));
+  }
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> PeerOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, opeer));
+  }
+
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> CardTableOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, card_table));
+  }
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThreadSuspendTriggerOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, suspend_trigger));
   }
 
   // Size of stack less any space reserved for stack overflow
   size_t GetStackSize() const {
-    return stack_size_ - (stack_end_ - stack_begin_);
+    return tlsPtr_.stack_size - (tlsPtr_.stack_end - tlsPtr_.stack_begin);
   }
 
   byte* GetStackEnd() const {
-    return stack_end_;
+    return tlsPtr_.stack_end;
   }
 
   // Set the stack end to that to be used during a stack overflow
@@ -475,9 +571,9 @@
     if (implicit_overflow_check) {
       // For implicit checks we also need to add in the protected region above the
       // overflow region.
-      stack_end_ = stack_begin_ + kStackOverflowImplicitCheckSize;
+      tlsPtr_.stack_end = tlsPtr_.stack_begin + kStackOverflowImplicitCheckSize;
     } else {
-      stack_end_ = stack_begin_ + kStackOverflowReservedBytes;
+      tlsPtr_.stack_end = tlsPtr_.stack_begin + kStackOverflowReservedBytes;
     }
   }
 
@@ -485,55 +581,65 @@
   void InstallImplicitProtection(bool is_main_stack);
 
   bool IsHandlingStackOverflow() const {
-    return stack_end_ == stack_begin_;
+    return tlsPtr_.stack_end == tlsPtr_.stack_begin;
   }
 
-  static ThreadOffset StackEndOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, stack_end_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> StackEndOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, stack_end));
   }
 
-  static ThreadOffset JniEnvOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, jni_env_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> JniEnvOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, jni_env));
   }
 
-  static ThreadOffset TopOfManagedStackOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, managed_stack_) +
-                        ManagedStack::TopQuickFrameOffset());
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> TopOfManagedStackOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, managed_stack) +
+        ManagedStack::TopQuickFrameOffset());
   }
 
-  static ThreadOffset TopOfManagedStackPcOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, managed_stack_) +
-                        ManagedStack::TopQuickFramePcOffset());
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> TopOfManagedStackPcOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, managed_stack) +
+        ManagedStack::TopQuickFramePcOffset());
   }
 
   const ManagedStack* GetManagedStack() const {
-    return &managed_stack_;
+    return &tlsPtr_.managed_stack;
   }
 
   // Linked list recording fragments of managed stack.
   void PushManagedStackFragment(ManagedStack* fragment) {
-    managed_stack_.PushManagedStackFragment(fragment);
+    tlsPtr_.managed_stack.PushManagedStackFragment(fragment);
   }
   void PopManagedStackFragment(const ManagedStack& fragment) {
-    managed_stack_.PopManagedStackFragment(fragment);
+    tlsPtr_.managed_stack.PopManagedStackFragment(fragment);
   }
 
   ShadowFrame* PushShadowFrame(ShadowFrame* new_top_frame) {
-    return managed_stack_.PushShadowFrame(new_top_frame);
+    return tlsPtr_.managed_stack.PushShadowFrame(new_top_frame);
   }
 
   ShadowFrame* PopShadowFrame() {
-    return managed_stack_.PopShadowFrame();
+    return tlsPtr_.managed_stack.PopShadowFrame();
   }
 
-  static ThreadOffset TopShadowFrameOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, managed_stack_) +
-                        ManagedStack::TopShadowFrameOffset());
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> TopShadowFrameOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(
+        OFFSETOF_MEMBER(tls_ptr_sized_values, managed_stack) +
+        ManagedStack::TopShadowFrameOffset());
   }
 
   // Number of references allocated in JNI ShadowFrames on this thread.
   size_t NumJniShadowFrameReferences() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    return managed_stack_.NumJniShadowFrameReferences();
+    return tlsPtr_.managed_stack.NumJniShadowFrameReferences();
   }
 
   // Number of references in SIRTs on this thread.
@@ -551,27 +657,28 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void PushSirt(StackIndirectReferenceTable* sirt) {
-    sirt->SetLink(top_sirt_);
-    top_sirt_ = sirt;
+    sirt->SetLink(tlsPtr_.top_sirt);
+    tlsPtr_.top_sirt = sirt;
   }
 
   StackIndirectReferenceTable* PopSirt() {
-    StackIndirectReferenceTable* sirt = top_sirt_;
+    StackIndirectReferenceTable* sirt = tlsPtr_.top_sirt;
     DCHECK(sirt != NULL);
-    top_sirt_ = top_sirt_->GetLink();
+    tlsPtr_.top_sirt = tlsPtr_.top_sirt->GetLink();
     return sirt;
   }
 
-  static ThreadOffset TopSirtOffset() {
-    return ThreadOffset(OFFSETOF_MEMBER(Thread, top_sirt_));
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> TopSirtOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, top_sirt));
   }
 
-  DebugInvokeReq* GetInvokeReq() {
-    return debug_invoke_req_;
+  DebugInvokeReq* GetInvokeReq() const {
+    return tlsPtr_.debug_invoke_req;
   }
 
   SingleStepControl* GetSingleStepControl() const {
-    return single_step_control_;
+    return tlsPtr_.single_step_control;
   }
 
   void SetDeoptimizationShadowFrame(ShadowFrame* sf);
@@ -580,41 +687,41 @@
   ShadowFrame* GetAndClearDeoptimizationShadowFrame(JValue* ret_val);
 
   std::deque<instrumentation::InstrumentationStackFrame>* GetInstrumentationStack() {
-    return instrumentation_stack_;
+    return tlsPtr_.instrumentation_stack;
   }
 
   std::vector<mirror::ArtMethod*>* GetStackTraceSample() const {
-    return stack_trace_sample_;
+    return tlsPtr_.stack_trace_sample;
   }
 
   void SetStackTraceSample(std::vector<mirror::ArtMethod*>* sample) {
-    stack_trace_sample_ = sample;
+    tlsPtr_.stack_trace_sample = sample;
   }
 
   uint64_t GetTraceClockBase() const {
-    return trace_clock_base_;
+    return tls64_.trace_clock_base;
   }
 
   void SetTraceClockBase(uint64_t clock_base) {
-    trace_clock_base_ = clock_base;
+    tls64_.trace_clock_base = clock_base;
   }
 
   BaseMutex* GetHeldMutex(LockLevel level) const {
-    return held_mutexes_[level];
+    return tlsPtr_.held_mutexes[level];
   }
 
   void SetHeldMutex(LockLevel level, BaseMutex* mutex) {
-    held_mutexes_[level] = mutex;
+    tlsPtr_.held_mutexes[level] = mutex;
   }
 
   void RunCheckpointFunction();
 
   bool ReadFlag(ThreadFlag flag) const {
-    return (state_and_flags_.as_struct.flags & flag) != 0;
+    return (tls32_.state_and_flags.as_struct.flags & flag) != 0;
   }
 
   bool TestAllFlags() const {
-    return (state_and_flags_.as_struct.flags != 0);
+    return (tls32_.state_and_flags.as_struct.flags != 0);
   }
 
   void AtomicSetFlag(ThreadFlag flag);
@@ -623,11 +730,57 @@
 
   void ResetQuickAllocEntryPointsForThread();
 
- private:
-  // We have no control over the size of 'bool', but want our boolean fields
-  // to be 4-byte quantities.
-  typedef uint32_t bool32_t;
+  // Returns the remaining space in the TLAB.
+  size_t TlabSize() const;
+  // Doesn't check that there is room.
+  mirror::Object* AllocTlab(size_t bytes);
+  void SetTlab(byte* start, byte* end);
+  bool HasTlab() const;
 
+  // Remove the suspend trigger for this thread by making the suspend_trigger_ TLS value
+  // equal to a valid pointer.
+  // TODO: does this need to be atomic?  I don't think so.
+  void RemoveSuspendTrigger() {
+    tlsPtr_.suspend_trigger = reinterpret_cast<uintptr_t*>(&tlsPtr_.suspend_trigger);
+  }
+
+  // Trigger a suspend check by making the suspend_trigger_ TLS value an invalid pointer.
+  // The next time a suspend check is done, it will load from the value at this address
+  // and trigger a SIGSEGV.
+  void TriggerSuspend() {
+    tlsPtr_.suspend_trigger = nullptr;
+  }
+
+
+  // Push an object onto the allocation stack.
+  bool PushOnThreadLocalAllocationStack(mirror::Object* obj);
+
+  // Set the thread local allocation pointers to the given pointers.
+  void SetThreadLocalAllocationStack(mirror::Object** start, mirror::Object** end);
+
+  // Resets the thread local allocation pointers.
+  void RevokeThreadLocalAllocationStack();
+
+  size_t GetThreadLocalBytesAllocated() const {
+    return tlsPtr_.thread_local_pos - tlsPtr_.thread_local_start;
+  }
+
+  size_t GetThreadLocalObjectsAllocated() const {
+    return tlsPtr_.thread_local_objects;
+  }
+
+  // ROS alloc TLS.
+  static constexpr size_t kRosAllocNumOfSizeBrackets = 34;
+
+  void* GetRosAllocRun(size_t index) const {
+    return tlsPtr_.rosalloc_runs[index];
+  }
+
+  void SetRosAllocRun(size_t index, void* run) {
+    tlsPtr_.rosalloc_runs[index] = run;
+  }
+
+ private:
   explicit Thread(bool daemon);
   ~Thread() LOCKS_EXCLUDED(Locks::mutator_lock_,
                            Locks::thread_suspend_count_lock_);
@@ -644,7 +797,7 @@
   // Dbg::Disconnected.
   ThreadState SetStateUnsafe(ThreadState new_state) {
     ThreadState old_state = GetState();
-    state_and_flags_.as_struct.state = new_state;
+    tls32_.state_and_flags.as_struct.state = new_state;
     return old_state;
   }
 
@@ -678,22 +831,6 @@
   void SetUpAlternateSignalStack();
   void TearDownAlternateSignalStack();
 
-  void NotifyLocked(Thread* self) EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_);
-
-  static void ThreadExitCallback(void* arg);
-
-  // Has Thread::Startup been called?
-  static bool is_started_;
-
-  // TLS key used to retrieve the Thread*.
-  static pthread_key_t pthread_key_self_;
-
-  // Used to notify threads that they should attempt to resume, they will suspend again if
-  // their suspend count is > 0.
-  static ConditionVariable* resume_cond_ GUARDED_BY(Locks::thread_suspend_count_lock_);
-
-  // --- Frequently accessed fields first for short offsets ---
-
   // 32 bits of atomically changed state and flags. Keeping as 32 bits allows and atomic CAS to
   // change from being Suspended to Runnable without a suspend request occurring.
   union PACKED(4) StateAndFlags {
@@ -715,206 +852,225 @@
     // See http://gcc.gnu.org/bugzilla/show_bug.cgi?id=47409
     DISALLOW_COPY_AND_ASSIGN(StateAndFlags);
   };
-  union StateAndFlags state_and_flags_;
-  COMPILE_ASSERT(sizeof(union StateAndFlags) == sizeof(int32_t),
-                 sizeof_state_and_flags_and_int32_are_different);
 
-  // A non-zero value is used to tell the current thread to enter a safe point
-  // at the next poll.
-  int suspend_count_ GUARDED_BY(Locks::thread_suspend_count_lock_);
-
-  // The biased card table, see CardTable for details
-  byte* card_table_;
-
-  // The pending exception or NULL.
-  mirror::Throwable* exception_;
-
-  // The end of this thread's stack. This is the lowest safely-addressable address on the stack.
-  // We leave extra space so there's room for the code that throws StackOverflowError.
-  byte* stack_end_;
-
-  // The top of the managed stack often manipulated directly by compiler generated code.
-  ManagedStack managed_stack_;
-
-  // Every thread may have an associated JNI environment
-  JNIEnvExt* jni_env_;
-
-  // Initialized to "this". On certain architectures (such as x86) reading
-  // off of Thread::Current is easy but getting the address of Thread::Current
-  // is hard. This field can be read off of Thread::Current to give the address.
-  Thread* self_;
-
-  // Our managed peer (an instance of java.lang.Thread). The jobject version is used during thread
-  // start up, until the thread is registered and the local opeer_ is used.
-  mirror::Object* opeer_;
-  jobject jpeer_;
-
-  // The "lowest addressable byte" of the stack
-  byte* stack_begin_;
-
-  // Size of the stack
-  size_t stack_size_;
-
-  // Thin lock thread id. This is a small integer used by the thin lock implementation.
-  // This is not to be confused with the native thread's tid, nor is it the value returned
-  // by java.lang.Thread.getId --- this is a distinct value, used only for locking. One
-  // important difference between this id and the ids visible to managed code is that these
-  // ones get reused (to ensure that they fit in the number of bits available).
-  uint32_t thin_lock_thread_id_;
-
-  // Pointer to previous stack trace captured by sampling profiler.
-  std::vector<mirror::ArtMethod*>* stack_trace_sample_;
-
-  // The clock base used for tracing.
-  uint64_t trace_clock_base_;
-
-  // System thread id.
-  pid_t tid_;
-
-  ThrowLocation throw_location_;
-
-  // Guards the 'interrupted_' and 'wait_monitor_' members.
-  mutable Mutex* wait_mutex_ DEFAULT_MUTEX_ACQUIRED_AFTER;
-  // Condition variable waited upon during a wait.
-  ConditionVariable* wait_cond_ GUARDED_BY(wait_mutex_);
-  // Pointer to the monitor lock we're currently waiting on or NULL if not waiting.
-  Monitor* wait_monitor_ GUARDED_BY(wait_mutex_);
-  // Thread "interrupted" status; stays raised until queried or thrown.
-  bool32_t interrupted_ GUARDED_BY(wait_mutex_);
-  // The next thread in the wait set this thread is part of or NULL if not waiting.
-  Thread* wait_next_;
-
-
-  // If we're blocked in MonitorEnter, this is the object we're trying to lock.
-  mirror::Object* monitor_enter_object_;
-
-  // Top of linked list of stack indirect reference tables or NULL for none
-  StackIndirectReferenceTable* top_sirt_;
-
-  Runtime* runtime_;
-
-  RuntimeStats stats_;
-
-  // Needed to get the right ClassLoader in JNI_OnLoad, but also
-  // useful for testing.
-  mirror::ClassLoader* class_loader_override_;
-
-  // Thread local, lazily allocated, long jump context. Used to deliver exceptions.
-  Context* long_jump_context_;
-
-  // A boolean telling us whether we're recursively throwing OOME.
-  bool32_t throwing_OutOfMemoryError_;
-
-  // How much of 'suspend_count_' is by request of the debugger, used to set things right
-  // when the debugger detaches. Must be <= suspend_count_.
-  int debug_suspend_count_ GUARDED_BY(Locks::thread_suspend_count_lock_);
-
-  // JDWP invoke-during-breakpoint support.
-  DebugInvokeReq* debug_invoke_req_;
-
-  // JDWP single-stepping support.
-  SingleStepControl* single_step_control_;
-
-  // Shadow frame that is used temporarily during the deoptimization of a method.
-  ShadowFrame* deoptimization_shadow_frame_;
-  JValue deoptimization_return_value_;
-
-  // Additional stack used by method instrumentation to store method and return pc values.
-  // Stored as a pointer since std::deque is not PACKED.
-  std::deque<instrumentation::InstrumentationStackFrame>* instrumentation_stack_;
-
-  // A cached copy of the java.lang.Thread's name.
-  std::string* name_;
-
-  // Is the thread a daemon?
-  const bool32_t daemon_;
-
-  // A cached pthread_t for the pthread underlying this Thread*.
-  pthread_t pthread_self_;
-
-  // Support for Mutex lock hierarchy bug detection.
-  BaseMutex* held_mutexes_[kLockLevelCount];
-
-  // A positive value implies we're in a region where thread suspension isn't expected.
-  uint32_t no_thread_suspension_;
-
-  // If no_thread_suspension_ is > 0, what is causing that assertion.
-  const char* last_no_thread_suspension_cause_;
+  static void ThreadExitCallback(void* arg);
 
   // Maximum number of checkpoint functions.
   static constexpr uint32_t kMaxCheckpoints = 3;
 
-  // Pending checkpoint function or NULL if non-pending. Installation guarding by
-  // Locks::thread_suspend_count_lock_.
-  Closure* checkpoint_functions_[kMaxCheckpoints];
+  // Has Thread::Startup been called?
+  static bool is_started_;
 
- public:
-  // Entrypoint function pointers
-  // TODO: move this near the top, since changing its offset requires all oats to be recompiled!
-  InterpreterEntryPoints interpreter_entrypoints_;
-  JniEntryPoints jni_entrypoints_;
-  PortableEntryPoints portable_entrypoints_;
-  QuickEntryPoints quick_entrypoints_;
+  // TLS key used to retrieve the Thread*.
+  static pthread_key_t pthread_key_self_;
 
-  // Setting this to 0 will trigger a SEGV and thus a suspend check.  It is normally
-  // set to the address of itself.
-  uintptr_t* suspend_trigger_;
+  // Used to notify threads that they should attempt to resume, they will suspend again if
+  // their suspend count is > 0.
+  static ConditionVariable* resume_cond_ GUARDED_BY(Locks::thread_suspend_count_lock_);
 
-  // How many times has our pthread key's destructor been called?
-  uint32_t thread_exit_check_count_;
+  /***********************************************************************************************/
+  // Thread-local storage. Fields are grouped by size so that offsets can be translated between
+  // the 32-bit and 64-bit layouts despite the difference in pointer size. To encourage shorter
+  // encodings, more frequently used values appear first where possible.
+  /***********************************************************************************************/
 
-  // Thread-local allocation pointer.
-  byte* thread_local_start_;
-  byte* thread_local_pos_;
-  byte* thread_local_end_;
-  size_t thread_local_objects_;
-  // Returns the remaining space in the TLAB.
-  size_t TlabSize() const;
-  // Doesn't check that there is room.
-  mirror::Object* AllocTlab(size_t bytes);
-  void SetTlab(byte* start, byte* end);
-  bool HasTlab() const;
+  struct PACKED(4) tls_32bit_sized_values {
+    // We have no control over the size of 'bool', but want our boolean fields
+    // to be 4-byte quantities.
+    typedef uint32_t bool32_t;
 
-  // Remove the suspend trigger for this thread by making the suspend_trigger_ TLS value
-  // equal to a valid pointer.
-  // TODO: does this need to atomic?  I don't think so.
-  void RemoveSuspendTrigger() {
-    suspend_trigger_ = reinterpret_cast<uintptr_t*>(&suspend_trigger_);
-  }
+    explicit tls_32bit_sized_values(bool is_daemon) :
+      suspend_count(0), debug_suspend_count(0), thin_lock_thread_id(0), tid(0),
+      daemon(is_daemon), throwing_OutOfMemoryError(false), no_thread_suspension(0),
+      thread_exit_check_count(0) {
+    }
 
-  // Trigger a suspend check by making the suspend_trigger_ TLS value an invalid pointer.
-  // The next time a suspend check is done, it will load from the value at this address
-  // and trigger a SIGSEGV.
-  void TriggerSuspend() {
-    suspend_trigger_ = nullptr;
-  }
+    union StateAndFlags state_and_flags;
+    COMPILE_ASSERT(sizeof(union StateAndFlags) == sizeof(int32_t),
+                   sizeof_state_and_flags_and_int32_are_different);
 
-  // Thread-local rosalloc runs. There are 34 size brackets in rosalloc
-  // runs (RosAlloc::kNumOfSizeBrackets). We can't refer to the
-  // RosAlloc class due to a header file circular dependency issue.
-  // To compensate, we check that the two values match at RosAlloc
-  // initialization time.
-  static const size_t kRosAllocNumOfSizeBrackets = 34;
-  void* rosalloc_runs_[kRosAllocNumOfSizeBrackets];
+    // A non-zero value is used to tell the current thread to enter a safe point
+    // at the next poll.
+    int suspend_count GUARDED_BY(Locks::thread_suspend_count_lock_);
 
-  // Thread-local allocation stack data/routines.
-  mirror::Object** thread_local_alloc_stack_top_;
-  mirror::Object** thread_local_alloc_stack_end_;
+    // How much of 'suspend_count_' is by request of the debugger, used to set things right
+    // when the debugger detaches. Must be <= suspend_count_.
+    int debug_suspend_count GUARDED_BY(Locks::thread_suspend_count_lock_);
 
-  // Push an object onto the allocation stack.
-  bool PushOnThreadLocalAllocationStack(mirror::Object* obj);
+    // Thin lock thread id. This is a small integer used by the thin lock implementation.
+    // This is not to be confused with the native thread's tid, nor is it the value returned
+    // by java.lang.Thread.getId --- this is a distinct value, used only for locking. One
+    // important difference between this id and the ids visible to managed code is that these
+    // ones get reused (to ensure that they fit in the number of bits available).
+    uint32_t thin_lock_thread_id;
 
-  // Set the thread local allocation pointers to the given pointers.
-  void SetThreadLocalAllocationStack(mirror::Object** start, mirror::Object** end);
+    // System thread id.
+    uint32_t tid;
 
-  // Resets the thread local allocation pointers.
-  void RevokeThreadLocalAllocationStack();
+    // Is the thread a daemon?
+    const bool32_t daemon;
 
- private:
+    // A boolean telling us whether we're recursively throwing OOME.
+    bool32_t throwing_OutOfMemoryError;
+
+    // A positive value implies we're in a region where thread suspension isn't expected.
+    uint32_t no_thread_suspension;
+
+    // How many times has our pthread key's destructor been called?
+    uint32_t thread_exit_check_count;
+  } tls32_;
+
+  struct PACKED(8) tls_64bit_sized_values {
+    tls_64bit_sized_values() : trace_clock_base(0), deoptimization_return_value() {
+    }
+
+    // The clock base used for tracing.
+    uint64_t trace_clock_base;
+
+    // Return value used by deoptimization.
+    JValue deoptimization_return_value;
+
+    RuntimeStats stats;
+  } tls64_;
+
+  struct PACKED(4) tls_ptr_sized_values {
+      tls_ptr_sized_values() : card_table(nullptr), exception(nullptr), stack_end(nullptr),
+      managed_stack(), suspend_trigger(nullptr), jni_env(nullptr), self(nullptr), opeer(nullptr),
+      jpeer(nullptr), stack_begin(nullptr), stack_size(0), throw_location(),
+      stack_trace_sample(nullptr), wait_next(nullptr), monitor_enter_object(nullptr),
+      top_sirt(nullptr), class_loader_override(nullptr), long_jump_context(nullptr),
+      instrumentation_stack(nullptr), debug_invoke_req(nullptr), single_step_control(nullptr),
+      deoptimization_shadow_frame(nullptr), name(nullptr), pthread_self(0),
+      last_no_thread_suspension_cause(nullptr), thread_local_start(nullptr),
+      thread_local_pos(nullptr), thread_local_end(nullptr), thread_local_objects(0),
+      thread_local_alloc_stack_top(nullptr), thread_local_alloc_stack_end(nullptr) {
+    }
+
+    // The biased card table, see CardTable for details.
+    byte* card_table;
+
+    // The pending exception or NULL.
+    mirror::Throwable* exception;
+
+    // The end of this thread's stack. This is the lowest safely-addressable address on the stack.
+    // We leave extra space so there's room for the code that throws StackOverflowError.
+    byte* stack_end;
+
+    // The top of the managed stack often manipulated directly by compiler generated code.
+    ManagedStack managed_stack;
+
+    // In certain modes, setting this to 0 will trigger a SEGV and thus a suspend check.  It is
+    // normally set to the address of itself.
+    uintptr_t* suspend_trigger;
+
+    // Every thread may have an associated JNI environment
+    JNIEnvExt* jni_env;
+
+    // Initialized to "this". On certain architectures (such as x86) reading off of Thread::Current
+    // is easy but getting the address of Thread::Current is hard. This field can be read off of
+    // Thread::Current to give the address.
+    Thread* self;
+
+    // Our managed peer (an instance of java.lang.Thread). The jobject version is used during thread
+    // start up, until the thread is registered and the local opeer_ is used.
+    mirror::Object* opeer;
+    jobject jpeer;
+
+    // The "lowest addressable byte" of the stack.
+    byte* stack_begin;
+
+    // Size of the stack.
+    size_t stack_size;
+
+    // The location the current exception was thrown from.
+    ThrowLocation throw_location;
+
+    // Pointer to previous stack trace captured by sampling profiler.
+    std::vector<mirror::ArtMethod*>* stack_trace_sample;
+
+    // The next thread in the wait set this thread is part of or NULL if not waiting.
+    Thread* wait_next;
+
+    // If we're blocked in MonitorEnter, this is the object we're trying to lock.
+    mirror::Object* monitor_enter_object;
+
+    // Top of linked list of stack indirect reference tables or NULL for none.
+    StackIndirectReferenceTable* top_sirt;
+
+    // Needed to get the right ClassLoader in JNI_OnLoad, but also
+    // useful for testing.
+    mirror::ClassLoader* class_loader_override;
+
+    // Thread local, lazily allocated, long jump context. Used to deliver exceptions.
+    Context* long_jump_context;
+
+    // Additional stack used by method instrumentation to store method and return pc values.
+    // Stored as a pointer since std::deque is not PACKED.
+    std::deque<instrumentation::InstrumentationStackFrame>* instrumentation_stack;
+
+    // JDWP invoke-during-breakpoint support.
+    DebugInvokeReq* debug_invoke_req;
+
+    // JDWP single-stepping support.
+    SingleStepControl* single_step_control;
+
+    // Shadow frame stack that is used temporarily during the deoptimization of a method.
+    ShadowFrame* deoptimization_shadow_frame;
+
+    // A cached copy of the java.lang.Thread's name.
+    std::string* name;
+
+    // A cached pthread_t for the pthread underlying this Thread*.
+    pthread_t pthread_self;
+
+    // Support for Mutex lock hierarchy bug detection.
+    BaseMutex* held_mutexes[kLockLevelCount];
+
+    // If no_thread_suspension_ is > 0, what is causing that assertion.
+    const char* last_no_thread_suspension_cause;
+
+    // Pending checkpoint function or NULL if non-pending. Installation is guarded by
+    // Locks::thread_suspend_count_lock_.
+    Closure* checkpoint_functions[kMaxCheckpoints];
+
+    // Entrypoint function pointers.
+    // TODO: move this to more of a global offset table model to avoid per-thread duplication.
+    InterpreterEntryPoints interpreter_entrypoints;
+    JniEntryPoints jni_entrypoints;
+    PortableEntryPoints portable_entrypoints;
+    QuickEntryPoints quick_entrypoints;
+
+    // Thread-local allocation pointer.
+    byte* thread_local_start;
+    byte* thread_local_pos;
+    byte* thread_local_end;
+    size_t thread_local_objects;
+
+    // Thread-local rosalloc runs. There are 34 size brackets in rosalloc
+    // runs (RosAlloc::kNumOfSizeBrackets). We can't refer to the
+    // RosAlloc class due to a header file circular dependency issue.
+    // To compensate, we check that the two values match at RosAlloc
+    // initialization time.
+    void* rosalloc_runs[kRosAllocNumOfSizeBrackets];
+
+    // Thread-local allocation stack data/routines.
+    mirror::Object** thread_local_alloc_stack_top;
+    mirror::Object** thread_local_alloc_stack_end;
+  } tlsPtr_;
+
+  // Guards the 'interrupted_' and 'wait_monitor_' members.
+  Mutex* wait_mutex_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+
+  // Condition variable waited upon during a wait.
+  ConditionVariable* wait_cond_ GUARDED_BY(wait_mutex_);
+  // Pointer to the monitor lock we're currently waiting on or NULL if not waiting.
+  Monitor* wait_monitor_ GUARDED_BY(wait_mutex_);
+
+  // Thread "interrupted" status; stays raised until queried or thrown.
+  bool interrupted_ GUARDED_BY(wait_mutex_);
+
   friend class Dbg;  // For SetStateUnsafe.
   friend class gc::collector::SemiSpace;  // For getting stack traces.
-  friend class Monitor;
-  friend class MonitorInfo;
   friend class Runtime;  // For CreatePeer.
   friend class ScopedThreadStateChange;
   friend class SignalCatcher;  // For SetStateUnsafe.