Reland "Java Heap Profiler for Perfetto." again

This reverts commit 284b1b01cf47335fa578e995150adcd0c6f02059.

Reason for revert: Added some nullptr checks with logging.
                   Fixed behaviour when ReInitializing plugin.

Test: dump heap for system_server

Test: art/tools/run-gtests.sh -j4
[ RUN      ] InstructionSetFeaturesTest.FeaturesFromAssembly
art/runtime/arch/instruction_set_features_test.cc:161: Failure
Value of: assembly_features->HasAtLeast(instruction_set_features.get())
  Actual: false
Expected: true
Assembly features: ISA: Arm Feature string: div,-atomic_ldrd_strd,armv8a
Features from build: ISA: Arm Feature string: div,atomic_ldrd_strd,armv8a
[  FAILED  ] InstructionSetFeaturesTest.FeaturesFromAssembly (0 ms)
Known issue, see http://b/139425971

Test: art/test/testrunner/testrunner.py --target --64
4095/4260 (96%) tests passed.

Test: art/tools/run-libcore-tests.sh --mode=device --variant=X64
Outcomes: 13781. Passed: 13542, Failed: 0, Skipped: 140, Warnings: 99. Took 29m37s.

Test: art/tools/run-jdwp-tests.sh --mode=device --variant=X64
Outcomes: 401. All successful. Took 7m15s.

Bug: 136210868

Change-Id: I7b8564c93926788acb0340de254836dff0ebd9f8
diff --git a/Android.mk b/Android.mk
index 1904f65..64d72c1 100644
--- a/Android.mk
+++ b/Android.mk
@@ -426,6 +426,7 @@
     libopenjdkjvmti \
     profman \
     libadbconnection \
+    libperfetto_hprof \
 
 # Potentially add in debug variants:
 #
@@ -448,6 +449,7 @@
     libopenjdkjvmtid \
     profmand \
     libadbconnectiond \
+    libperfetto_hprofd \
 
 endif
 endif
diff --git a/build/apex/Android.bp b/build/apex/Android.bp
index a7366c9..284cc18 100644
--- a/build/apex/Android.bp
+++ b/build/apex/Android.bp
@@ -40,6 +40,11 @@
     "libopenjdkjvm",
     "libopenjdkjvmti",
 ]
+
+art_runtime_base_native_device_only_shared_libs = [
+    "libperfetto_hprof",
+]
+
 bionic_native_shared_libs = [
     // External API (having APEX stubs).
     "libc",
@@ -81,6 +86,10 @@
     "libopenjdkjvmtid",
 ]
 
+art_runtime_base_native_device_only_debug_shared_libs = [
+  "libperfetto_hprofd",
+]
+
 // Tools common to both device APEX and host APEX. Derived from art-tools in art/Android.mk.
 art_tools_common_binaries = [
     "dexdump",
@@ -248,6 +257,7 @@
     manifest: "manifest-art.json",
     java_libs: libcore_java_libs,
     native_shared_libs: art_runtime_base_native_shared_libs +
+        art_runtime_base_native_device_only_shared_libs +
         libcore_native_device_only_shared_libs +
         libcore_native_shared_libs,
     multilib: {
@@ -281,7 +291,8 @@
 apex_defaults {
     name: "com.android.art-dev-defaults",
     defaults: ["com.android.art-defaults"],
-    native_shared_libs: art_runtime_debug_native_shared_libs +
+    native_shared_libs: art_runtime_base_native_device_only_debug_shared_libs +
+        art_runtime_debug_native_shared_libs +
         libcore_debug_native_shared_libs,
     multilib: {
         both: {
diff --git a/build/apex/art_apex_test.py b/build/apex/art_apex_test.py
index 7a992e6..2b42404 100755
--- a/build/apex/art_apex_test.py
+++ b/build/apex/art_apex_test.py
@@ -528,6 +528,7 @@
 
     # Check internal libraries for ART.
     self._checker.check_prefer64_library('libart-disassembler')
+    self._checker.check_native_library('libperfetto_hprof')
 
     # Check exported native libraries for Managed Core Library.
     self._checker.check_native_library('libandroidicu')
@@ -621,6 +622,7 @@
     # Check ART internal libraries.
     self._checker.check_native_library('libdexfiled_external')
     self._checker.check_prefer64_library('libartd-disassembler')
+    self._checker.check_native_library('libperfetto_hprofd')
 
     # Check internal native library dependencies.
     #
diff --git a/openjdkjvmti/ti_thread.cc b/openjdkjvmti/ti_thread.cc
index f2ae996..b0a7f1f 100644
--- a/openjdkjvmti/ti_thread.cc
+++ b/openjdkjvmti/ti_thread.cc
@@ -122,12 +122,13 @@
     }
     if (!started) {
       // Runtime isn't started. We only expect at most the signal handler or JIT threads to be
-      // started here.
+      // started here; this includes the hprof_listener signal handler thread for perfetto_hprof.
       if (art::kIsDebugBuild) {
         std::string name;
         self->GetThreadName(name);
         if (name != "JDWP" &&
             name != "Signal Catcher" &&
+            name != "hprof_listener" &&
             !android::base::StartsWith(name, "Jit thread pool") &&
             !android::base::StartsWith(name, "Runtime worker thread")) {
           LOG(FATAL) << "Unexpected thread before start: " << name << " id: "
diff --git a/perfetto_hprof/Android.bp b/perfetto_hprof/Android.bp
new file mode 100644
index 0000000..5fd7721
--- /dev/null
+++ b/perfetto_hprof/Android.bp
@@ -0,0 +1,88 @@
+// Copyright (C) 2019 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+// Build variants {target} x {debug,ndebug} x {32,64}
+
+// This depends on the Perfetto client API. This uses the ProducerPort to
+// communicate to the system trace. This is an API whose ABI is maintained
+// to be backwards compatible, see
+// https://android.googlesource.com/platform/external/perfetto/+/refs/heads/master/protos/perfetto/ipc/producer_port.proto.
+
+gensrcs {
+    name: "art_perfetto_hprof_operator_srcs",
+    cmd: "$(location generate_operator_out) art/perfetto_hprof $(in) > $(out)",
+    tools: ["generate_operator_out"],
+    srcs: [
+        "perfetto_hprof.h",
+    ],
+    output_extension: "operator_out.cc",
+}
+
+cc_defaults {
+    name: "perfetto_hprof-defaults",
+    host_supported: false,
+    srcs: ["perfetto_hprof.cc"],
+    defaults: ["art_defaults"],
+    include_dirs: [
+        "external/perfetto/include",
+    ],
+
+    // Note that this plugin needs to be built for both 32-bit and 64-bit, since it must
+    // have the same ISA as the runtime it is loaded into.
+    compile_multilib: "both",
+
+    shared_libs: [
+        "libbase",
+        "liblog",
+    ],
+    static_libs: [
+        "libperfetto_client_experimental",
+        "perfetto_src_tracing_ipc",
+        "perfetto_trace_protos",
+        // TODO(132880619): Remove this as soon as the Perfetto client API no
+        // longer depends on this.
+        "libprotobuf-cpp-lite",
+    ],
+    target: {
+        darwin: {
+            enabled: false,
+        },
+    },
+    header_libs: [
+        "libnativehelper_header_only",
+    ],
+    generated_sources: ["art_perfetto_hprof_operator_srcs"],
+}
+
+art_cc_library {
+    name: "libperfetto_hprof",
+    defaults: ["perfetto_hprof-defaults"],
+    shared_libs: [
+        "libart",
+        "libartbase",
+    ],
+}
+
+art_cc_library {
+    name: "libperfetto_hprofd",
+    defaults: [
+        "art_debug_defaults",
+        "perfetto_hprof-defaults",
+    ],
+    shared_libs: [
+        "libartd",
+        "libartbased",
+    ],
+}
diff --git a/perfetto_hprof/perfetto_hprof.cc b/perfetto_hprof/perfetto_hprof.cc
new file mode 100644
index 0000000..078ac76
--- /dev/null
+++ b/perfetto_hprof/perfetto_hprof.cc
@@ -0,0 +1,445 @@
+/*
+ * Copyright (C) 2019 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define LOG_TAG "perfetto_hprof"
+
+#include "perfetto_hprof.h"
+
+#include <android-base/logging.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <sched.h>
+#include <signal.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <thread>
+
+#include "gc/heap-visit-objects-inl.h"
+#include "gc/heap.h"
+#include "gc/scoped_gc_critical_section.h"
+#include "mirror/object-refvisitor-inl.h"
+#include "nativehelper/scoped_local_ref.h"
+#include "perfetto/trace/interned_data/interned_data.pbzero.h"
+#include "perfetto/trace/profiling/heap_graph.pbzero.h"
+#include "perfetto/trace/profiling/profile_common.pbzero.h"
+#include "perfetto/tracing.h"
+#include "runtime-inl.h"
+#include "runtime_callbacks.h"
+#include "scoped_thread_state_change-inl.h"
+#include "thread_list.h"
+#include "well_known_classes.h"
+
+// There are three threads involved in this:
+// * listener thread: this is idle in the background when this plugin gets loaded, and waits
+//   for data on g_signal_pipe_fds.
+// * signal thread: an arbitrary thread that handles the signal and writes data to
+//   g_signal_pipe_fds.
+// * perfetto producer thread: once the signal is received, the app forks. In the newly forked
+//   child, the Perfetto Client API spawns a thread to communicate with traced.
+
+namespace perfetto_hprof {
+
+constexpr int kJavaHeapprofdSignal = __SIGRTMIN + 6;
+constexpr time_t kWatchdogTimeoutSec = 120;
+constexpr size_t kObjectsPerPacket = 100;
+constexpr char kByte[1] = {'x'};
+static art::Mutex& GetStateMutex() {
+  static art::Mutex state_mutex("perfetto_hprof_state_mutex", art::LockLevel::kGenericBottomLock);
+  return state_mutex;
+}
+
+static art::ConditionVariable& GetStateCV() {
+  static art::ConditionVariable state_cv("perfetto_hprof_state_cv", GetStateMutex());
+  return state_cv;
+}
+
+static State g_state = State::kUninitialized;
+
+// Pipe to signal from the signal handler into a worker thread that handles the
+// dump requests.
+int g_signal_pipe_fds[2];
+static struct sigaction g_orig_act = {};
+
+uint64_t FindOrAppend(std::map<std::string, uint64_t>* m,
+                      const std::string& s) {
+  auto it = m->find(s);
+  if (it == m->end()) {
+    std::tie(it, std::ignore) = m->emplace(s, m->size());
+  }
+  return it->second;
+}
+
+void ArmWatchdogOrDie() {
+  timer_t timerid{};
+  struct sigevent sev {};
+  sev.sigev_notify = SIGEV_SIGNAL;
+  sev.sigev_signo = SIGKILL;
+
+  if (timer_create(CLOCK_MONOTONIC, &sev, &timerid) == -1) {
+    // This only gets called in the child, so we can fatal without impacting
+    // the app.
+    PLOG(FATAL) << "failed to create watchdog timer";
+  }
+
+  struct itimerspec its {};
+  its.it_value.tv_sec = kWatchdogTimeoutSec;
+
+  if (timer_settime(timerid, 0, &its, nullptr) == -1) {
+    // This only gets called in the child, so we can fatal without impacting
+    // the app.
+    PLOG(FATAL) << "failed to arm watchdog timer";
+  }
+}
+
+class JavaHprofDataSource : public perfetto::DataSource<JavaHprofDataSource> {
+ public:
+  // TODO(fmayer): Change Client API and reject configs that do not target
+  // this process.
+  void OnSetup(const SetupArgs&) override {}
+
+  void OnStart(const StartArgs&) override {
+    art::MutexLock lk(art_thread(), GetStateMutex());
+    if (g_state == State::kWaitForStart) {
+      g_state = State::kStart;
+      GetStateCV().Broadcast(art_thread());
+    }
+  }
+
+  void OnStop(const StopArgs&) override {}
+
+  static art::Thread* art_thread() {
+    // TODO(fmayer): Attach the Perfetto producer thread to ART and give it a name. This is
+    // not trivial, we cannot just attach the first time this method is called, because
+    // AttachCurrentThread deadlocks with the ConditionVariable::Wait in WaitForDataSource.
+    //
+    // We should attach the thread as soon as the Client API spawns it, but that needs more
+    // complicated plumbing.
+    return nullptr;
+  }
+
+ private:
+  static art::Thread* self_;
+};
+
+art::Thread* JavaHprofDataSource::self_ = nullptr;
+
+
+void WaitForDataSource(art::Thread* self) {
+  perfetto::TracingInitArgs args;
+  args.backends = perfetto::BackendType::kSystemBackend;
+  perfetto::Tracing::Initialize(args);
+
+  perfetto::DataSourceDescriptor dsd;
+  dsd.set_name("android.java_hprof");
+  JavaHprofDataSource::Register(dsd);
+
+  LOG(INFO) << "waiting for data source";
+
+  art::MutexLock lk(self, GetStateMutex());
+  while (g_state != State::kStart) {
+    GetStateCV().Wait(self);
+  }
+}
+
+class Writer {
+ public:
+  Writer(pid_t parent_pid, JavaHprofDataSource::TraceContext* ctx)
+      : parent_pid_(parent_pid), ctx_(ctx) {}
+
+  perfetto::protos::pbzero::HeapGraph* GetHeapGraph() {
+    if (!heap_graph_ || ++objects_written_ % kObjectsPerPacket == 0) {
+      if (heap_graph_) {
+        heap_graph_->set_continued(true);
+      }
+      Finalize();
+
+      trace_packet_ = ctx_->NewTracePacket();
+      heap_graph_ = trace_packet_->set_heap_graph();
+      heap_graph_->set_pid(parent_pid_);
+      heap_graph_->set_index(index_++);
+    }
+    return heap_graph_;
+  }
+
+  void Finalize() {
+    if (trace_packet_) {
+      trace_packet_->Finalize();
+    }
+    heap_graph_ = nullptr;
+  }
+
+  ~Writer() { Finalize(); }
+
+ private:
+  const pid_t parent_pid_;
+  JavaHprofDataSource::TraceContext* const ctx_;
+
+  perfetto::DataSource<JavaHprofDataSource>::TraceContext::TracePacketHandle
+      trace_packet_;
+  perfetto::protos::pbzero::HeapGraph* heap_graph_ = nullptr;
+
+  uint64_t index_ = 0;
+  size_t objects_written_ = 0;
+};
+
+class ReferredObjectsFinder {
+ public:
+  explicit ReferredObjectsFinder(
+      std::vector<std::pair<std::string, art::mirror::Object*>>* referred_objects)
+      REQUIRES_SHARED(art::Locks::mutator_lock_)
+      : referred_objects_(referred_objects) {}
+
+  // For art::mirror::Object::VisitReferences.
+  void operator()(art::ObjPtr<art::mirror::Object> obj, art::MemberOffset offset,
+                  bool is_static) const
+      REQUIRES_SHARED(art::Locks::mutator_lock_) {
+    art::mirror::Object* ref = obj->GetFieldObject<art::mirror::Object>(offset);
+    art::ArtField* field;
+    if (is_static) {
+      field = art::ArtField::FindStaticFieldWithOffset(obj->AsClass(), offset.Uint32Value());
+    } else {
+      field = art::ArtField::FindInstanceFieldWithOffset(obj->GetClass(), offset.Uint32Value());
+    }
+    std::string field_name = "";
+    if (field != nullptr) {
+      field_name = field->PrettyField(/*with_type=*/false);
+    }
+    referred_objects_->emplace_back(std::move(field_name), ref);
+  }
+
+  void VisitRootIfNonNull(art::mirror::CompressedReference<art::mirror::Object>* root
+                              ATTRIBUTE_UNUSED) const {}
+  void VisitRoot(art::mirror::CompressedReference<art::mirror::Object>* root
+                     ATTRIBUTE_UNUSED) const {}
+
+ private:
+  // We can use a raw Object* pointer here, because there are no concurrent GC threads after the
+  // fork.
+  std::vector<std::pair<std::string, art::mirror::Object*>>* referred_objects_;
+};
+
+void DumpPerfetto(art::Thread* self) {
+  pid_t parent_pid = getpid();
+  LOG(INFO) << "preparing to dump heap for " << parent_pid;
+
+  // Need to take a heap dump while GC isn't running. See the comment in
+  // Heap::VisitObjects(). Also we need the critical section to avoid visiting
+  // the same object twice. See b/34967844.
+  //
+  // We need to do this before the fork, because otherwise it can deadlock
+  // waiting for the GC, as all other threads get terminated by the clone, but
+  // their locks are not released.
+  art::gc::ScopedGCCriticalSection gcs(self, art::gc::kGcCauseHprof,
+                                       art::gc::kCollectorTypeHprof);
+
+  art::ScopedSuspendAll ssa(__FUNCTION__, /* long_suspend=*/ true);
+
+  pid_t pid = fork();
+  if (pid != 0) {
+    return;
+  }
+
+  // Make sure that this is the first thing we do after forking, so if anything
+  // below hangs, the forked child gets killed by the watchdog.
+  ArmWatchdogOrDie();
+
+  WaitForDataSource(self);
+
+  JavaHprofDataSource::Trace(
+      [parent_pid](JavaHprofDataSource::TraceContext ctx)
+          NO_THREAD_SAFETY_ANALYSIS {
+            LOG(INFO) << "dumping heap for " << parent_pid;
+            Writer writer(parent_pid, &ctx);
+            // Make sure that intern ID 0 (default proto value for a uint64_t) always maps to ""
+            // (default proto value for a string).
+            std::map<std::string, uint64_t> interned_fields{{"", 0}};
+            std::map<std::string, uint64_t> interned_types{{"", 0}};
+
+            art::Runtime::Current()->GetHeap()->VisitObjectsPaused(
+                [&writer, &interned_types, &interned_fields](
+                    art::mirror::Object* obj) REQUIRES_SHARED(art::Locks::mutator_lock_) {
+                  perfetto::protos::pbzero::HeapGraphObject* object_proto =
+                    writer.GetHeapGraph()->add_objects();
+                  object_proto->set_id(reinterpret_cast<uintptr_t>(obj));
+                  object_proto->set_type_id(
+                      FindOrAppend(&interned_types, obj->PrettyTypeOf()));
+                  object_proto->set_self_size(obj->SizeOf());
+
+                  std::vector<std::pair<std::string, art::mirror::Object*>>
+                      referred_objects;
+                  ReferredObjectsFinder objf(&referred_objects);
+                  obj->VisitReferences(objf, art::VoidFunctor());
+                  for (const auto& p : referred_objects) {
+                    object_proto->add_reference_field_id(
+                        FindOrAppend(&interned_fields, p.first));
+                    object_proto->add_reference_object_id(
+                        reinterpret_cast<uintptr_t>(p.second));
+                  }
+                });
+
+            for (const auto& p : interned_fields) {
+              const std::string& str = p.first;
+              uint64_t id = p.second;
+
+              perfetto::protos::pbzero::InternedString* field_proto =
+                writer.GetHeapGraph()->add_field_names();
+              field_proto->set_iid(id);
+              field_proto->set_str(
+                  reinterpret_cast<const uint8_t*>(str.c_str()), str.size());
+            }
+            for (const auto& p : interned_types) {
+              const std::string& str = p.first;
+              uint64_t id = p.second;
+
+              perfetto::protos::pbzero::InternedString* type_proto =
+                writer.GetHeapGraph()->add_type_names();
+              type_proto->set_iid(id);
+              type_proto->set_str(reinterpret_cast<const uint8_t*>(str.c_str()),
+                                  str.size());
+            }
+
+            writer.Finalize();
+
+            ctx.Flush([] {
+              {
+                art::MutexLock lk(JavaHprofDataSource::art_thread(), GetStateMutex());
+                g_state = State::kEnd;
+                GetStateCV().Broadcast(JavaHprofDataSource::art_thread());
+              }
+            });
+          });
+
+  art::MutexLock lk(self, GetStateMutex());
+  while (g_state != State::kEnd) {
+    GetStateCV().Wait(self);
+  }
+  LOG(INFO) << "finished dumping heap for " << parent_pid;
+  // Prevent the atexit handlers from running. We do not want to call cleanup
+  // functions the parent process has registered.
+  _exit(0);
+}
+
+// Plugin entry point: installs the heap-dump signal handler and spawns the listener thread.
+extern "C" bool ArtPlugin_Initialize() {
+  if (art::Runtime::Current() == nullptr) {
+    return false;
+  }
+  art::Thread* self = art::Thread::Current();
+  {
+    art::MutexLock lk(self, GetStateMutex());
+    if (g_state != State::kUninitialized) {
+      LOG(ERROR) << "perfetto_hprof already initialized. state: " << g_state;
+      return false;
+    }
+    g_state = State::kWaitForListener;
+  }
+
+  if (pipe(g_signal_pipe_fds) == -1) {
+    PLOG(ERROR) << "Failed to pipe";
+    return false;
+  }
+
+  struct sigaction act = {};
+  act.sa_flags = SA_SIGINFO | SA_RESTART;  // Needed for sa_sigaction; restart syscalls.
+  act.sa_sigaction = [](int, siginfo_t*, void*) {
+    if (write(g_signal_pipe_fds[1], kByte, sizeof(kByte)) == -1) {
+      PLOG(ERROR) << "Failed to trigger heap dump";
+    }
+  };
+
+  // TODO(fmayer): We can probably use the SignalCatcher thread here to not
+  // have an idle thread.
+  if (sigaction(kJavaHeapprofdSignal, &act, &g_orig_act) != 0) {
+    PLOG(ERROR) << "Failed to sigaction";  // Log first: close() may clobber errno.
+    close(g_signal_pipe_fds[0]);
+    close(g_signal_pipe_fds[1]);
+    return false;
+  }
+
+  std::thread th([] {
+    art::Runtime* runtime = art::Runtime::Current();
+    if (!runtime) {
+      LOG(FATAL_WITHOUT_ABORT) << "no runtime in hprof_listener";
+      return;
+    }
+    if (!runtime->AttachCurrentThread("hprof_listener", /*as_daemon=*/ true,
+                                      runtime->GetSystemThreadGroup(), /*create_peer=*/ false)) {
+      LOG(ERROR) << "failed to attach thread.";
+      return;
+    }
+    art::Thread* self = art::Thread::Current();
+    if (!self) {
+      LOG(FATAL_WITHOUT_ABORT) << "no thread in hprof_listener";
+      return;
+    }
+    {
+      art::MutexLock lk(self, GetStateMutex());
+      if (g_state == State::kWaitForListener) {
+        g_state = State::kWaitForStart;
+        GetStateCV().Broadcast(self);
+      }
+    }
+    char buf[1];
+    for (;;) {
+      int res;
+      do {
+        res = read(g_signal_pipe_fds[0], buf, sizeof(buf));
+      } while (res == -1 && errno == EINTR);
+      if (res <= 0) {
+        if (res == -1) {
+          PLOG(ERROR) << "failed to read";
+        }
+        close(g_signal_pipe_fds[0]);
+        return;
+      }
+
+      perfetto_hprof::DumpPerfetto(self);
+    }
+  });
+  th.detach();
+
+  art::MutexLock lk(art::Thread::Current(), GetStateMutex());
+  while (g_state == State::kWaitForListener) {
+    GetStateCV().Wait(art::Thread::Current());
+  }
+  return true;
+}
+
+extern "C" bool ArtPlugin_Deinitialize() {
+  if (sigaction(kJavaHeapprofdSignal, &g_orig_act, nullptr) != 0) {
+    PLOG(ERROR) << "failed to reset signal handler";
+    // We cannot close the pipe if the signal handler wasn't unregistered,
+    // to avoid receiving SIGPIPE.
+    return false;
+  }
+  close(g_signal_pipe_fds[1]);
+
+  art::Thread* self = art::Thread::Current();
+  art::MutexLock lk(self, GetStateMutex());
+  if (g_state != State::kWaitForListener) {
+    g_state = State::kUninitialized;
+    GetStateCV().Broadcast(self);
+  }
+  return true;
+}
+
+}  // namespace perfetto_hprof
+
+namespace perfetto {
+
+PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(perfetto_hprof::JavaHprofDataSource);
+
+}
diff --git a/perfetto_hprof/perfetto_hprof.h b/perfetto_hprof/perfetto_hprof.h
new file mode 100644
index 0000000..1713286
--- /dev/null
+++ b/perfetto_hprof/perfetto_hprof.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2019 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_PERFETTO_HPROF_PERFETTO_HPROF_H_
+#define ART_PERFETTO_HPROF_PERFETTO_HPROF_H_
+
+#include <ostream>
+
+namespace perfetto_hprof {
+
+enum class State {
+  // Worker thread not spawned.
+  kUninitialized,
+  // Worker thread spawned, waiting for ACK.
+  kWaitForListener,
+  // Worker thread ready, waiting for data-source.
+  kWaitForStart,
+  // These are only in the forked process:
+  // Data source received, start dump.
+  kStart,
+  // Dump finished. Kill forked child.
+  kEnd,
+};
+
+std::ostream& operator<<(std::ostream&, const State&);
+
+}  // namespace perfetto_hprof
+
+#endif  // ART_PERFETTO_HPROF_PERFETTO_HPROF_H_
diff --git a/runtime/native/dalvik_system_ZygoteHooks.cc b/runtime/native/dalvik_system_ZygoteHooks.cc
index ca331df..156895d 100644
--- a/runtime/native/dalvik_system_ZygoteHooks.cc
+++ b/runtime/native/dalvik_system_ZygoteHooks.cc
@@ -149,6 +149,7 @@
   HIDDEN_API_ENFORCEMENT_POLICY_MASK = (1 << 12)
                                      | (1 << 13),
   PROFILE_SYSTEM_SERVER              = 1 << 14,
+  PROFILE_FROM_SHELL                 = 1 << 15,
   USE_APP_IMAGE_STARTUP_CACHE        = 1 << 16,
   DEBUG_IGNORE_APP_SIGNAL_HANDLER    = 1 << 17,
 
@@ -241,6 +242,9 @@
     runtime_flags &= ~DEBUG_IGNORE_APP_SIGNAL_HANDLER;
   }
 
+  runtime->SetProfileableFromShell((runtime_flags & PROFILE_FROM_SHELL) != 0);
+  runtime_flags &= ~PROFILE_FROM_SHELL;
+
   return runtime_flags;
 }
 
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index e7ce33a..c5d87fc 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -1027,6 +1027,14 @@
   // this to come last.
   ScopedObjectAccess soa(Thread::Current());
   GetRuntimeCallbacks()->StartDebugger();
+
+  if (Dbg::IsJdwpAllowed() || IsProfileableFromShell() || IsJavaDebuggable()) {
+    std::string err;
+    ScopedThreadSuspension sts(Thread::Current(), ThreadState::kNative);
+    if (!EnsurePerfettoPlugin(&err)) {
+      LOG(WARNING) << "Failed to load perfetto_hprof: " << err;
+    }
+  }
 }
 
 void Runtime::StartSignalCatcher() {
@@ -1765,18 +1773,30 @@
   return true;
 }
 
-static bool EnsureJvmtiPlugin(Runtime* runtime,
-                              std::vector<Plugin>* plugins,
-                              std::string* error_msg) {
-  constexpr const char* plugin_name = kIsDebugBuild ? "libopenjdkjvmtid.so" : "libopenjdkjvmti.so";
-
+bool Runtime::EnsurePluginLoaded(const char* plugin_name, std::string* error_msg) {
   // Is the plugin already loaded?
-  for (const Plugin& p : *plugins) {
+  for (const Plugin& p : plugins_) {
     if (p.GetLibrary() == plugin_name) {
       return true;
     }
   }
+  Plugin new_plugin = Plugin::Create(plugin_name);
 
+  if (!new_plugin.Load(error_msg)) {
+    return false;
+  }
+  plugins_.push_back(std::move(new_plugin));
+  return true;
+}
+
+bool Runtime::EnsurePerfettoPlugin(std::string* error_msg) {
+  constexpr const char* plugin_name = kIsDebugBuild ?
+    "libperfetto_hprofd.so" : "libperfetto_hprof.so";
+  return EnsurePluginLoaded(plugin_name, error_msg);
+}
+
+static bool EnsureJvmtiPlugin(Runtime* runtime,
+                              std::string* error_msg) {
   // TODO Rename Dbg::IsJdwpAllowed is IsDebuggingAllowed.
   DCHECK(Dbg::IsJdwpAllowed() || !runtime->IsJavaDebuggable())
       << "Being debuggable requires that jdwp (i.e. debugging) is allowed.";
@@ -1787,14 +1807,8 @@
     return false;
   }
 
-  Plugin new_plugin = Plugin::Create(plugin_name);
-
-  if (!new_plugin.Load(error_msg)) {
-    return false;
-  }
-
-  plugins->push_back(std::move(new_plugin));
-  return true;
+  constexpr const char* plugin_name = kIsDebugBuild ? "libopenjdkjvmtid.so" : "libopenjdkjvmti.so";
+  return runtime->EnsurePluginLoaded(plugin_name, error_msg);
 }
 
 // Attach a new agent and add it to the list of runtime agents
@@ -1805,7 +1819,7 @@
 //
 void Runtime::AttachAgent(JNIEnv* env, const std::string& agent_arg, jobject class_loader) {
   std::string error_msg;
-  if (!EnsureJvmtiPlugin(this, &plugins_, &error_msg)) {
+  if (!EnsureJvmtiPlugin(this, &error_msg)) {
     LOG(WARNING) << "Could not load plugin: " << error_msg;
     ScopedObjectAccess soa(Thread::Current());
     ThrowIOException("%s", error_msg.c_str());
diff --git a/runtime/runtime.h b/runtime/runtime.h
index 6735216..120ca66 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -125,6 +125,9 @@
   static bool Create(const RuntimeOptions& raw_options, bool ignore_unrecognized)
       SHARED_TRYLOCK_FUNCTION(true, Locks::mutator_lock_);
 
+  bool EnsurePluginLoaded(const char* plugin_name, std::string* error_msg);
+  bool EnsurePerfettoPlugin(std::string* error_msg);
+
   // IsAotCompiler for compilers that don't have a running runtime. Only dex2oat currently.
   bool IsAotCompiler() const {
     return !UseJitCompilation() && IsCompiler();
@@ -691,6 +694,14 @@
     return is_java_debuggable_;
   }
 
+  void SetProfileableFromShell(bool value) {
+    is_profileable_from_shell_ = value;
+  }
+
+  bool IsProfileableFromShell() const {
+    return is_profileable_from_shell_;
+  }
+
   void SetJavaDebuggable(bool value);
 
   // Deoptimize the boot image, called for Java debuggable apps.
@@ -1157,6 +1168,8 @@
   // Whether Java code needs to be debuggable.
   bool is_java_debuggable_;
 
+  bool is_profileable_from_shell_ = false;
+
   // The maximum number of failed boots we allow before pruning the dalvik cache
   // and trying again. This option is only inspected when we're running as a
   // zygote.