Modify the code to avoid potential deadlocks.
If the signal to dump a thread is never delivered, then it's
possible for a deadlock. The signal handler is responsible for
unlocking and deleting the ThreadEntry created for the pid/tid
combination. This means if the signal is lost, the ThreadEntry
gets stuck locked and never deleted. If a second attempt to get
a backtrace of this thread occurs, there is a deadlock.
Also, decrease the timeout from 10 seconds to 5 seconds. The original
10 seconds was because the unwind was actually done in the signal
handler. Now the signal handler does nothing but copy the ucontext
structure and let the caller do the unwind.
Bug: 21086132
Change-Id: Idc735dbf6147ec879d35bd4f034c5d227e26a98d
diff --git a/libbacktrace/BacktraceCurrent.cpp b/libbacktrace/BacktraceCurrent.cpp
index 95cd4d1..2714d93 100644
--- a/libbacktrace/BacktraceCurrent.cpp
+++ b/libbacktrace/BacktraceCurrent.cpp
@@ -96,7 +96,7 @@
static void SignalHandler(int, siginfo_t*, void* sigcontext) {
ThreadEntry* entry = ThreadEntry::Get(getpid(), gettid(), false);
if (!entry) {
- BACK_LOGW("Unable to find pid %d tid %d information", getpid(), gettid());
+ BACK_LOGE("pid %d, tid %d entry not found", getpid(), gettid());
return;
}
@@ -109,9 +109,14 @@
// the thread run ahead causing problems.
// The number indicates that we are waiting for the second Wake() call
// overall which is made by the thread requesting an unwind.
- entry->Wait(2);
-
- ThreadEntry::Remove(entry);
+ if (entry->Wait(2)) {
+ // Do not remove the entry here because that can result in a deadlock
+ // if the code cannot properly send a signal to the thread under test.
+ entry->Wake();
+ } else {
+ // At this point, it is possible that entry has been freed, so just exit.
+ BACK_LOGE("Timed out waiting for unwind thread to indicate it completed.");
+ }
}
bool BacktraceCurrent::UnwindThread(size_t num_ignore_frames) {
@@ -128,17 +133,15 @@
act.sa_flags = SA_RESTART | SA_SIGINFO | SA_ONSTACK;
sigemptyset(&act.sa_mask);
if (sigaction(THREAD_SIGNAL, &act, &oldact) != 0) {
- BACK_LOGW("sigaction failed %s", strerror(errno));
- entry->Unlock();
+ BACK_LOGE("sigaction failed: %s", strerror(errno));
ThreadEntry::Remove(entry);
pthread_mutex_unlock(&g_sigaction_mutex);
return false;
}
if (tgkill(Pid(), Tid(), THREAD_SIGNAL) != 0) {
- BACK_LOGW("tgkill %d failed: %s", Tid(), strerror(errno));
+ BACK_LOGE("tgkill %d failed: %s", Tid(), strerror(errno));
sigaction(THREAD_SIGNAL, &oldact, nullptr);
- entry->Unlock();
ThreadEntry::Remove(entry);
pthread_mutex_unlock(&g_sigaction_mutex);
return false;
@@ -146,17 +149,30 @@
// Wait for the thread to get the ucontext. The number indicates
// that we are waiting for the first Wake() call made by the thread.
- entry->Wait(1);
+ bool wait_completed = entry->Wait(1);
// After the thread has received the signal, allow other unwinders to
// continue.
sigaction(THREAD_SIGNAL, &oldact, nullptr);
pthread_mutex_unlock(&g_sigaction_mutex);
- bool unwind_done = UnwindFromContext(num_ignore_frames, entry->GetUcontext());
+ bool unwind_done = false;
+ if (wait_completed) {
+ unwind_done = UnwindFromContext(num_ignore_frames, entry->GetUcontext());
- // Tell the signal handler to exit and release the entry.
- entry->Wake();
+ // Tell the signal handler to exit and release the entry.
+ entry->Wake();
+
+ // Wait for the thread to indicate it is done with the ThreadEntry.
+ if (!entry->Wait(3)) {
+ // Send a warning, but do not mark as a failure to unwind.
+ BACK_LOGW("Timed out waiting for signal handler to indicate it finished.");
+ }
+ } else {
+ BACK_LOGE("Timed out waiting for signal handler to get ucontext data.");
+ }
+
+ ThreadEntry::Remove(entry);
return unwind_done;
}