0%

Android tombstone分析

前言

Android 进程在发生异常 Crash 时,debuggerd 会在 /data/tombstone 目录下生成对应的 tombstone 文件,保存该进程的基本信息、调用栈信息以及stack等。

tombstone

以笔者最近遇到的问题为例,截取 tombstone 文件部分信息如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
pid: 3148, tid: 3365, name: TimeCheckThread  >>> /system/bin/audioserver <<<
signal 6 (SIGABRT), code -6 (SI_TKILL), fault addr --------
Abort message: 'TimeCheck timeout for IAudioFlinger'
r0 00000000 r1 00000d25 r2 00000006 r3 ea39d918
r4 00000c4c r5 00000d25 r6 e7d424a4 r7 0000010c
r8 e91edcfa r9 e8a11f78 r10 3b9aca00 r11 e959bd69
ip 7fffffff sp e7d42490 lr ea32c0e5 pc ea323e72

backtrace:
#00 pc 0001ce72 /system/lib/libc.so (abort+62)
#01 pc 00006dd5 /system/lib/liblog.so (__android_log_assert+156)
#02 pc 0000e307 /system/lib/libmedia_helper.so (android::TimeCheck::TimeCheckThread::threadLoop()+270)
#03 pc 0000c1bf /system/lib/libutils.so (android::Thread::_threadLoop(void*)+286)
#04 pc 00063c15 /system/lib/libc.so (__pthread_start(void*)+22)
#05 pc 0001e065 /system/lib/libc.so (__start_thread+22)

stack:
e7d42450 e8a11f78 [anon:libc_malloc]
e7d42454 3b9aca00
e7d42458 e959bd69 /system/lib/libutils.so (_ZN13thread_data_t10trampolineEPKS_)
e7d4245c e8cd4b6d /system/lib/liblog.so (__android_log_buf_write+252)
e7d42460 000003dc
e7d42464 ffff0208 [vectors]
e7d42468 00000007
e7d4246c 000003ff
e7d42470 7fffffdf
e7d42474 fffffff8
e7d42478 00000000
e7d4247c 6a6b4bdd
e7d42480 00000c4c
e7d42484 00000d25
e7d42488 e7d424a4 <anonymous:e7c46000>
e7d4248c ea323e63 /system/lib/libc.so (abort+46)
#00 e7d42490 00000001
........ ........
#01 e7d424c0 ea0d81a9 /system/lib/libmedia_helper.so
........ ........
#02 e7d428f0 e8a0b908 [anon:libc_malloc]
........ ........
#03 e7d42920 e7d4291c <anonymous:e7c46000>
........ ........
#04 e7d42958 ea36abff /system/lib/libc.so (_ZL15__pthread_startPv)
........ ........
#05 e7d42960 e7d42978 <anonymous:e7c46000>
e7d42964 ea36abff /system/lib/libc.so (_ZL15__pthread_startPv)
e7d42968 e7d42970 <anonymous:e7c46000>
e7d4296c 00000000
e7d42970 e807f970 <anonymous:e7f83000>
e7d42974 e7c44970 <anonymous:e7b48000>
e7d42978 ffffffff
e7d4297c 00000000
e7d42980 00000001
e7d42984 e7c45000 [anon:thread stack guard]
e7d42988 000fd970
e7d4298c 00001000
e7d42990 00000000
e7d42994 00000000
e7d42998 00000003
e7d4299c 00000000

文件记录了发生问题的进程信息,因为 tid 和 pid 并不一致,这里是 audioserver 的线程,而提示出现问题的原因是 SIGABORT。

这里保存了进程异常时的寄存器和栈回溯信息,定位到异常现场是 TimeCheckThread 线程中触发了 SIGABORT,那么如何进一步排查呢?这里需要借助别的工具。

addr2line

addr2line 用来获得动态库或者可执行文件中指定地址对应的源码信息,可以快速定位到出现问题的代码。

  • #05 pc 0001e065 /system/lib/libc.so (__start_thread+22)

首先看上面这条信息,可以通过 addr2line 工具获取 libc.so 动态库中 0001e065 地址对应的源代码行数。

这里需要注意不要使用 out/target/product/xxx/system/lib/libc.so, 因为这里的so都是经过 strip 之后生成的,去除了很多调试信息,所以直接使用 out/target/product/xxx/symbols/system/lib/libc.so 文件解析。

1
2
3
xujinlong@dagobah:~/android$ ./prebuilts/gcc/linux-x86/arm/arm-linux-androideabi-4.9/bin/arm-linux-androideabi-addr2line -f -e out/target/product/xxx/symbols/system/lib/libc.so 0001e065
__start_thread
bionic/libc/bionic/clone.cpp:52

对应代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
43 // Called from the __bionic_clone assembler to call the thread function then exit.
44 extern "C" __LIBC_HIDDEN__ void __start_thread(int (*fn)(void*), void* arg) {
45 BIONIC_STOP_UNWIND;
46
47 pthread_internal_t* self = __get_thread();
48 if (self && self->tid == -1) {
49 self->tid = syscall(__NR_gettid);
50 }
51
52 int status = (*fn)(arg);
53 __exit(status);
54 }
  • #04 pc 00063c15 /system/lib/libc.so (__pthread_start(void*)+22)
1
2
3
xujinlong@dagobah:~/s905d3$ ./prebuilts/gcc/linux-x86/arm/arm-linux-androideabi-4.9/bin/arm-linux-androideabi-addr2line -f -e out/target/product/xxx/symbols/system/lib/libc.so 00063c15
_ZL15__pthread_startPv
bionic/libc/bionic/pthread_create.cpp:254

对应的代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
243 static int __pthread_start(void* arg) {
244 pthread_internal_t* thread = reinterpret_cast<pthread_internal_t*>(arg);
245
246 // Wait for our creating thread to release us. This lets it have time to
247 // notify gdb about this thread before we start doing anything.
248 // This also provides the memory barrier needed to ensure that all memory
249 // accesses previously made by the creating thread are visible to us.
250 thread->startup_handshake_lock.lock();
251
252 __init_alternate_signal_stack(thread);
253
254 void* result = thread->start_routine(thread->start_routine_arg);
255 pthread_exit(result);
256
257 return 0;
258 }
  • #03 pc 0000c1bf /system/lib/libutils.so (android::Thread::_threadLoop(void*)+286)
1
2
3
xujinlong@dagobah:~/s905d3$ ./prebuilts/gcc/linux-x86/arm/arm-linux-androideabi-4.9/bin/arm-linux-androideabi-addr2line -f -e out/target/product/xxx/symbols/system/lib/libutils.so 0000c1bf
_ZN7android6Thread11_threadLoopEPv
system/core/libutils/Threads.cpp:747

对应的函数如下,执行到 747 行代码,调用 self->threadLoop()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
711 int Thread::_threadLoop(void* user)
712 {
713 Thread* const self = static_cast<Thread*>(user);
714
715 sp<Thread> strong(self->mHoldSelf);
716 wp<Thread> weak(strong);
717 self->mHoldSelf.clear();
718
719 #if defined(__ANDROID__)
720 // this is very useful for debugging with gdb
721 self->mTid = gettid();
722 #endif
723
724 bool first = true;
725
726 do {
727 bool result;
728 if (first) {
729 first = false;
730 self->mStatus = self->readyToRun();
731 result = (self->mStatus == NO_ERROR);
732
733 if (result && !self->exitPending()) {
734 // Binder threads (and maybe others) rely on threadLoop
735 // running at least once after a successful ::readyToRun()
736 // (unless, of course, the thread has already been asked to exit
737 // at that point).
738 // This is because threads are essentially used like this:
739 // (new ThreadSubclass())->run();
740 // The caller therefore does not retain a strong reference to
741 // the thread and the thread would simply disappear after the
742 // successful ::readyToRun() call instead of entering the
743 // threadLoop at least once.
744 result = self->threadLoop();
745 }
746 } else {
747 result = self->threadLoop();
748 }
749
750 // establish a scope for mLock
751 {
752 Mutex::Autolock _l(self->mLock);
753 if (result == false || self->mExitPending) {
754 self->mExitPending = true;
755 self->mRunning = false;
756 // clear thread ID so that requestExitAndWait() does not exit if
757 // called by a new thread using the same thread ID as this one.
758 self->mThread = thread_id_t(-1);
759 // note that interested observers blocked in requestExitAndWait are
760 // awoken by broadcast, but blocked on mLock until break exits scope
761 self->mThreadExitedCondition.broadcast();
762 break;
763 }
764 }
765
766 // Release our strong reference, to let a chance to the thread
767 // to die a peaceful death.
768 strong.clear();
769 // And immediately, re-acquire a strong reference for the next loop
770 strong = weak.promote();
771 } while(strong != 0);
772
773 return 0;
774 }
  • #02 pc 0000e307 /system/lib/libmedia_helper.so (android::TimeCheck::TimeCheckThread::threadLoop()+270)
1
2
3
xujinlong@dagobah:~/s905d3$ ./prebuilts/gcc/linux-x86/arm/arm-linux-androideabi-4.9/bin/arm-linux-androideabi-addr2line -f -e out/target/product/xxx/symbols/system/lib/libmedia_helper.so 0000e307
_ZN7android9TimeCheck15TimeCheckThread10threadLoopEv
frameworks/av/media/libmedia/TimeCheck.cpp:84

对应代码如下,调用 LOG_ALWAYS_FATAL_IF 函数。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
61 bool TimeCheck::TimeCheckThread::threadLoop()
62 {
63 status_t status = TIMED_OUT;
64 const char *tag;
65 {
66 AutoMutex _l(mMutex);
67
68 if (exitPending()) {
69 return false;
70 }
71
72 nsecs_t endTimeNs = INT64_MAX;
73 // KeyedVector mMonitorRequests is ordered so take first entry as next timeout
74 if (mMonitorRequests.size() != 0) {
75 endTimeNs = mMonitorRequests.keyAt(0);
76 tag = mMonitorRequests.valueAt(0);
77 }
78
79 const nsecs_t waitTimeNs = endTimeNs - systemTime();
80 if (waitTimeNs > 0) {
81 status = mCond.waitRelative(mMutex, waitTimeNs);
82 }
83 }
84 LOG_ALWAYS_FATAL_IF(status != NO_ERROR, "TimeCheck timeout for %s", tag);
85 return true;
86 }

分析

结合 tombstone 文件中其他的日志信息,能够看出 AudioFlinger 线程中发生了 TimeCheck timeout

1
2
3
4
5
6
7
8
01-01 08:00:29.697  3148  3223 I ServiceManager: Waiting for service media.audio_policy...
01-01 08:00:30.699 3148 3223 I ServiceManager: Waiting for service media.audio_policy...
01-01 08:00:31.709 3148 3223 I ServiceManager: Waiting for service media.audio_policy...
01-01 08:00:32.712 3148 3223 I ServiceManager: Waiting for service media.audio_policy...
01-01 08:00:33.717 3148 3223 I ServiceManager: Waiting for service media.audio_policy...
01-01 08:00:33.783 3148 3365 F : TimeCheck timeout for IAudioFlinger
01-01 08:00:33.817 3148 3223 W ServiceManager: Service media.audio_policy didn't start. Returning NULL
01-01 08:00:33.817 3148 3223 W AudioSystem: AudioPolicyService not published, waiting..

结合使用场景,确认问题是系统启动过程中 Audio 设备没有接入引发的,导致 AudioServer 启动过程中无法创建对应的 AudioPolicyService。