#!/usr/bin/env python3
#
# Copyright (C) 2022 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
16
"""
Checks DWARF CFI (unwinding) information by comparing it to disassembly.
It is only a simple heuristic check of stack pointer adjustments.
Fully inferring CFI from disassembly is not possible in general.
"""
22
import bisect
import collections
import os
import pathlib
import re
import subprocess
import sys
from typing import List, Optional, Set, Tuple
25
# One row of the DWARF line table: instruction address, source file name,
# line number (kept as the decimal string printed by the tool) and row flags.
Source = collections.namedtuple("Source", ["addr", "file", "line", "flag"])


def get_source(lib: pathlib.Path) -> List[Source]:
    """Get source-file and line-number for all hand-written assembly code.

    Runs ``llvm-dwarfdump --debug-line`` on `lib` and parses its text output.
    Only line tables that list at least one file ending in ".S" are used.

    Args:
        lib: Path of the library to inspect.

    Returns:
        The parsed rows, sorted as tuples (address first). Rows whose flags
        contain "end_sequence" are dropped.

    Raises:
        subprocess.CalledProcessError: If llvm-dwarfdump exits with a
            non-zero status.
    """
    proc = subprocess.run(["llvm-dwarfdump", "--debug-line", lib],
                          encoding='utf-8',
                          capture_output=True,
                          check=True)

    section_re = re.compile(r"^debug_line\[0x[0-9a-f]+\]$", re.MULTILINE)
    # Capture the whole file index. A repeated one-digit group would keep only
    # the last digit, so indices >= 10 would never match the table rows below.
    filename_re = re.compile(r'file_names\[ *(\d+)\]:\n\s*name: "(.*)"', re.MULTILINE)
    line_re = re.compile(r'0x([0-9a-f]{16}) +(\d+) +\d+ +(\d+)'  # addr, line, column, file
                         r' +\d+ +\d +(.*)')                     # isa, discriminator, flag

    results = []
    for section in section_re.split(proc.stdout):
        files = {m[1]: m[2] for m in filename_re.finditer(section)}
        if not any(name.endswith(".S") for name in files.values()):
            continue  # This line table has no assembly sources.
        results.extend(Source(int(addr, 16), files[file_index], line, flag)
                       for addr, line, file_index, flag in line_re.findall(section))
    return sorted(src for src in results if "end_sequence" not in src.flag)
49
# One FDE: start address, end address and the dumped text of the whole block.
Fde = collections.namedtuple("Fde", ["addr", "end", "data"])


def get_fde(lib: pathlib.Path) -> List[Fde]:
    """Get all unwinding FDE blocks (in dumped text-based format).

    Runs ``llvm-dwarfdump --debug-frame`` on `lib` and splits its output into
    blocks at every new-line that is not followed by a space or another
    new-line. Blocks whose first line is an FDE header with a
    ``pc=<start>...<end>`` range are kept.

    Args:
        lib: Path of the library to inspect.

    Returns:
        The FDE blocks sorted by start address. Blocks starting at address 0
        are skipped (NOTE(review): presumably discarded code - confirm).

    Raises:
        subprocess.CalledProcessError: If llvm-dwarfdump exits with a
            non-zero status.
    """
    proc = subprocess.run(["llvm-dwarfdump", "--debug-frame", lib],
                          encoding='utf-8',
                          capture_output=True,
                          check=True)

    section_re = re.compile("\n(?! |\n)", re.MULTILINE)  # New-line not followed by indent.
    # The three dots of the pc range are literal, so they must be escaped.
    fde_re = re.compile(r".* FDE .* pc=([0-9a-f]+)\.\.\.([0-9a-f]+)")

    results = []
    for section in section_re.split(proc.stdout):
        m = fde_re.match(section)
        if m:
            fde = Fde(int(m[1], 16), int(m[2], 16), section)
            if fde.addr != 0:
                results.append(fde)
    return sorted(results)
71
# One disassembled symbol: start address, symbol name and its dumped text.
Asm = collections.namedtuple("Asm", ["addr", "name", "data"])


def get_asm(lib: pathlib.Path) -> List[Asm]:
    """Get disassembly for all methods (in dumped text-based format).

    Runs ``llvm-objdump --disassemble`` on `lib`, cuts the output at every
    new-line that is not followed by a space or another new-line, and keeps
    the chunks that begin with an ``<address> <symbol>:`` header.

    Returns:
        The chunks as Asm tuples, sorted (address first).

    Raises:
        subprocess.CalledProcessError: If llvm-objdump exits with a non-zero
            status.
    """
    dump = subprocess.run(["llvm-objdump", "--disassemble", lib],
                          encoding='utf-8',
                          capture_output=True,
                          check=True)

    chunk_split_re = re.compile("\n(?! |\n)", re.MULTILINE)  # New-line not followed by indent.
    header_re = re.compile("([0-9a-f]+) <(.+)>:")

    found = []
    for chunk in chunk_split_re.split(dump.stdout):
        header = header_re.match(chunk)
        if header is None:
            continue  # Not a symbol (e.g. a section banner).
        found.append(Asm(int(header[1], 16), header[2], chunk))
    found.sort()
    return found
91
# One CFA row of an FDE: address and the textual CFA expression (e.g. "WSP+16").
Cfa = collections.namedtuple("Cfa", ["addr", "cfa"])

# Matches "0x<addr>: CFA=<expr>", where <expr> ends at whitespace or ':'.
# Compiled once because get_cfa runs for every checked FDE.
_CFA_RE = re.compile(r"0x([0-9a-f]+): CFA=([^\s:]+)")


def get_cfa(fde: Fde) -> List[Cfa]:
    """Extract individual CFA (SP+offset) entries from the FDE block.

    Args:
        fde: FDE whose dumped text (`fde.data`) is scanned.

    Returns:
        One Cfa per "0x<addr>: CFA=<expr>" occurrence, in order of appearance.
    """
    return [Cfa(int(addr, 16), cfa) for addr, cfa in _CFA_RE.findall(fde.data)]
99
# One disassembled instruction: address, instruction text and owning symbol.
Inst = collections.namedtuple("Inst", ["addr", "inst", "symbol"])

# Compiled once because get_instructions runs for every disassembled symbol.
_BLANKS_RE = re.compile(r"[ \t]+")
# "<addr>: " followed by optional two-digit hex byte columns, then the text.
_INST_RE = re.compile(r"([0-9a-f]+): +(?:[0-9a-f]{2} +)*(.*)")


def get_instructions(asm: Asm) -> List[Inst]:
    """Extract individual instructions from disassembled code block.

    Runs of spaces and tabs in `asm.data` are first collapsed to one space.

    Args:
        asm: Disassembled symbol; `asm.data` is parsed and `asm.name` is
            attached to every instruction.

    Returns:
        One Inst per "<addr>: ..." line, in order of appearance.
    """
    data = _BLANKS_RE.sub(" ", asm.data)
    return [Inst(int(addr, 16), inst, asm.name) for addr, inst in _INST_RE.findall(data)]
108
# CFA offset valid from `addr` onwards; `offset` is None when it is unknown.
CfaOffset = collections.namedtuple("CfaOffset", ["addr", "offset"])


def get_dwarf_cfa_offsets(cfas: List[Cfa]) -> List[CfaOffset]:
    """Parse textual CFA entries into integer stack offsets.

    Args:
        cfas: (address, CFA expression) pairs.

    Returns:
        One CfaOffset per input entry, in the same order. The offset is 0 for
        a bare "SP"/"WSP", the integer after the '+' for "SP+<n>"/"WSP+<n>",
        and None for any other expression.
    """
    result = []
    for addr, cfa in cfas:
        if cfa in ("WSP", "SP"):
            offset = 0
        elif cfa.startswith(("WSP+", "SP+")):
            offset = int(cfa.split("+")[1])
        else:
            offset = None  # Not a plain stack-pointer based CFA.
        result.append(CfaOffset(addr, offset))
    return result
123
# Regular expressions which find instructions that adjust stack pointer, each
# paired with a function that turns the match into the change of the offset.
# Order matters: the first matching pattern wins. Built once at import time
# instead of on every call.
_SP_ADJUSTMENTS = [
    (re.compile(r"sub sp,(?: sp,)? #(\d+)"), lambda m: int(m[1])),
    (re.compile(r"add sp,(?: sp,)? #(\d+)"), lambda m: -int(m[1])),
    (re.compile(r"str \w+, \[sp, #-(\d+)\]!"), lambda m: int(m[1])),
    (re.compile(r"ldr \w+, \[sp\], #(\d+)"), lambda m: -int(m[1])),
    (re.compile(r"stp \w+, \w+, \[sp, #-(\d+)\]!"), lambda m: int(m[1])),
    (re.compile(r"ldp \w+, \w+, \[sp\], #(\d+)"), lambda m: -int(m[1])),
    (re.compile(r"vpush \{([d0-9, ]*)\}"), lambda m: 8 * len(m[1].split(","))),
    (re.compile(r"vpop \{([d0-9, ]*)\}"), lambda m: -8 * len(m[1].split(","))),
    (re.compile(r"v?push(?:\.w)? \{([\w+, ]*)\}"), lambda m: 4 * len(m[1].split(","))),
    (re.compile(r"v?pop(?:\.w)? \{([\w+, ]*)\}"), lambda m: -4 * len(m[1].split(","))),
]

# Regular expression which identifies branches (target is the last group set).
_JMP_RE = re.compile(r"cb\w* \w+, 0x(\w+)|(?:b|bl|b\w\w) 0x(\w+)")


def get_infered_cfa_offsets(insts: List[Inst]) -> List[CfaOffset]:
    """Heuristic to convert disassembly into stack offsets.

    Walks the instructions in order, starting at offset 0, and adds the
    adjustment of every recognised stack-pointer instruction. The offset at a
    branch is remembered for its target address and restored when that
    address is reached later in the walk.

    Args:
        insts: Instructions in address order.

    Returns:
        A CfaOffset for the first instruction, followed by one entry for every
        instruction at which the offset differs from the previous entry.
        Empty when `insts` is empty.
    """
    if not insts:
        return []  # Nothing to infer; avoids indexing insts[0] below.

    offset, future_offset = 0, {}
    result = [CfaOffset(insts[0].addr, offset)]
    for addr, inst, _ in insts:
        # Previous code branched here, so use that offset instead.
        # This likely identifies slow-path which is after return.
        if addr in future_offset:
            offset = future_offset[addr]

        # Add entry to output (only if the offset changed).
        if result[-1].offset != offset:
            result.append(CfaOffset(addr, offset))

        # Adjust offset if the instruction modifies stack pointer.
        for rexpr, adjust_offset in _SP_ADJUSTMENTS:
            m = rexpr.match(inst)
            if m:
                offset += adjust_offset(m)
                break  # First matched pattern wins.

        # Record branches. We only support forward edges for now.
        m = _JMP_RE.match(inst)
        if m:
            future_offset[int(m[m.lastindex], 16)] = offset
    return result
169
def check_fde(fde: Fde, insts: List[Inst], srcs,
              verbose: bool = False) -> Tuple[Optional[str], Set[int]]:
    """Compare DWARF offsets to assembly-inferred offsets. Report differences.

    Args:
        fde: FDE block whose CFA rows are checked.
        insts: Instructions covered by the FDE, in address order.
        srcs: Mapping from instruction address to a "file:line" string. Only
            addresses present in it can be reported as errors.
        verbose: If true, print one line per instruction with both offsets.

    Returns:
        A pair (error, seen_addrs). `error` is "<address> <source>" for the
        first reportable mismatch, or None when there is none. `seen_addrs`
        holds the addresses of all visited instructions.
    """
    error, seen_addrs = None, set()
    if not insts:
        return error, seen_addrs  # Nothing to compare.

    cfas = get_cfa(fde)
    i, dwarf_cfa = 0, get_dwarf_cfa_offsets(cfas)
    j, infered_cfa = 0, get_infered_cfa_offsets(insts)
    for inst in insts:
        seen_addrs.add(inst.addr)
        # Advance both cursors to the last entry at or before this address.
        while i+1 < len(dwarf_cfa) and dwarf_cfa[i+1].addr <= inst.addr:
            i += 1
        while j+1 < len(infered_cfa) and infered_cfa[j+1].addr <= inst.addr:
            j += 1
        # An FDE without any CFA row is treated as "offset unknown".
        dwarf_offset = dwarf_cfa[i].offset if dwarf_cfa else None
        infered_offset = infered_cfa[j].offset
        if verbose:
            print("{:08x}: dwarf={:4} infered={:4} {:40} // {}".format(
                inst.addr, str(dwarf_offset), str(infered_offset),
                inst.inst.strip(), srcs.get(inst.addr, "")))
        if dwarf_offset is not None and dwarf_offset != infered_offset:
            if inst.addr in srcs:  # Only report if it maps to source code (not padding or literals).
                error = error or "{:08x} {}".format(inst.addr, srcs.get(inst.addr, ""))
    return error, seen_addrs
191
def check_lib(lib: pathlib.Path) -> None:
    """Check the CFI of all hand-written assembly code in one library.

    For every FDE that starts at an address with assembly source information,
    the covered disassembly is compared with the DWARF CFA offsets. Each
    mismatch is printed as "ERROR at <symbol> <address> <source>" followed by
    a verbose per-instruction listing. Finally, every source address that was
    not covered by any checked or ignored code is printed as "Missing CFI".

    Args:
        lib: Path of the library to check.

    Raises:
        FileNotFoundError: If `lib` does not exist.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not lib.exists():
        raise FileNotFoundError("Library not found: {}".format(lib))

    # Symbol-name prefixes which the heuristic cannot handle.
    IGNORE = (
        "art_quick_throw_null_pointer_exception_from_signal",  # Starts with non-zero offset.
        "art_quick_generic_jni_trampoline",  # Saves/restores SP in other register.
        "nterp_op_",  # Uses calculated CFA due to dynamic stack size.
        "$d.",  # Data (literals) interleaved within code.
    )
    fdes = get_fde(lib)
    asms = collections.deque(get_asm(lib))
    srcs = {src.addr: src.file + ":" + src.line for src in get_source(lib)}
    seen = set()  # Used to verify that we have covered all assembly source lines.

    for fde in fdes:
        if fde.addr not in srcs:
            continue  # Ignore if it is not hand-written assembly.

        # Assembly instructions (one FDE can cover several assembly chunks).
        all_insts, name = [], None
        while asms and asms[0].addr < fde.end:
            asm = asms.popleft()
            if asm.addr < fde.addr:
                continue  # Chunk lies before this FDE.
            insts = get_instructions(asm)
            if asm.name.startswith(IGNORE):
                # Ignored code still counts as covered.
                seen.update(inst.addr for inst in insts)
                continue
            all_insts.extend(insts)
            name = name or asm.name
        if not all_insts:
            continue  # No assembly

        # Compare DWARF data to assembly instructions
        error, seen_addrs = check_fde(fde, all_insts, srcs)
        if error:
            print("ERROR at " + name + " " + error)
            check_fde(fde, all_insts, srcs, True)  # Print the verbose listing.
            print("")
        seen.update(seen_addrs)
    for addr in sorted(srcs.keys() - seen):
        print("Missing CFI for {:08x}: {}".format(addr, srcs[addr]))
233
234
def main(argv):
    """Check libraries provided on the command line, or use the default build output.

    Args:
        argv: Command line; argv[0] is skipped and every other item is the
            path of a library to check. When no path is given, the 32-bit and
            64-bit libart.so under $OUT are checked.

    Raises:
        SystemExit: If no library is given and the OUT environment variable
            is not set.
    """
    libs = argv[1:]
    if not libs:
        out = os.environ.get("OUT")
        if out is None:
            # Clear message instead of a bare KeyError traceback.
            raise SystemExit("No library given and the OUT environment variable is not set")
        libs.append(out + "/symbols/apex/com.android.art/lib/libart.so")
        libs.append(out + "/symbols/apex/com.android.art/lib64/libart.so")
    for lib in libs:
        check_lib(pathlib.Path(lib))
245
# Script entry point. Use the public sys.argv rather than reaching for the
# sys module through the os module.
if __name__ == "__main__":
    main(sys.argv)