David Srbecky | 8bb486a | 2022-01-12 13:31:00 +0000 | [diff] [blame] | 1 | #!/usr/bin/env python3 |
| 2 | # |
| 3 | # Copyright (C) 2022 The Android Open Source Project |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | # you may not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
| 16 | |
| 17 | """ |
| 18 | Checks dwarf CFI (unwinding) information by comparing it to disassembly. |
| 19 | It is only a simple heuristic check of stack pointer adjustments. |
| 20 | Fully inferring CFI from disassembly is not possible in general. |
| 21 | """ |
| 22 | |
| 23 | import os, re, subprocess, collections, pathlib, bisect, collections |
| 24 | from typing import List, Optional, Set, Tuple |
| 25 | |
| 26 | Source = collections.namedtuple("Source", ["addr", "file", "line", "flag"]) |
| 27 | |
| 28 | def get_source(lib: pathlib.Path) -> List[Source]: |
| 29 | """ Get source-file and line-number for all hand-written assembly code. """ |
| 30 | |
| 31 | proc = subprocess.run(["llvm-dwarfdump", "--debug-line", lib], |
| 32 | encoding='utf-8', |
| 33 | capture_output=True, |
| 34 | check=True) |
| 35 | |
| 36 | section_re = re.compile("^debug_line\[0x[0-9a-f]+\]$", re.MULTILINE) |
| 37 | filename_re = re.compile('file_names\[ *(\d)+\]:\n\s*name: "(.*)"', re.MULTILINE) |
| 38 | line_re = re.compile('0x([0-9a-f]{16}) +(\d+) +\d+ +(\d+)' # addr, line, column, file |
| 39 | ' +\d+ +\d +(.*)') # isa, discriminator, flag |
| 40 | |
| 41 | results = [] |
| 42 | for section in section_re.split(proc.stdout): |
| 43 | files = {m[1]: m[2] for m in filename_re.finditer(section)} |
| 44 | if not any(f.endswith(".S") for f in files.values()): |
| 45 | continue |
| 46 | lines = line_re.findall(section) |
| 47 | results.extend([Source(int(a, 16), files[fn], l, fg) for a, l, fn, fg in lines]) |
| 48 | return sorted(filter(lambda line: "end_sequence" not in line.flag, results)) |
| 49 | |
| 50 | Fde = collections.namedtuple("Fde", ["addr", "end", "data"]) |
| 51 | |
| 52 | def get_fde(lib: pathlib.Path) -> List[Fde]: |
| 53 | """ Get all unwinding FDE blocks (in dumped text-based format) """ |
| 54 | |
| 55 | proc = subprocess.run(["llvm-dwarfdump", "--debug-frame", lib], |
| 56 | encoding='utf-8', |
| 57 | capture_output=True, |
| 58 | check=True) |
| 59 | |
| 60 | section_re = re.compile("\n(?! |\n)", re.MULTILINE) # New-line not followed by indent. |
| 61 | fda_re = re.compile(".* FDE .* pc=([0-9a-f]+)...([0-9a-f]+)") |
| 62 | |
| 63 | results = [] |
| 64 | for section in section_re.split(proc.stdout): |
| 65 | m = fda_re.match(section) |
| 66 | if m: |
| 67 | fde = Fde(int(m[1], 16), int(m[2], 16), section) |
| 68 | if fde.addr != 0: |
| 69 | results.append(fde) |
| 70 | return sorted(results) |
| 71 | |
| 72 | Asm = collections.namedtuple("Asm", ["addr", "name", "data"]) |
| 73 | |
| 74 | def get_asm(lib: pathlib.Path) -> List[Asm]: |
| 75 | """ Get disassembly for all methods (in dumped text-based format) """ |
| 76 | |
| 77 | proc = subprocess.run(["llvm-objdump", "--disassemble", lib], |
| 78 | encoding='utf-8', |
| 79 | capture_output=True, |
| 80 | check=True) |
| 81 | |
| 82 | section_re = re.compile("\n(?! |\n)", re.MULTILINE) # New-line not followed by indent. |
| 83 | sym_re = re.compile("([0-9a-f]+) <(.+)>:") |
| 84 | |
| 85 | results = [] |
| 86 | for section in section_re.split(proc.stdout): |
| 87 | sym = sym_re.match(section) |
| 88 | if sym: |
| 89 | results.append(Asm(int(sym[1], 16), sym[2], section)) |
| 90 | return sorted(results) |
| 91 | |
| 92 | Cfa = collections.namedtuple("Cfa", ["addr", "cfa"]) |
| 93 | |
| 94 | def get_cfa(fde: Fde) -> List[Cfa]: |
| 95 | """ Extract individual CFA (SP+offset) entries from the FDE block """ |
| 96 | |
| 97 | cfa_re = re.compile("0x([0-9a-f]+): CFA=([^\s:]+)") |
| 98 | return [Cfa(int(addr, 16), cfa) for addr, cfa in cfa_re.findall(fde.data)] |
| 99 | |
| 100 | Inst = collections.namedtuple("Inst", ["addr", "inst", "symbol"]) |
| 101 | |
| 102 | def get_instructions(asm: Asm) -> List[Inst]: |
| 103 | """ Extract individual instructions from disassembled code block """ |
| 104 | |
| 105 | data = re.sub(r"[ \t]+", " ", asm.data) |
| 106 | inst_re = re.compile(r"([0-9a-f]+): +(?:[0-9a-f]{2} +)*(.*)") |
| 107 | return [Inst(int(addr, 16), inst, asm.name) for addr, inst in inst_re.findall(data)] |
| 108 | |
| 109 | CfaOffset = collections.namedtuple("CfaOffset", ["addr", "offset"]) |
| 110 | |
| 111 | def get_dwarf_cfa_offsets(cfas: List[Cfa]) -> List[CfaOffset]: |
| 112 | """ Parse textual CFA entries into integer stack offsets """ |
| 113 | |
| 114 | result = [] |
| 115 | for addr, cfa in cfas: |
| 116 | if cfa == "WSP" or cfa == "SP": |
| 117 | result.append(CfaOffset(addr, 0)) |
| 118 | elif cfa.startswith("WSP+") or cfa.startswith("SP+"): |
| 119 | result.append(CfaOffset(addr, int(cfa.split("+")[1]))) |
| 120 | else: |
| 121 | result.append(CfaOffset(addr, None)) |
| 122 | return result |
| 123 | |
| 124 | def get_infered_cfa_offsets(insts: List[Inst]) -> List[CfaOffset]: |
| 125 | """ Heuristic to convert disassembly into stack offsets """ |
| 126 | |
| 127 | # Regular expressions which find instructions that adjust stack pointer. |
| 128 | rexprs = [] |
| 129 | def add(rexpr, adjust_offset): |
| 130 | rexprs.append((re.compile(rexpr), adjust_offset)) |
| 131 | add(r"sub sp,(?: sp,)? #(\d+)", lambda m: int(m[1])) |
| 132 | add(r"add sp,(?: sp,)? #(\d+)", lambda m: -int(m[1])) |
| 133 | add(r"str \w+, \[sp, #-(\d+)\]!", lambda m: int(m[1])) |
| 134 | add(r"ldr \w+, \[sp\], #(\d+)", lambda m: -int(m[1])) |
| 135 | add(r"stp \w+, \w+, \[sp, #-(\d+)\]!", lambda m: int(m[1])) |
| 136 | add(r"ldp \w+, \w+, \[sp\], #(\d+)", lambda m: -int(m[1])) |
| 137 | add(r"vpush \{([d0-9, ]*)\}", lambda m: 8 * len(m[1].split(","))) |
| 138 | add(r"vpop \{([d0-9, ]*)\}", lambda m: -8 * len(m[1].split(","))) |
| 139 | add(r"v?push(?:\.w)? \{([\w+, ]*)\}", lambda m: 4 * len(m[1].split(","))) |
| 140 | add(r"v?pop(?:\.w)? \{([\w+, ]*)\}", lambda m: -4 * len(m[1].split(","))) |
| 141 | |
| 142 | # Regular expression which identifies branches. |
| 143 | jmp_re = re.compile(r"cb\w* \w+, 0x(\w+)|(?:b|bl|b\w\w) 0x(\w+)") |
| 144 | |
| 145 | offset, future_offset = 0, {} |
| 146 | result = [CfaOffset(insts[0].addr, offset)] |
| 147 | for addr, inst, symbol in insts: |
| 148 | # Previous code branched here, so us that offset instead. |
| 149 | # This likely identifies slow-path which is after return. |
| 150 | if addr in future_offset: |
| 151 | offset = future_offset[addr] |
| 152 | |
| 153 | # Add entry to output (only if the offset changed). |
| 154 | if result[-1].offset != offset: |
| 155 | result.append(CfaOffset(addr, offset)) |
| 156 | |
| 157 | # Adjust offset if the instruction modifies stack pointer. |
| 158 | for rexpr, adjust_offset in rexprs: |
| 159 | m = rexpr.match(inst) |
| 160 | if m: |
| 161 | offset += adjust_offset(m) |
| 162 | break # First matched pattern wins. |
| 163 | |
| 164 | # Record branches. We only support forward edges for now. |
| 165 | m = jmp_re.match(inst) |
| 166 | if m: |
| 167 | future_offset[int(m[m.lastindex], 16)] = offset |
| 168 | return result |
| 169 | |
| 170 | def check_fde(fde: Fde, insts: List[Inst], srcs, verbose: bool = False) -> Tuple[str, Set[int]]: |
| 171 | """ Compare DWARF offsets to assembly-inferred offsets. Report differences. """ |
| 172 | |
| 173 | error, seen_addrs = None, set() |
| 174 | cfas = get_cfa(fde) |
| 175 | i, dwarf_cfa = 0, get_dwarf_cfa_offsets(cfas) |
| 176 | j, infered_cfa = 0, get_infered_cfa_offsets(insts) |
| 177 | for inst in insts: |
| 178 | seen_addrs.add(inst.addr) |
| 179 | while i+1 < len(dwarf_cfa) and dwarf_cfa[i+1].addr <= inst.addr: |
| 180 | i += 1 |
| 181 | while j+1 < len(infered_cfa) and infered_cfa[j+1].addr <= inst.addr: |
| 182 | j += 1 |
| 183 | if verbose: |
| 184 | print("{:08x}: dwarf={:4} infered={:4} {:40} // {}".format( |
| 185 | inst.addr, str(dwarf_cfa[i].offset), str(infered_cfa[j].offset), |
| 186 | inst.inst.strip(), srcs.get(inst.addr, ""))) |
| 187 | if dwarf_cfa[i].offset is not None and dwarf_cfa[i].offset != infered_cfa[j].offset: |
| 188 | if inst.addr in srcs: # Only report if it maps to source code (not padding or literals). |
| 189 | error = error or "{:08x} {}".format(inst.addr, srcs.get(inst.addr, "")) |
| 190 | return error, seen_addrs |
| 191 | |
| 192 | def check_lib(lib: pathlib.Path): |
| 193 | assert lib.exists() |
| 194 | IGNORE = [ |
| 195 | "art_quick_throw_null_pointer_exception_from_signal", # Starts with non-zero offset. |
| 196 | "art_quick_generic_jni_trampoline", # Saves/restores SP in other register. |
| 197 | "nterp_op_", # Uses calculated CFA due to dynamic stack size. |
| 198 | "$d.", # Data (literals) interleaved within code. |
| 199 | ] |
| 200 | fdes = get_fde(lib) |
| 201 | asms = collections.deque(get_asm(lib)) |
| 202 | srcs = {src.addr: src.file + ":" + src.line for src in get_source(lib)} |
| 203 | seen = set() # Used to verify the we have covered all assembly source lines. |
| 204 | |
| 205 | for fde in fdes: |
| 206 | if fde.addr not in srcs: |
| 207 | continue # Ignore if it is not hand-written assembly. |
| 208 | |
| 209 | # Assembly instructions (one FDE can cover several assembly chunks). |
| 210 | all_insts, name = [], None |
| 211 | while asms and asms[0].addr < fde.end: |
| 212 | asm = asms.popleft() |
| 213 | if asm.addr < fde.addr: |
| 214 | continue |
| 215 | insts = get_instructions(asm) |
| 216 | if any(asm.name.startswith(i) for i in IGNORE): |
| 217 | seen.update([inst.addr for inst in insts]) |
| 218 | continue |
| 219 | all_insts.extend(insts) |
| 220 | name = name or asm.name |
| 221 | if not all_insts: |
| 222 | continue # No assembly |
| 223 | |
| 224 | # Compare DWARF data to assembly instructions |
| 225 | error, seen_addrs = check_fde(fde, all_insts, srcs) |
| 226 | if error: |
| 227 | print("ERROR at " + name + " " + error) |
| 228 | check_fde(fde, all_insts, srcs, True) |
| 229 | print("") |
| 230 | seen.update(seen_addrs) |
| 231 | for addr in sorted(set(srcs.keys()) - seen): |
| 232 | print("Missing CFI for {:08x}: {}".format(addr, srcs[addr])) |
| 233 | |
| 234 | |
| 235 | def main(argv): |
| 236 | """ Check libraries provided on the command line, or use the default build output """ |
| 237 | |
| 238 | libs = argv[1:] |
| 239 | if not libs: |
| 240 | out = os.environ["OUT"] |
| 241 | libs.append(out + "/symbols/apex/com.android.art/lib/libart.so") |
| 242 | libs.append(out + "/symbols/apex/com.android.art/lib64/libart.so") |
| 243 | for lib in libs: |
| 244 | check_lib(pathlib.Path(lib)) |
| 245 | |
| 246 | if __name__ == "__main__": |
| 247 | main(os.sys.argv) |