Jihoon Kang | 3d38b6d | 2022-10-28 22:21:42 +0000 | [diff] [blame] | 1 | #!/usr/bin/env python |
| 2 | # |
| 3 | # Copyright (C) 2022 The Android Open Source Project |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | # you may not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
| 16 | |
| 17 | from sys import exit |
| 18 | from typing import List |
| 19 | from glob import glob |
| 20 | from pathlib import Path |
| 21 | from collections import defaultdict |
| 22 | from difflib import Differ |
| 23 | from re import split |
| 24 | from tqdm import tqdm |
| 25 | import argparse |
| 26 | |
| 27 | |
# Length of the prefix code that difflib.Differ puts in front of every line.
DIFFER_CODE_LEN = 2


class DifferCodes:
    """Two-character line prefixes emitted by difflib.Differ.compare().

    Every code must be exactly DIFFER_CODE_LEN characters long, because callers
    compare these constants against ``line[:DIFFER_CODE_LEN]``.
    """
    # Line common to both sequences (two spaces; a single space never matches
    # a two-character slice).
    COMMON = '  '
    # Line unique to the first sequence.
    UNIQUE_FIRST = '- '
    # Line unique to the second sequence.
    UNIQUE_SECOND = '+ '
    # Hint line marking intra-line differences; present in neither sequence.
    DIFF_IDENT = '? '
| 35 | |
| 36 | class FilesDiffAnalyzer: |
| 37 | def __init__(self, args) -> None: |
| 38 | self.out_dir = args.out_dir |
| 39 | self.show_diff = args.show_diff |
| 40 | self.skip_words = args.skip_words |
| 41 | self.first_dir = args.first_dir |
| 42 | self.second_dir = args.second_dir |
| 43 | self.include_common = args.include_common |
| 44 | |
| 45 | self.first_dir_files = self.get_files(self.first_dir) |
| 46 | self.second_dir_files = self.get_files(self.second_dir) |
| 47 | self.common_file_map = defaultdict(set) |
| 48 | |
| 49 | self.map_common_files(self.first_dir_files, self.first_dir) |
| 50 | self.map_common_files(self.second_dir_files, self.second_dir) |
| 51 | |
| 52 | def get_files(self, dir: str) -> List[str]: |
| 53 | """Get all files directory in the input directory including the files in the subdirectories |
| 54 | |
| 55 | Recursively finds all files in the input directory. |
| 56 | Returns a list of file directory strings, which do not include directories but only files. |
| 57 | List is sorted in alphabetical order of the file directories. |
| 58 | |
| 59 | Args: |
| 60 | dir: Directory to get the files. String. |
| 61 | |
| 62 | Returns: |
| 63 | A list of file directory strings within the input directory. |
| 64 | Sorted in Alphabetical order. |
| 65 | |
| 66 | Raises: |
| 67 | FileNotFoundError: An error occurred accessing the non-existing directory |
| 68 | """ |
| 69 | |
| 70 | if not dir_exists(dir): |
| 71 | raise FileNotFoundError("Directory does not exist") |
| 72 | |
| 73 | if dir[:-2] != "**": |
| 74 | if dir[:-1] != "/": |
| 75 | dir += "/" |
| 76 | dir += "**" |
| 77 | |
| 78 | return [file for file in sorted(glob(dir, recursive=True)) if Path(file).is_file()] |
| 79 | |
| 80 | def map_common_files(self, files: List[str], dir: str) -> None: |
| 81 | for file in files: |
| 82 | file_name = file.split(dir, 1)[-1] |
| 83 | self.common_file_map[file_name].add(dir) |
| 84 | return |
| 85 | |
| 86 | def compare_file_contents(self, first_file: str, second_file: str) -> List[str]: |
| 87 | """Compare the contents of the files and return different lines |
| 88 | |
| 89 | Given two file directory strings, compare the contents of the two files |
| 90 | and return the list of file contents string prepended with unique identifier codes. |
| 91 | The identifier codes include: |
| 92 | - ' '(two empty space characters): Line common to two files |
| 93 | - '- '(minus followed by a space) : Line unique to first file |
| 94 | - '+ '(plus followed by a space) : Line unique to second file |
| 95 | |
| 96 | Args: |
| 97 | first_file: First file directory string to compare the content |
| 98 | second_file: Second file directory string to compare the content |
| 99 | |
| 100 | Returns: |
| 101 | A list of the file content strings. For example: |
| 102 | |
| 103 | [ |
| 104 | " Foo", |
| 105 | "- Bar", |
| 106 | "+ Baz" |
| 107 | ] |
| 108 | """ |
| 109 | |
| 110 | d = Differ() |
| 111 | first_file_contents = sort_methods(get_file_contents(first_file)) |
| 112 | second_file_contents = sort_methods(get_file_contents(second_file)) |
| 113 | diff = list(d.compare(first_file_contents, second_file_contents)) |
| 114 | ret = [f"diff {first_file} {second_file}"] |
| 115 | |
| 116 | idx = 0 |
| 117 | while idx < len(diff): |
| 118 | line = diff[idx] |
| 119 | line_code = line[:DIFFER_CODE_LEN] |
| 120 | |
| 121 | match line_code: |
| 122 | case DifferCodes.COMMON: |
| 123 | if self.include_common: |
| 124 | ret.append(line) |
| 125 | |
| 126 | case DifferCodes.UNIQUE_FIRST: |
| 127 | # Should compare line |
| 128 | if (idx < len(diff) - 1 and |
| 129 | (next_line_code := diff[idx + 1][:DIFFER_CODE_LEN]) |
| 130 | not in (DifferCodes.UNIQUE_FIRST, DifferCodes.COMMON)): |
| 131 | delta = 1 if next_line_code == DifferCodes.UNIQUE_SECOND else 2 |
| 132 | line_to_compare = diff[idx + delta] |
| 133 | if self.lines_differ(line, line_to_compare): |
| 134 | ret.extend([line, line_to_compare]) |
| 135 | else: |
| 136 | if self.include_common: |
| 137 | ret.append(DifferCodes.COMMON + |
| 138 | line[DIFFER_CODE_LEN:]) |
| 139 | idx += delta |
| 140 | else: |
| 141 | ret.append(line) |
| 142 | |
| 143 | case DifferCodes.UNIQUE_SECOND: |
| 144 | ret.append(line) |
| 145 | |
| 146 | case DifferCodes.DIFF_IDENT: |
| 147 | pass |
| 148 | idx += 1 |
| 149 | return ret |
| 150 | |
| 151 | def lines_differ(self, line1: str, line2: str) -> bool: |
| 152 | """Check if the input lines are different or not |
| 153 | |
| 154 | Compare the two lines word by word and check if the two lines are different or not. |
| 155 | If the different words in the comparing lines are included in skip_words, |
| 156 | the lines are not considered different. |
| 157 | |
| 158 | Args: |
| 159 | line1: first line to compare |
| 160 | line2: second line to compare |
| 161 | |
| 162 | Returns: |
| 163 | Boolean value indicating if the two lines are different or not |
| 164 | |
| 165 | """ |
| 166 | # Split by '.' or ' '(whitespace) |
| 167 | def split_words(line: str) -> List[str]: |
| 168 | return split('\\s|\\.', line[DIFFER_CODE_LEN:]) |
| 169 | |
| 170 | line1_words, line2_words = split_words(line1), split_words(line2) |
| 171 | if len(line1_words) != len(line2_words): |
| 172 | return True |
| 173 | |
| 174 | for word1, word2 in zip(line1_words, line2_words): |
| 175 | if word1 != word2: |
| 176 | # not check if words are equal to skip word, but |
| 177 | # check if words contain skip word as substring |
| 178 | if all(sw not in word1 and sw not in word2 for sw in self.skip_words): |
| 179 | return True |
| 180 | |
| 181 | return False |
| 182 | |
| 183 | def analyze(self) -> None: |
| 184 | """Analyze file contents in both directories and write to output or console. |
| 185 | """ |
| 186 | for file in tqdm(sorted(self.common_file_map.keys())): |
| 187 | val = self.common_file_map[file] |
| 188 | |
| 189 | # When file exists in both directories |
| 190 | lines = list() |
| 191 | if val == set([self.first_dir, self.second_dir]): |
| 192 | lines = self.compare_file_contents( |
| 193 | self.first_dir + file, self.second_dir + file) |
| 194 | else: |
| 195 | existing_dir, not_existing_dir = ( |
| 196 | (self.first_dir, self.second_dir) if self.first_dir in val |
| 197 | else (self.second_dir, self.first_dir)) |
| 198 | |
| 199 | lines = [f"{not_existing_dir}{file} does not exist."] |
| 200 | |
| 201 | if self.show_diff: |
| 202 | lines.append(f"Content of {existing_dir}{file}: \n") |
| 203 | lines.extend(get_file_contents(existing_dir + file)) |
| 204 | |
| 205 | self.write(lines) |
| 206 | |
| 207 | def write(self, lines: List[str]) -> None: |
| 208 | if self.out_dir == "": |
| 209 | pprint(lines) |
| 210 | else: |
| 211 | write_lines(self.out_dir, lines) |
| 212 | |
| 213 | ### |
| 214 | # Helper functions |
| 215 | ### |
| 216 | |
def sort_methods(lines: List[str]) -> List[str]:
    """Sort class methods in the file contents by alphabetical order

    Given lines of Java file contents, return lines with class methods sorted in alphabetical order.
    Also omit empty lines or lines with spaces.

    A class body starts after the first line containing the text "class" and
    ends at the next line whose first character is "}". The lines in between
    are sorted as plain strings. If the input ends before such a closing line
    is seen, the lines collected so far are still sorted and emitted rather
    than dropped.

    For example:
        l = [
            "package android.test;",
            "",
            "public static final int ORANGE = 1;",
            "",
            "public class TestClass {",
            "public TestClass() { throw new RuntimeException("Stub!"); }",
            "public void foo() { throw new RuntimeException("Stub!"); }",
            "public void bar() { throw new RuntimeException("Stub!"); }",
            "}"
        ]
        sort_methods(l) returns
        [
            "package android.test;",
            "public static final int ORANGE = 1;",
            "public class TestClass {",
            "public TestClass() { throw new RuntimeException("Stub!"); }",
            "public void bar() { throw new RuntimeException("Stub!"); }",
            "public void foo() { throw new RuntimeException("Stub!"); }",
            "}"
        ]

    Args:
        lines: List of strings consisting of Java file contents.

    Returns:
        A list of strings with sorted class methods.
    """
    def is_not_blank(text: str) -> bool:
        return bool(text) and not text.isspace()

    ret = []
    in_class = False
    buffer = []
    for line in lines:
        if not in_class:
            if "class" in line:
                in_class = True
                ret.append(line)
            elif is_not_blank(line):
                # Static variables, package info, etc.; blank lines are skipped.
                ret.append(line)
        elif line.startswith("}"):
            # End of class: emit the sorted body, then the closing line.
            in_class = False
            ret.extend(sorted(buffer))
            buffer = []
            ret.append(line)
        elif is_not_blank(line):
            buffer.append(line)

    # Input ended inside a class body: keep what was collected instead of
    # silently losing it. (No-op when every class was closed.)
    ret.extend(sorted(buffer))
    return ret
| 281 | |
def get_file_contents(file_path: str) -> List[str]:
    """Read a text file and return its lines without the trailing newline character.

    Only '\\n' is stripped from the end of each line; other trailing
    whitespace is kept. The file is closed by the ``with`` statement.

    Args:
        file_path: Path of the file to read.

    Returns:
        The lines of the file, in file order.
    """
    with open(file_path) as f:
        return [line.rstrip('\n') for line in f]
| 288 | |
def pprint(l: List[str]) -> None:
    """Print each entry of the list on its own line; print nothing for an empty list."""
    if l:
        print(*l, sep="\n")
| 292 | |
def write_lines(out_dir: str, lines: List[str]) -> None:
    """Append the lines to a file, followed by one empty line.

    Args:
        out_dir: Path of the file to append to (despite the name, this is
            opened as a file, in append mode).
        lines: Lines to write; a newline is added after each of them.
    """
    with open(out_dir, "a") as f:
        f.writelines(line + '\n' for line in lines)
        # Blank separator line between consecutive reports.
        f.write("\n")
| 298 | |
def dir_exists(dir: str) -> bool:
    """Tell whether the given path exists on the file system.

    Only existence is checked, so the result is also True for a path that
    refers to a regular file.
    """
    candidate = Path(dir)
    return candidate.exists()
| 301 | |
# Script entry point: parse the command line, then run the analysis.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('first_dir', action='store', type=str,
                        help="first path to compare file directory and contents")
    parser.add_argument('second_dir', action='store', type=str,
                        help="second path to compare file directory and contents")
    parser.add_argument('--out', dest='out_dir',
                        action='store', default="", type=str,
                        help="optional directory to write log. If not set, will print to console")
    parser.add_argument('--show-diff-file', dest='show_diff',
                        action=argparse.BooleanOptionalAction,
                        help="optional flag. If passed, will print out the content of the file unique to each directories")
    # Adjacent literals are concatenated; a backslash continuation inside the
    # literal would embed the next line's indentation in the help text.
    parser.add_argument('--include-common', dest='include_common',
                        action=argparse.BooleanOptionalAction,
                        help="optional flag. If passed, will print out the contents common to both files as well, "
                             "instead of printing only diff lines.")
    parser.add_argument('--skip-words', nargs='+',
                        dest='skip_words', default=[], help="optional words to skip in comparison")

    args = parser.parse_args()

    # Both positionals are mandatory for argparse, so this guard only fires
    # when one of them was passed as an empty string.
    if not args.first_dir or not args.second_dir:
        parser.print_usage()
        exit(0)

    analyzer = FilesDiffAnalyzer(args)
    analyzer.analyze()