blob: 5e3010facc639c95bfcf4d055bad3f64218b3507 [file] [log] [blame]
Christopher Ferris6a546332021-09-08 13:59:04 -07001#!/usr/bin/env python3
Daniel Berlinf5a97d72012-03-29 10:33:19 -04002#
3# Copyright (C) 2012 The Android Open Source Project
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
"""
Usage: generate-notice-files --text-output [plain text output file] \
               --html-output [html output file] \
               --xml-output [xml output file] \
               -t [file title] -s [directory of notices]

Generate the Android notice files. The text output is always written; the
html and xml outputs are written only when their options are given.
-s may be repeated to scan several notice directories, and -i / -e
(repeatable) name sub directories to include or exclude.

-h to display this usage message and exit.
"""
26from collections import defaultdict
Jaekyun Seok3b7560b2017-04-19 15:26:47 +090027import argparse
Daniel Berlinf5a97d72012-03-29 10:33:19 -040028import hashlib
29import itertools
30import os
31import os.path
32import re
Christopher Ferris6a546332021-09-08 13:59:04 -070033import struct
Daniel Berlinf5a97d72012-03-29 10:33:19 -040034import sys
35
# Number of bytes read per chunk when hashing a file.
MD5_BLOCKSIZE = 1024 * 1024

# Maps each markup-significant byte to the entity that replaces it. Keys and
# values are bytes objects because notice files are processed as raw bytes.
HTML_ESCAPE_TABLE = {
    b"&": b"&amp;",
    b'"': b"&quot;",
    b"'": b"&apos;",
    b">": b"&gt;",
    b"<": b"&lt;",
    }
44
def md5sum(filename):
    """Calculate an MD5 of the file given by FILENAME,
    and return hex digest as a string.
    Output should be compatible with md5sum command"""

    digest = hashlib.md5()
    # The context manager closes the file even if a read fails part-way.
    with open(filename, "rb") as f:
        # Read in fixed-size chunks until read() returns the empty bytes
        # object, so large files are never held in memory at once.
        for block in iter(lambda: f.read(MD5_BLOCKSIZE), b""):
            digest.update(block)
    return digest.hexdigest()
Daniel Berlinf5a97d72012-03-29 10:33:19 -040059
60
def html_escape(text):
    """Return TEXT (a bytes object) with special bytes replaced by entities."""
    # Iterating a bytes object directly yields ints, which would never match
    # the one-byte keys of the escape table. Unpacking with an "<N>c" format
    # splits the buffer into one-byte bytes objects instead; the original
    # author found this about twice as fast as slicing out each byte by
    # index.
    single_bytes = struct.unpack("%dc" % len(text), text)
    escaped = [HTML_ESCAPE_TABLE.get(byte, byte) for byte in single_bytes]
    return b"".join(escaped)
Daniel Berlinf5a97d72012-03-29 10:33:19 -040069
# Inline stylesheet emitted inside the <head> of the generated HTML notice
# file. It is kept as bytes because the output file is written in binary mode.
HTML_OUTPUT_CSS=b"""
<style type="text/css">
body { padding: 0; font-family: sans-serif; }
.same-license { background-color: #eeeeee; border-top: 20px solid white; padding: 10px; }
.label { font-weight: bold; }
.file-list { margin-left: 1em; color: blue; }
</style>

"""
79
def combine_notice_files_html(file_hash, input_dirs, output_filename):
    """Combine notice files in FILE_HASH and output a HTML version to OUTPUT_FILENAME.

    FILE_HASH is a list of groups, each group being a list of notice file
    paths that share one license text; only the first file of a group is
    read. INPUT_DIRS are the directory prefixes that are stripped, together
    with the ".txt" suffix, from the file names shown in the output.
    """

    # The directory names are literal paths, not patterns, so they are
    # escaped before being placed in the regex; the dot of ".txt" is escaped
    # for the same reason.
    src_dir_strip_re = re.compile(
        "(?:" + "|".join(re.escape(d) for d in input_dirs) + r")(/.*)\.txt")

    # Set up a filename to row id table (anchors inside tables don't work in
    # most browsers, but href's to table row ids do). All files of a group
    # share the id of that group's table row.
    id_table = {}
    for row_id, group in enumerate(file_hash):
        for filename in group:
            id_table[filename] = row_id

    # The context manager closes the output even if a notice file cannot be
    # read half-way through.
    with open(output_filename, "wb") as output_file:
        # Output the header pieces
        output_file.write(b"<html><head>\n")
        output_file.write(HTML_OUTPUT_CSS)
        output_file.write(b'</head><body topmargin="0" leftmargin="0" rightmargin="0" bottommargin="0">\n')

        # Output our table of contents
        output_file.write(b'<div class="toc">\n')
        output_file.write(b"<ul>\n")

        # Flatten the list of lists into a single sorted list of filenames
        # and print out a nice table of contents
        for filename in sorted(itertools.chain.from_iterable(file_hash)):
            stripped_filename = src_dir_strip_re.sub(r"\1", filename)
            output_file.write(('<li><a href="#id%d">%s</a></li>\n' % (id_table[filename], stripped_filename)).encode())

        output_file.write(b"</ul>\n")
        output_file.write(b"</div><!-- table of contents -->\n")

        # Output the individual notice file lists
        output_file.write(b'<table cellpadding="0" cellspacing="0" border="0">\n')
        for group in file_hash:
            output_file.write(b'<tr id="id%d"><td class="same-license">\n' % id_table[group[0]])
            output_file.write(b'<div class="label">Notices for file(s):</div>\n')
            output_file.write(b'<div class="file-list">\n')
            for filename in group:
                output_file.write(("%s <br/>\n" % src_dir_strip_re.sub(r"\1", filename)).encode())
            output_file.write(b"</div><!-- file-list -->\n")
            output_file.write(b"\n")
            output_file.write(b'<pre class="license-text">\n')
            # One copy of the license text stands for the whole group.
            with open(group[0], "rb") as notice_file:
                output_file.write(html_escape(notice_file.read()))
            output_file.write(b"\n</pre><!-- license-text -->\n")
            output_file.write(b"</td></tr><!-- same-license -->\n\n\n\n")

        # Finish off the file output
        output_file.write(b"</table>\n")
        output_file.write(b"</body></html>\n")
135
def combine_notice_files_text(file_hash, input_dirs, output_filename, file_title):
    """Combine notice files in FILE_HASH and output a text version to OUTPUT_FILENAME.

    FILE_HASH is a list of groups, each group being a list of notice file
    paths that share one license text; only the first file of a group is
    read. INPUT_DIRS are the directory prefixes that are stripped, together
    with the ".txt" suffix, from the listed file names. FILE_TITLE is written
    as the first line of the output.
    """

    # The directory names are literal paths, not patterns, so they are
    # escaped before being placed in the regex; the dot of ".txt" is escaped
    # for the same reason.
    src_dir_strip_re = re.compile(
        "(?:" + "|".join(re.escape(d) for d in input_dirs) + r")(/.*)\.txt")

    # The context manager closes the output even if a notice file cannot be
    # read half-way through.
    with open(output_filename, "wb") as output_file:
        output_file.write(file_title.encode())
        output_file.write(b"\n")
        for group in file_hash:
            output_file.write(b"============================================================\n")
            output_file.write(b"Notices for file(s):\n")
            for filename in group:
                output_file.write(src_dir_strip_re.sub(r"\1", filename).encode())
                output_file.write(b"\n")
            output_file.write(b"------------------------------------------------------------\n")
            # One copy of the license text stands for the whole group.
            with open(group[0], "rb") as notice_file:
                output_file.write(notice_file.read())
            output_file.write(b"\n")
154
def combine_notice_files_xml(files_with_same_hash, input_dirs, output_filename):
    """Combine notice files in FILES_WITH_SAME_HASH and output a XML version to OUTPUT_FILENAME.

    FILES_WITH_SAME_HASH maps a content key to the list of notice file paths
    stored under that key. Each file gets a <file-name> element carrying its
    key as contentId, and each key gets a single <file-content> element.
    INPUT_DIRS are the directory prefixes that are stripped, together with
    the ".txt" suffix, from the file names shown in the output.
    """

    # The directory names are literal paths, not patterns, so they are
    # escaped before being placed in the regex; the dot of ".txt" is escaped
    # for the same reason.
    src_dir_strip_re = re.compile(
        "(?:" + "|".join(re.escape(d) for d in input_dirs) + r")(/.*)\.txt")

    # Set up a filename to content id table
    id_table = {}
    for file_key, files in files_with_same_hash.items():
        for filename in files:
            id_table[filename] = file_key

    # Flatten the groups into a single sorted list of filenames
    sorted_filenames = sorted(id_table)

    # The context manager closes the output even if a notice file cannot be
    # read half-way through.
    with open(output_filename, "wb") as output_file:
        # Output the header pieces
        output_file.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
        output_file.write(b"<licenses>\n")

        # Print out the list of file names with their content ids
        for filename in sorted_filenames:
            stripped_filename = src_dir_strip_re.sub(r"\1", filename)
            output_file.write(('<file-name contentId="%s">%s</file-name>\n' % (id_table[filename], stripped_filename)).encode())
        output_file.write(b"\n\n")

        # Output each distinct content once, taken from the first file (in
        # sorted order) that carries its key. A set keeps the "already
        # written" check constant-time.
        processed_file_keys = set()
        for filename in sorted_filenames:
            file_key = id_table[filename]
            if file_key in processed_file_keys:
                continue
            processed_file_keys.add(file_key)

            output_file.write(('<file-content contentId="%s"><![CDATA[' % file_key).encode())
            with open(filename, "rb") as notice_file:
                output_file.write(html_escape(notice_file.read()))
            output_file.write(b"]]></file-content>\n\n")

        # Finish off the file output
        output_file.write(b"</licenses>\n")
198
def get_args():
    """Parse the command line options and return the argparse namespace."""
    # Each entry is (option flags, keyword arguments for add_argument), listed
    # in the order the options are registered.
    option_specs = (
        (('--text-output',),
         dict(required=True, help='The text output file path.')),
        (('--html-output',),
         dict(help='The html output file path.')),
        (('--xml-output',),
         dict(help='The xml output file path.')),
        (('-t', '--title'),
         dict(required=True, help='The file title.')),
        (('-s', '--source-dir'),
         dict(required=True, action='append',
              help='The directory containing notices.')),
        (('-i', '--included-subdirs'),
         dict(action='append',
              help='The sub directories which should be included.')),
        (('-e', '--excluded-subdirs'),
         dict(action='append',
              help='The sub directories which should be excluded.')),
    )

    parser = argparse.ArgumentParser()
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)
    return parser.parse_args()
223
def _is_within_subdirs(root, input_dir, subdirs):
    """Return True if ROOT is INPUT_DIR/<subdir>, or lies below it, for any of SUBDIRS."""
    for subdir in subdirs:
        prefix = input_dir + '/' + subdir
        if root == prefix or root.startswith(prefix + '/'):
            return True
    return False


def main(argv):
    """Collect the notice files and write the requested combined outputs.

    ARGV is accepted for the script entry point but is not consulted; the
    options are parsed by get_args(). Every ".txt" file found under the
    source directories is grouped by the MD5 of its contents. The text output
    is always written; the html and xml outputs only when their paths were
    given.
    """
    args = get_args()

    txt_output_file = args.text_output
    html_output_file = args.html_output
    xml_output_file = args.xml_output
    file_title = args.title
    # The append options are None when the flag was never given.
    included_subdirs = args.included_subdirs or []
    excluded_subdirs = args.excluded_subdirs or []

    input_dirs = [os.path.normpath(source_dir) for source_dir in args.source_dir]
    # Find all the notice files and md5 them
    files_with_same_hash = defaultdict(list)
    for input_dir in input_dirs:
        for root, _dirs, files in os.walk(input_dir):
            # Whether a directory takes part depends only on its path, so it
            # is decided once per directory rather than once per file. An
            # include list takes precedence: when one is given, the exclude
            # list is not consulted.
            if included_subdirs:
                matched = _is_within_subdirs(root, input_dir, included_subdirs)
            elif excluded_subdirs:
                matched = not _is_within_subdirs(root, input_dir, excluded_subdirs)
            else:
                matched = True
            if not matched:
                continue
            for name in files:
                if name.endswith(".txt"):
                    filename = os.path.join(root, name)
                    files_with_same_hash[md5sum(filename)].append(filename)

    # Sort both the groups (by hash) and the files within each group so the
    # output does not depend on the directory traversal order.
    filesets = [sorted(files_with_same_hash[md5]) for md5 in sorted(files_with_same_hash)]
    combine_notice_files_text(filesets, input_dirs, txt_output_file, file_title)

    if html_output_file is not None:
        combine_notice_files_html(filesets, input_dirs, html_output_file)

    if xml_output_file is not None:
        combine_notice_files_xml(files_with_same_hash, input_dirs, xml_output_file)
Jaekyun Seok3b7560b2017-04-19 15:26:47 +0900271
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main(sys.argv)