zotero/app/scripts/optimizejars.py

377 lines
14 KiB
Python
Raw Normal View History

# ***** BEGIN LICENSE BLOCK *****
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is mozilla.org code
#
# The Initial Developer of the Original Code is
# Mozilla Foundation.
# Portions created by the Initial Developer are Copyright (C) 2010
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Taras Glek <tglek@mozilla.com>
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ***** END LICENSE BLOCK *****
import sys, os, subprocess, struct, re
local_file_header = [
("signature", "uint32"),
("min_version", "uint16"),
("general_flag", "uint16"),
("compression", "uint16"),
("lastmod_time", "uint16"),
("lastmod_date", "uint16"),
("crc32", "uint32"),
("compressed_size", "uint32"),
("uncompressed_size", "uint32"),
("filename_size", "uint16"),
("extra_field_size", "uint16"),
("filename", "filename_size"),
("extra_field", "extra_field_size"),
("data", "compressed_size")
]
cdir_entry = [
("signature", "uint32"),
("creator_version", "uint16"),
("min_version", "uint16"),
("general_flag", "uint16"),
("compression", "uint16"),
("lastmod_time", "uint16"),
("lastmod_date", "uint16"),
("crc32", "uint32"),
("compressed_size", "uint32"),
("uncompressed_size", "uint32"),
("filename_size", "uint16"),
("extrafield_size", "uint16"),
("filecomment_size", "uint16"),
("disknum", "uint16"),
("internal_attr", "uint16"),
("external_attr", "uint32"),
("offset", "uint32"),
("filename", "filename_size"),
("extrafield", "extrafield_size"),
("filecomment", "filecomment_size"),
]
cdir_end = [
("signature", "uint32"),
("disk_num", "uint16"),
("cdir_disk", "uint16"),
("disk_entries", "uint16"),
("cdir_entries", "uint16"),
("cdir_size", "uint32"),
("cdir_offset", "uint32"),
("comment_size", "uint16"),
]
type_mapping = { "uint32":"I", "uint16":"H"}
def format_struct (format):
string_fields = {}
fmt = "<"
for (name,value) in iter(format):
try:
fmt += type_mapping[value][0]
except KeyError:
string_fields[name] = value
return (fmt, string_fields)
def size_of(format):
return struct.calcsize(format_struct(format)[0])
class MyStruct:
def __init__(self, format, string_fields):
self.__dict__["struct_members"] = {}
self.__dict__["format"] = format
self.__dict__["string_fields"] = string_fields
def addMember(self, name, value):
self.__dict__["struct_members"][name] = value
def __getattr__(self, item):
try:
return self.__dict__["struct_members"][item]
except:
pass
print("no %s" %item)
print(self.__dict__["struct_members"])
raise AttributeError
def __setattr__(self, item, value):
if item in self.__dict__["struct_members"]:
self.__dict__["struct_members"][item] = value
else:
raise AttributeError
def pack(self):
extra_data = b""
values = []
string_fields = self.__dict__["string_fields"]
struct_members = self.__dict__["struct_members"]
format = self.__dict__["format"]
for (name,_) in format:
if name in string_fields:
if not isinstance(struct_members[name], bytes):
struct_members[name] = struct_members[name].encode('utf-8')
extra_data = extra_data + struct_members[name]
else:
values.append(struct_members[name]);
return struct.pack(format_struct(format)[0], *values) + extra_data
ENDSIG = 0x06054b50
def assert_true(cond, msg):
if not cond:
raise Exception(msg)
exit(1)
class BinaryBlob:
def __init__(self, f):
self.data = open(f, "rb").read()
self.offset = 0
self.length = len(self.data)
def readAt(self, pos, length):
self.offset = pos + length
return self.data[pos:self.offset]
def read_struct (self, format, offset = None):
if offset == None:
offset = self.offset
(fstr, string_fields) = format_struct(format)
size = struct.calcsize(fstr)
data = self.readAt(offset, size)
ret = struct.unpack(fstr, data)
retstruct = MyStruct(format, string_fields)
i = 0
for (name,_) in iter(format):
member_desc = None
if not name in string_fields:
member_data = ret[i]
i = i + 1
else:
# zip has data fields which are described by other struct fields, this does
# additional reads to fill em in
member_desc = string_fields[name]
member_data = self.readAt(self.offset, retstruct.__getattr__(member_desc))
retstruct.addMember(name, member_data)
# sanity check serialization code
data = self.readAt(offset, self.offset - offset)
out_data = retstruct.pack()
assert_true(out_data == data, "Serialization fail %d !=%d"% (len(out_data), len(data)))
return retstruct
def optimizejar(jar, outjar, inlog = None):
if inlog is not None:
inlog = open(inlog).read().rstrip()
# in the case of an empty log still move the index forward
if len(inlog) == 0:
inlog = []
else:
inlog = inlog.split("\n")
outlog = []
jarblob = BinaryBlob(jar)
dirend = jarblob.read_struct(cdir_end, jarblob.length - size_of(cdir_end))
assert_true(dirend.signature == ENDSIG, "no signature in the end");
cdir_offset = dirend.cdir_offset
readahead = 0
if inlog is None and cdir_offset == 4:
readahead = struct.unpack("<I", jarblob.readAt(0, 4))[0]
print("%s: startup data ends at byte %d" % (outjar, readahead));
total_stripped = 0;
jarblob.offset = cdir_offset
central_directory = []
for i in range(0, dirend.cdir_entries):
entry = jarblob.read_struct(cdir_entry)
if entry.filename[-1:] == "/":
total_stripped += len(entry.pack())
else:
total_stripped += entry.extrafield_size
central_directory.append(entry)
reordered_count = 0
if inlog is not None:
dup_guard = set()
for ordered_name in inlog:
if ordered_name in dup_guard:
continue
else:
dup_guard.add(ordered_name)
found = False
for i in range(reordered_count, len(central_directory)):
if central_directory[i].filename == ordered_name:
# swap the cdir entries
tmp = central_directory[i]
central_directory[i] = central_directory[reordered_count]
central_directory[reordered_count] = tmp
reordered_count = reordered_count + 1
found = True
break
if not found:
print( "Can't find '%s' in %s" % (ordered_name, jar))
outfd = open(outjar, "wb")
out_offset = 0
if inlog is not None:
# have to put central directory at offset 4 cos 0 confuses some tools.
# This also lets us specify how many entries should be preread
dirend.cdir_offset = 4
# make room for central dir + end of dir + 4 extra bytes at front
out_offset = dirend.cdir_offset + dirend.cdir_size + size_of(cdir_end) - total_stripped
outfd.seek(out_offset)
cdir_data = b""
written_count = 0
crc_mapping = {}
dups_found = 0
dupe_bytes = 0
# store number of bytes suggested for readahead
for entry in central_directory:
# read in the header twice..first for comparison, second time for convenience when writing out
jarfile = jarblob.read_struct(local_file_header, entry.offset)
assert_true(jarfile.filename == entry.filename, "Directory/Localheader mismatch")
# drop directory entries
if entry.filename[-1:] == "/":
total_stripped += len(jarfile.pack())
dirend.cdir_entries -= 1
continue
# drop extra field data
else:
total_stripped += jarfile.extra_field_size;
entry.extrafield = jarfile.extra_field = ""
entry.extrafield_size = jarfile.extra_field_size = 0
# January 1st, 2010
entry.lastmod_date = jarfile.lastmod_date = ((2010 - 1980) << 9) | (1 << 5) | 1
entry.lastmod_time = jarfile.lastmod_time = 0
data = jarfile.pack()
outfd.write(data)
old_entry_offset = entry.offset
entry.offset = out_offset
out_offset = out_offset + len(data)
entry_data = entry.pack()
cdir_data += entry_data
expected_len = entry.filename_size + entry.extrafield_size + entry.filecomment_size
assert_true(len(entry_data) != expected_len,
"%s entry size - expected:%d got:%d" % (entry.filename, len(entry_data), expected_len))
written_count += 1
if entry.crc32 in crc_mapping:
dups_found += 1
dupe_bytes += entry.compressed_size + len(data) + len(entry_data)
print("%s\n\tis a duplicate of\n%s\n---"%(entry.filename, crc_mapping[entry.crc32]))
else:
crc_mapping[entry.crc32] = entry.filename;
if inlog is not None:
if written_count == reordered_count:
readahead = out_offset
print("%s: startup data ends at byte %d"%( outjar, readahead));
elif written_count < reordered_count:
pass
#print("%s @ %d" % (entry.filename, out_offset))
elif readahead >= old_entry_offset + len(data):
outlog.append(entry.filename)
reordered_count += 1
if inlog is None:
dirend.cdir_offset = out_offset
if dups_found > 0:
print("WARNING: Found %d duplicate files taking %d bytes"%(dups_found, dupe_bytes))
dirend.cdir_size = len(cdir_data)
dirend.disk_entries = dirend.cdir_entries
dirend_data = dirend.pack()
assert_true(size_of(cdir_end) == len(dirend_data), "Failed to serialize directory end correctly. Serialized size;%d, expected:%d"%(len(dirend_data), size_of(cdir_end)));
outfd.seek(dirend.cdir_offset)
outfd.write(cdir_data)
outfd.write(dirend_data)
# for ordered jars the central directory is written in the begining of the file, so a second central-directory
# entry has to be written in the end of the file
if inlog is not None:
outfd.seek(0)
outfd.write(struct.pack("<I", readahead));
outfd.seek(out_offset)
outfd.write(dirend_data)
print("Stripped %d bytes" % total_stripped)
print("%s %d/%d in %s" % (("Ordered" if inlog is not None else "Deoptimized"),
reordered_count, len(central_directory), outjar))
outfd.close()
return outlog
if len(sys.argv) != 5:
print("Usage: --optimize|--deoptimize %s JAR_LOG_DIR IN_JAR_DIR OUT_JAR_DIR" % sys.argv[0])
exit(1)
jar_regex = re.compile("\\.jar?$")
def optimize(JAR_LOG_DIR, IN_JAR_DIR, OUT_JAR_DIR):
ls = os.listdir(IN_JAR_DIR)
for jarfile in ls:
if not re.search(jar_regex, jarfile):
continue
injarfile = os.path.join(IN_JAR_DIR, jarfile)
outjarfile = os.path.join(OUT_JAR_DIR, jarfile)
logfile = os.path.join(JAR_LOG_DIR, jarfile + ".log")
if not os.path.isfile(logfile):
logfile = None
optimizejar(injarfile, outjarfile, logfile)
def deoptimize(JAR_LOG_DIR, IN_JAR_DIR, OUT_JAR_DIR):
if not os.path.exists(JAR_LOG_DIR):
os.makedirs(JAR_LOG_DIR)
ls = os.listdir(IN_JAR_DIR)
for jarfile in ls:
if not re.search(jar_regex, jarfile):
continue
injarfile = os.path.join(IN_JAR_DIR, jarfile)
outjarfile = os.path.join(OUT_JAR_DIR, jarfile)
logfile = os.path.join(JAR_LOG_DIR, jarfile + ".log")
log = str(optimizejar(injarfile, outjarfile, None))
open(logfile, "wb").write("\n".join(log).encode('utf-8'))
def main():
MODE = sys.argv[1]
JAR_LOG_DIR = sys.argv[2]
IN_JAR_DIR = sys.argv[3]
OUT_JAR_DIR = sys.argv[4]
if MODE == "--optimize":
optimize(JAR_LOG_DIR, IN_JAR_DIR, OUT_JAR_DIR)
elif MODE == "--deoptimize":
deoptimize(JAR_LOG_DIR, IN_JAR_DIR, OUT_JAR_DIR)
else:
print("Unknown mode %s" % MODE)
exit(1)
if __name__ == '__main__':
main()