Add IndexedArchive

This type will archive multiple files de-duplicated in a zip with a
central manifest that describes how to recreate the actual layout.
In addition, any other zip file will be expanded and deduplicated
so that we can further reduce the size of those zips/nupkgs.
All of these are placed in a zip with no compression, essentially
using the zip only as a container.  We then LZMA compress that
container to achieve maximum compression.
This commit is contained in:
Eric St. John 2016-06-04 01:13:36 -07:00 committed by Livar Cunha
parent e8a65dd546
commit 40bf17900a
5 changed files with 786 additions and 1 deletions

View file

@ -0,0 +1,108 @@
using SevenZip;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Microsoft.DotNet.Archive
{
internal static class CompressionUtility
{
    // Which side of the coder to measure progress by: bytes consumed (Input)
    // or bytes produced (Output).
    enum MeasureBy
    {
        Input,
        Output
    }

    // Adapts IProgress<ProgressReport> to the SevenZip ICodeProgress callback.
    private class LzmaProgress : ICodeProgress
    {
        private readonly IProgress<ProgressReport> _progress;
        private readonly long _totalSize;
        private readonly string _phase;
        private readonly MeasureBy _measureBy;

        public LzmaProgress(IProgress<ProgressReport> progress, string phase, long totalSize, MeasureBy measureBy)
        {
            _progress = progress;
            _phase = phase;
            _totalSize = totalSize;
            _measureBy = measureBy;
        }

        public void SetProgress(long inSize, long outSize)
        {
            long ticks = _measureBy == MeasureBy.Input ? inSize : outSize;
            _progress.Report(_phase, ticks, _totalSize);
        }
    }

    /// <summary>
    /// LZMA-compresses all of inStream into outStream, writing the standard
    /// .lzma header (5 property bytes + 8-byte little-endian uncompressed size).
    /// </summary>
    public static void Compress(Stream inStream, Stream outStream, IProgress<ProgressReport> progress)
    {
        var encoder = new SevenZip.Compression.LZMA.Encoder();

        // Encoder configuration: 64MB dictionary, bt4 match finder, 96 fast bytes.
        CoderPropID[] propIDs =
        {
            CoderPropID.DictionarySize,
            CoderPropID.PosStateBits,
            CoderPropID.LitContextBits,
            CoderPropID.LitPosBits,
            CoderPropID.Algorithm,
            CoderPropID.NumFastBytes,
            CoderPropID.MatchFinder,
            CoderPropID.EndMarker
        };
        object[] properties =
        {
            (Int32)(1 << 26),
            (Int32)(1),
            (Int32)(8),
            (Int32)(0),
            (Int32)(2),
            (Int32)(96),
            "bt4",
            false
        };
        encoder.SetCoderProperties(propIDs, properties);

        // Header: property bytes followed by the uncompressed length, little-endian.
        encoder.WriteCoderProperties(outStream);
        Int64 inSize = inStream.Length;
        for (int shift = 0; shift < 64; shift += 8)
        {
            outStream.WriteByte((Byte)(inSize >> shift));
        }

        var lzmaProgress = new LzmaProgress(progress, "Compressing", inSize, MeasureBy.Input);
        lzmaProgress.SetProgress(0, 0);
        // -1/-1 means "code until the input is exhausted".
        encoder.Code(inStream, outStream, -1, -1, lzmaProgress);
        lzmaProgress.SetProgress(inSize, outStream.Length);
    }

    /// <summary>
    /// Decompresses an .lzma stream produced by Compress into outStream.
    /// </summary>
    public static void Decompress(Stream inStream, Stream outStream, IProgress<ProgressReport> progress)
    {
        // Read the 5 coder property bytes.
        byte[] properties = new byte[5];
        if (inStream.Read(properties, 0, 5) != 5)
        {
            throw new Exception("input .lzma is too short");
        }

        var decoder = new SevenZip.Compression.LZMA.Decoder();
        decoder.SetDecoderProperties(properties);

        // Read the 8-byte little-endian uncompressed size.
        long outSize = 0;
        for (int shift = 0; shift < 64; shift += 8)
        {
            int v = inStream.ReadByte();
            if (v < 0)
            {
                throw new Exception("Can't Read 1");
            }
            outSize |= ((long)(byte)v) << shift;
        }

        long compressedSize = inStream.Length - inStream.Position;

        var lzmaProgress = new LzmaProgress(progress, "Decompressing", outSize, MeasureBy.Output);
        lzmaProgress.SetProgress(0, 0);
        decoder.Code(inStream, outStream, compressedSize, outSize, lzmaProgress);
        lzmaProgress.SetProgress(inStream.Length, outSize);
    }
}
}

View file

@ -0,0 +1,584 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Security.Cryptography;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
namespace Microsoft.DotNet.Archive
{
/// <summary>
/// Archives files de-duplicated by content hash into a zip container together
/// with an index file ("index.txt") that records how to recreate the original
/// layout.  Zip-like inputs (.zip/.nupkg) are expanded so their entries are
/// de-duplicated individually.  Entries are stored with no compression; the
/// whole container is expected to be LZMA compressed afterward (see Save).
/// </summary>
public class IndexedArchive : IDisposable
{
    // Pairs a destination (extracted) path with the content hash to place there.
    private class DestinationFileInfo
    {
        public DestinationFileInfo(string destinationPath, string hash)
        {
            DestinationPath = destinationPath;
            Hash = hash;
        }
        public string DestinationPath { get; }
        public string Hash { get; }
    }

    // A unique file tracked by the archive.  Stream is null for external files
    // (content resolved from an external directory instead of the container).
    private class ArchiveFileInfo
    {
        public ArchiveFileInfo(Stream stream, string archivePath, string hash)
        {
            Stream = stream;
            ArchivePath = archivePath;
            Hash = hash;
        }
        public Stream Stream { get; set; }
        public string ArchivePath { get; }
        public string Hash { get; }
        public string FileName { get { return Path.GetFileNameWithoutExtension(ArchivePath); } }
        public string Extension { get { return Path.GetExtension(ArchivePath); } }
        public long Size { get { return Stream.Length; } }
    }

    // extensions treated as containers: expanded so entries can be de-duplicated
    static string[] ZipExtensions = new[] { ".zip", ".nupkg" };
    static string IndexFileName = "index.txt";

    // maps file hash to archive path
    // $ prefix indicates that the file is not in the archive and path is a hash
    Dictionary<string, ArchiveFileInfo> archiveFiles = new Dictionary<string, ArchiveFileInfo>();
    // maps file hash to external path
    Dictionary<string, string> externalFiles = new Dictionary<string, string>();
    // lists all extracted files & hashes
    List<DestinationFileInfo> destFiles = new List<DestinationFileInfo>();
    bool disposed = false;

    // SHA256 instances are not thread safe; keep one per thread.
    ThreadLocal<SHA256> sha = new ThreadLocal<SHA256>(() => SHA256.Create());

    public IndexedArchive()
    {
    }

    // Temporary stream backed by a delete-on-close file (avoids holding content in memory).
    private static Stream CreateTemporaryStream()
    {
        // return new MemoryStream();
        string temp = Path.GetTempPath();
        string tempFile = Path.Combine(temp, Guid.NewGuid().ToString());
        return File.Create(tempFile, 4096, FileOptions.DeleteOnClose);
    }

    // Like CreateTemporaryStream but typed as FileStream so callers can read Name.
    private static FileStream CreateTemporaryFileStream()
    {
        string temp = Path.GetTempPath();
        string tempFile = Path.Combine(temp, Guid.NewGuid().ToString());
        //return File.Create(tempFile, 4096, FileOptions.DeleteOnClose);
        return new FileStream(tempFile, FileMode.Create, FileAccess.ReadWrite, FileShare.Read | FileShare.Delete, 4096, FileOptions.DeleteOnClose);
    }

    /// <summary>
    /// Writes the uncompressed zip container (left beside the output as
    /// archivePath + ".zip") and then LZMA-compresses it to archivePath.
    /// </summary>
    public void Save(string archivePath, IProgress<ProgressReport> progress)
    {
        CheckDisposed();
        // NOTE(review): the intermediate ".zip" is written next to the output and is
        // never deleted; confirm whether that is intentional (the commented line
        // suggests a temporary stream was the original plan).
        //using (var archiveStream = CreateTemporaryStream())
        using (var archiveStream = File.Create(archivePath + ".zip"))
        {
            using (var archive = new ZipArchive(archiveStream, ZipArchiveMode.Create, true))
            {
                BuildArchive(archive, progress);
            } // close archive
            archiveStream.Seek(0, SeekOrigin.Begin);
            using (var lzmaStream = File.Create(archivePath))
            {
                CompressionUtility.Compress(archiveStream, lzmaStream, progress);
            }
        } // close archiveStream
    }

    private void BuildArchive(ZipArchive archive, IProgress<ProgressReport> progress)
    {
        // write the file index: one "destinationPath|source" line per extracted file
        var indexEntry = archive.CreateEntry(IndexFileName, CompressionLevel.NoCompression);
        using (var stream = indexEntry.Open())
        using (var textWriter = new StreamWriter(stream))
        {
            foreach (var entry in destFiles)
            {
                var archiveFile = archiveFiles[entry.Hash];
                string archivePath = archiveFile.ArchivePath;
                if (archiveFile.Stream == null)
                {
                    // external file: record "$<hash>" so Extract resolves it via externalFiles
                    archivePath = "$" + archivePath;
                }
                textWriter.WriteLine($"{entry.DestinationPath}|{archivePath}");
            }
        }

        // sort the files so that similar files are close together
        // BUGFIX: external entries have a null Stream and must not be written to the
        // zip (the original enumerated all values and would NRE on Size/CopyTo).
        var filesToArchive = archiveFiles.Values.Where(af => af.Stream != null).ToList();
        filesToArchive.Sort((f1, f2) =>
        {
            // first sort by extension
            var comp = String.Compare(f1.Extension, f2.Extension, StringComparison.OrdinalIgnoreCase);
            if (comp == 0)
            {
                // then sort by filename
                comp = String.Compare(f1.FileName, f2.FileName, StringComparison.OrdinalIgnoreCase);
            }
            if (comp == 0)
            {
                // sort by file size (helps differentiate ref/lib/facade)
                comp = f1.Size.CompareTo(f2.Size);
            }
            if (comp == 0)
            {
                // finally sort by full archive path so we have stable output
                comp = String.Compare(f1.ArchivePath, f2.ArchivePath, StringComparison.OrdinalIgnoreCase);
            }
            return comp;
        });

        int filesAdded = 0;
        // add all the files
        foreach (var fileToArchive in filesToArchive)
        {
            var entry = archive.CreateEntry(fileToArchive.ArchivePath, CompressionLevel.NoCompression);
            using (var entryStream = entry.Open())
            {
                fileToArchive.Stream.CopyTo(entryStream);
                fileToArchive.Stream.Dispose();
                fileToArchive.Stream = null;
            }
            progress.Report("Archiving files", ++filesAdded, filesToArchive.Count);
        }
    }

    // One extraction step; ensures the destination directory exists before executing.
    private abstract class ExtractOperation
    {
        public ExtractOperation(string destinationPath)
        {
            DestinationPath = destinationPath;
        }
        public string DestinationPath { get; }
        public virtual void DoOperation()
        {
            string directory = Path.GetDirectoryName(DestinationPath);
            if (!Directory.Exists(directory))
            {
                Directory.CreateDirectory(directory);
            }
            Execute();
        }
        protected abstract void Execute();
    }

    // Copies a single source (archive entry or external file) to a destination file.
    private class CopyOperation : ExtractOperation
    {
        public CopyOperation(ExtractSource source, string destinationPath) : base(destinationPath)
        {
            Source = source;
        }
        public ExtractSource Source { get; }
        protected override void Execute()
        {
            if (Source.LocalPath != null)
            {
                File.Copy(Source.LocalPath, DestinationPath, true);
            }
            else
            {
                using (var destinationStream = File.Create(DestinationPath))
                {
                    Source.CopyToStream(destinationStream);
                }
            }
        }
    }

    // Recreates a zip file at the destination from (entryName, source) pairs.
    private class ZipOperation : ExtractOperation
    {
        public ZipOperation(string destinationPath) : base(destinationPath)
        {
        }
        private List<Tuple<string, ExtractSource>> entries = new List<Tuple<string, ExtractSource>>();
        public void AddEntry(string entryName, ExtractSource source)
        {
            entries.Add(Tuple.Create(entryName, source));
        }
        protected override void Execute()
        {
            using (var archiveStream = File.Create(DestinationPath))
            using (var archive = new ZipArchive(archiveStream, ZipArchiveMode.Create))
            {
                foreach (var zipSource in entries)
                {
                    var entry = archive.CreateEntry(zipSource.Item1, CompressionLevel.Optimal);
                    using (var entryStream = entry.Open())
                    {
                        zipSource.Item2.CopyToStream(entryStream);
                    }
                }
            }
        }
    }

    // Resolves an index "source" string to either an external file ("$<hash>")
    // or an entry in the container archive.
    private class ExtractSource
    {
        private string _entryName;
        private string _localPath;
        private ThreadLocalZipArchive _archive;
        public ExtractSource(string sourceString, Dictionary<string, string> externalFiles, ThreadLocalZipArchive archive)
        {
            if (sourceString[0] == '$')
            {
                var externalHash = sourceString.Substring(1);
                if (!externalFiles.TryGetValue(externalHash, out _localPath))
                {
                    // BUGFIX: the original string lacked the '$' interpolation prefix,
                    // so the literal text "{externalHash}" was reported instead of the hash.
                    throw new Exception($"Could not find external file with hash {externalHash}.");
                }
            }
            else
            {
                _entryName = sourceString;
                _archive = archive;
            }
        }
        public string LocalPath { get { return _localPath; } }
        public void CopyToStream(Stream destinationStream)
        {
            if (_localPath != null)
            {
                using (var sourceStream = File.OpenRead(_localPath))
                {
                    sourceStream.CopyTo(destinationStream);
                }
            }
            else
            {
                // we read through a per-thread archive since ZipArchive is not thread
                // safe and we want to be able to extract from many threads
                using (var sourceStream = _archive.Archive.GetEntry(_entryName).Open())
                {
                    sourceStream.CopyTo(destinationStream);
                    var destinationFileStream = destinationStream as FileStream;
                    if (destinationFileStream != null)
                    {
                        // Set Local path so that the next copy operation using the same source will
                        // do a copy instead of a write.
                        _localPath = destinationFileStream.Name;
                    }
                }
            }
        }
    }

    private static char[] pipeSeparator = new[] { '|' };

    /// <summary>
    /// Decompresses compressedArchivePath, reads its index, and recreates the
    /// original file layout under outputDirectory (in parallel).
    /// </summary>
    public void Extract(string compressedArchivePath, string outputDirectory, IProgress<ProgressReport> progress)
    {
        using (var archiveStream = CreateTemporaryFileStream())
        {
            // decompress the LZMA stream
            using (var lzmaStream = File.OpenRead(compressedArchivePath))
            {
                CompressionUtility.Decompress(lzmaStream, archiveStream, progress);
            }

            var archivePath = archiveStream.Name;

            // reset the uncompressed stream
            archiveStream.Seek(0, SeekOrigin.Begin);

            // read as a zip archive
            using (var archive = new ZipArchive(archiveStream, ZipArchiveMode.Read))
            using (var tlArchive = new ThreadLocalZipArchive(archivePath, archive))
            {
                List<ExtractOperation> extractOperations = new List<ExtractOperation>();
                Dictionary<string, ExtractSource> sourceCache = new Dictionary<string, ExtractSource>();

                // process the index to determine all extraction operations
                var indexEntry = archive.GetEntry(IndexFileName);
                using (var indexReader = new StreamReader(indexEntry.Open()))
                {
                    Dictionary<string, ZipOperation> zipOperations = new Dictionary<string, ZipOperation>(StringComparer.OrdinalIgnoreCase);
                    for (var line = indexReader.ReadLine(); line != null; line = indexReader.ReadLine())
                    {
                        var lineParts = line.Split(pipeSeparator);
                        if (lineParts.Length != 2)
                        {
                            // BUGFIX: the original message blamed "too many '|'s" even though
                            // a line with no separator at all also lands here.
                            throw new Exception("Unexpected index line format, expected 'destinationPath|source'.");
                        }
                        string target = lineParts[0];
                        string source = lineParts[1];

                        ExtractSource extractSource;
                        if (!sourceCache.TryGetValue(source, out extractSource))
                        {
                            sourceCache[source] = extractSource = new ExtractSource(source, externalFiles, tlArchive);
                        }

                        // "zipPath::entryName" targets are entries to recreate inside a zip
                        var zipSeparatorIndex = target.IndexOf("::", StringComparison.OrdinalIgnoreCase);
                        if (zipSeparatorIndex != -1)
                        {
                            string zipRelativePath = target.Substring(0, zipSeparatorIndex);
                            string zipEntryName = target.Substring(zipSeparatorIndex + 2);
                            string destinationPath = Path.Combine(outputDirectory, zipRelativePath);

                            // operations on a zip file will be sequential
                            ZipOperation currentZipOperation;
                            if (!zipOperations.TryGetValue(destinationPath, out currentZipOperation))
                            {
                                extractOperations.Add(currentZipOperation = new ZipOperation(destinationPath));
                                zipOperations.Add(destinationPath, currentZipOperation);
                            }
                            currentZipOperation.AddEntry(zipEntryName, extractSource);
                        }
                        else
                        {
                            string destinationPath = Path.Combine(outputDirectory, target);
                            extractOperations.Add(new CopyOperation(extractSource, destinationPath));
                        }
                    }
                }

                int opsExecuted = 0;
                // execute all operations in parallel
                extractOperations.AsParallel().ForAll(extractOperation =>
                {
                    extractOperation.DoOperation();
                    progress.Report("Expanding", Interlocked.Increment(ref opsExecuted), extractOperations.Count);
                });
            }
        }
    }

    /// <summary>
    /// Registers every file under externalDirectory as a de-duplication source
    /// whose content does not need to be stored in the archive.
    /// </summary>
    public void AddExternalDirectory(string externalDirectory)
    {
        CheckDisposed();
        foreach (var externalFile in Directory.EnumerateFiles(externalDirectory, "*", SearchOption.AllDirectories))
        {
            AddExternalFile(externalFile);
        }
    }

    public void AddExternalFile(string externalFile)
    {
        CheckDisposed();
        using (var fs = File.OpenRead(externalFile))
        {
            string hash = GetHash(fs);
            // A null stream marks this entry as external; BuildArchive adds the "$"
            // prefix when it writes the index.
            // BUGFIX: the original stored "$" + hash here AND prefixed again in
            // BuildArchive, producing "$$<hash>" index entries Extract could not resolve.
            archiveFiles[hash] = new ArchiveFileInfo(null, hash, hash);
            externalFiles[hash] = externalFile;
        }
    }

    /// <summary>
    /// Adds all files under sourceDirectory (optionally remapped under
    /// destinationDirectory), expanding zip-like files entry-by-entry.
    /// </summary>
    public void AddDirectory(string sourceDirectory, IProgress<ProgressReport> progress, string destinationDirectory = null)
    {
        CheckDisposed();
        var sourceFiles = Directory.EnumerateFiles(sourceDirectory, "*", SearchOption.AllDirectories).ToArray();
        int filesAdded = 0;
        sourceFiles.AsParallel().ForAll(sourceFile =>
        {
            string destinationPath = sourceFile.Substring(sourceDirectory.Length + 1);
            if (destinationDirectory != null)
            {
                destinationPath = Path.Combine(destinationDirectory, destinationPath);
            }

            string extension = Path.GetExtension(sourceFile);
            if (ZipExtensions.Any(ze => ze.Equals(extension, StringComparison.OrdinalIgnoreCase)))
            {
                AddZip(sourceFile, destinationPath);
            }
            else
            {
                AddFile(sourceFile, destinationPath);
            }

            // BUGFIX: the original used ++filesAdded from parallel threads, which can
            // lose increments; use Interlocked as the Extract path already does.
            progress.Report($"Adding {sourceDirectory}", Interlocked.Increment(ref filesAdded), sourceFiles.Length);
        });
    }

    public void AddZip(string sourceZipFile, string destinationZipFile)
    {
        using (var sourceArchive = new ZipArchive(File.OpenRead(sourceZipFile), ZipArchiveMode.Read))
        {
            foreach (var entry in sourceArchive.Entries)
            {
                // we can dispose this stream, if AddStream uses it, it will make a copy.
                using (var stream = entry.Open())
                {
                    string destinationPath = $"{destinationZipFile}::{entry.FullName}";
                    AddStream(stream, destinationPath);
                }
            }
        }
    }

    public void AddFile(string sourceFilePath, string destinationPath)
    {
        // lifetime of this stream is managed by AddStream; open read-only since
        // the archive never writes back to source files.
        var stream = File.Open(sourceFilePath, FileMode.Open, FileAccess.Read);
        AddStream(stream, destinationPath);
    }

    /// <summary>
    /// Records stream's content for destinationPath, de-duplicating by SHA256.
    /// Takes ownership of stream: it is either retained for archiving or disposed.
    /// </summary>
    public void AddStream(Stream stream, string destinationPath)
    {
        CheckDisposed();
        string hash = null;
        if (stream.CanSeek)
        {
            hash = GetHash(stream);
        }
        else
        {
            // non-seekable input (e.g. a zip entry): copy to a temp stream so we
            // can hash it now and archive it later
            var copy = CreateTemporaryStream();
#if NET45
            hash = CopyWithHash(stream, copy);
#else
            stream.CopyTo(copy);
            copy.Seek(0, SeekOrigin.Begin);
            hash = GetHash(copy);
#endif
            stream.Dispose();
            stream = copy;
        }

        lock (archiveFiles)
        {
            destFiles.Add(new DestinationFileInfo(destinationPath, hash));

            // see if we already have this file in the archive/external
            ArchiveFileInfo existing = null;
            if (archiveFiles.TryGetValue(hash, out existing))
            {
                // reduce memory pressure
                if (!(stream is MemoryStream) && (existing.Stream is MemoryStream))
                {
                    // dispose memory stream
                    existing.Stream.Dispose();
                    stream.Seek(0, SeekOrigin.Begin);
                    existing.Stream = stream;
                }
                else
                {
                    // we already have a good stream, free this one.
                    stream.Dispose();
                }
            }
            else
            {
                // add a new entry;
                stream.Seek(0, SeekOrigin.Begin);
                var archivePath = Path.Combine(hash, Path.GetFileName(destinationPath));
                archiveFiles.Add(hash, new ArchiveFileInfo(stream, archivePath, hash));
            }
        }
    }

#if NET45
    /// <summary>
    /// Calculates the hash while copying the file to avoid multiple reads
    /// </summary>
    private const int _DefaultCopyBufferSize = 81920;
    public string CopyWithHash(Stream source, Stream destination)
    {
        byte[] buffer = new byte[_DefaultCopyBufferSize];
        int read;
        while ((read = source.Read(buffer, 0, buffer.Length)) != 0)
        {
            sha.Value.TransformBlock(buffer, 0, read, null, 0);
            destination.Write(buffer, 0, read);
        }
        sha.Value.TransformFinalBlock(buffer, 0, 0);
        var hash = sha.Value.Hash;
        // follow pattern in ComputeHash(stream) where it re-initializes after finishing.
        sha.Value.Initialize();
        return GetHashString(hash);
    }
#endif

    /// <summary>
    /// Computes the lowercase-hex SHA256 of stream from its current position.
    /// </summary>
    public string GetHash(Stream stream)
    {
        var hashBytes = sha.Value.ComputeHash(stream);
        return GetHashString(hashBytes);
    }

    private static string GetHashString(byte[] hashBytes)
    {
        StringBuilder builder = new StringBuilder(hashBytes.Length * 2);
        foreach (var b in hashBytes)
        {
            builder.AppendFormat("{0:x2}", b);
        }
        return builder.ToString();
    }

    public void Dispose()
    {
        if (!disposed)
        {
            if (archiveFiles != null)
            {
                foreach (var archiveFile in archiveFiles.Values)
                {
                    if (archiveFile.Stream != null)
                    {
                        archiveFile.Stream.Dispose();
                        archiveFile.Stream = null;
                    }
                }
            }
            if (sha != null)
            {
                sha.Dispose();
                sha = null;
            }
            // BUGFIX: the original never set this flag, so CheckDisposed could never
            // throw and a second Dispose would re-run cleanup.
            disposed = true;
        }
    }

    private void CheckDisposed()
    {
        if (disposed)
        {
            throw new ObjectDisposedException(nameof(IndexedArchive));
        }
    }
}
}

View file

@ -0,0 +1,30 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Microsoft.DotNet.Archive
{
// A single progress notification: how many units of a named phase have completed.
public struct ProgressReport
{
// Name of the operation being reported (e.g. "Compressing", "Expanding").
public string Phase;
// Units completed so far within the phase.
public long Ticks;
// Total units expected for the phase.
public long Total;
}
public static class ProgressReportExtensions
{
    /// <summary>
    /// Convenience overload: packs the phase/ticks/total values into a
    /// ProgressReport and forwards it to the underlying progress sink.
    /// </summary>
    public static void Report(this IProgress<ProgressReport> progress, string phase, long ticks, long total)
    {
        var report = new ProgressReport();
        report.Phase = phase;
        report.Ticks = ticks;
        report.Total = total;
        progress.Report(report);
    }
}
}

View file

@ -0,0 +1,58 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
namespace Microsoft.DotNet.Archive
{
// wraps ThreadLocal<ZipArchive> and exposes Dispose semantics that dispose all archives
// Wraps ThreadLocal<ZipArchive> and exposes Dispose semantics that dispose all
// archives.  ZipArchive is not thread safe, so each thread that reads from the
// archive gets its own instance opened over the same file.
class ThreadLocalZipArchive : IDisposable
{
    private ThreadLocal<ZipArchive> _archive;
    private bool disposed = false;

    public ThreadLocalZipArchive(string archivePath, ZipArchive local = null)
    {
        // trackAllValues lets Dispose enumerate (and close) every thread's archive.
        _archive = new ThreadLocal<ZipArchive>(() =>
                        new ZipArchive(File.Open(archivePath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite | FileShare.Delete), ZipArchiveMode.Read),
                        trackAllValues:true);
        if (local != null)
        {
            // reuse provided one for current thread
            _archive.Value = local;
        }
    }

    // The archive for the calling thread (created lazily on first access).
    public ZipArchive Archive { get { return _archive.Value; } }

    public void Dispose()
    {
        if (!disposed)
        {
            if (_archive != null)
            {
                // dispose all archives created for any thread
                if (_archive.Values != null)
                {
                    foreach (var value in _archive.Values)
                    {
                        if (value != null)
                        {
                            value.Dispose();
                        }
                    }
                }
                // dispose ThreadLocal
                _archive.Dispose();
                _archive = null;
            }
            // BUGFIX: the original never set this flag, so the disposed guard had no
            // effect and a second Dispose would re-run cleanup.
            disposed = true;
        }
    }
}
}

View file

@ -8,7 +8,12 @@
"NETStandard.Library": "1.6.0-rc3-24201-00"
},
"frameworks": {
"netstandard1.0": {}
"net45": {},
"netstandard1.3": {
"dependencies": {
"System.Linq.Parallel": "4.0.1-rc3-24201-00"
}
}
},
"scripts": {}
}