211 lines
7 KiB
Python
211 lines
7 KiB
Python
|
#! /usr/bin/env python3
|
||
|
|
||
|
#
|
||
|
# Extract files from Bare git-annex repositories without git-annex
|
||
|
# Supports version v6
|
||
|
#
|
||
|
# See internals: http://git-annex.branchable.com/internals/
|
||
|
#
|
||
|
# Modified: added non-bare repos, added tar file (of symlinks) output for use with archivemount
|
||
|
#
|
||
|
# TODO: improve output
|
||
|
# TODO: use cat-file instead of archive
|
||
|
# TODO: export to tar WITH relative links
|
||
|
#
|
||
|
# Emanuele Ruffaldi (C) 2016
|
||
|
import sys,argparse,os,subprocess
|
||
|
import tarfile,hashlib,struct
|
||
|
import tarfile,io,hashlib,struct
|
||
|
from hashlib import md5
|
||
|
|
||
|
def gitgetpathinfo(branch,path,recurse=False):
    """List the entries of ``path`` inside ``branch`` (or any tree-ish) via ``git ls-tree``.

    Args:
        branch: branch name or other tree-ish handed to ``git ls-tree``.
        path: path inside the tree to list.
        recurse: when true, ``-r`` is added so sub-trees are descended into.

    Returns:
        One list per non-empty output line, produced by splitting the line on
        TAB. ``git ls-tree`` prints ``<mode> SP <type> SP <object> TAB <file>``,
        so element 0 is the metadata and element 1 the file name.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status.
    """
    command = ["git", "ls-tree"]
    if recurse:
        # Only add the flag when requested: an empty-string placeholder would
        # be taken by git as the tree-ish argument and make the call fail.
        command.append("-r")
    command += [branch, "--", path]
    listing = subprocess.check_output(command).decode("utf-8")
    return [entry.split("\t") for entry in listing.split("\n") if entry != ""]
|
||
|
|
||
|
def tarextraclink(content):
|
||
|
"""extracts the path of a link in a Tar expressed by content"""
|
||
|
t = tarfile.open(mode="r",fileobj=io.BytesIO(content))
|
||
|
ti = t.getmembers()[0]
|
||
|
return ti.linkname
|
||
|
|
||
|
def gitgetfile(branch,path):
    """Return the content of ``path`` in ``branch`` by piping ``git archive`` into ``tar -xO``.

    Original author's note: this is preferred over the ``git show`` approach,
    but it does not work with symbolic links.

    Args:
        branch: branch name or other tree-ish.
        path: path of the file inside the tree; it is split into directory and
            file name, and the directory becomes part of the tree-ish.

    Returns:
        The bytes that ``tar -xO`` writes to standard output.

    Raises:
        subprocess.CalledProcessError: if ``tar`` exits with a non-zero status.
    """
    directory,name = os.path.split(path)
    # Arguments are passed as lists (no shell), so names containing quotes,
    # '$' or backticks cannot break or alter the command.
    archive = subprocess.Popen(
        ["git", "archive", "--format=tar", "--prefix=", "%s:%s" % (branch,directory), name],
        stdout=subprocess.PIPE)
    try:
        return subprocess.check_output(["tar", "-xO", name], stdin=archive.stdout)
    finally:
        # Release our end of the pipe and reap git so no zombie is left behind.
        archive.stdout.close()
        archive.wait()
|
||
|
|
||
|
def gitgetfile_tar(branch,path):
    """Return ``path`` from ``branch`` as a tar archive produced by ``git archive``.

    Args:
        branch: branch name or other tree-ish.
        path: path of the file inside the tree; it is split into directory and
            file name, and the directory becomes part of the tree-ish.

    Returns:
        The raw tar bytes, or ``None`` when git cannot be run or exits with a
        non-zero status (for example when the path does not exist).
    """
    directory,name = os.path.split(path)
    # Argument list instead of a shell string: no quoting problems with
    # unusual file names.
    command = ["git", "archive", "--format=tar", "--prefix=", "%s:%s" % (branch,directory), name]
    try:
        return subprocess.check_output(command)
    except (subprocess.CalledProcessError, OSError):
        # Best effort by design: the caller reports a missing entry itself.
        return None
|
||
|
|
||
|
def gitgetfile_show(branch,path):
    """Fetch the content of ``path``: look up its object id, then run ``git show`` on it.

    Returns the bytes printed by ``git show``, or ``None`` unless the lookup
    yields exactly one entry.
    """
    entries = gitgetpathinfo(branch,path)
    if len(entries) != 1:
        return None
    # Metadata is space separated; the third field is the object id.
    metadata = entries[0][0]
    object_id = metadata.split(" ")[2]
    return subprocess.check_output(["git", "show", object_id])
|
||
|
|
||
|
def annexgetremotes(useshow):
    """Return the content of ``uuid.log`` (the list of remotes) from the ``git-annex`` branch.

    ``useshow`` selects the ``git show`` based reader; otherwise the
    archive-based reader is used (slow with bare repositories).
    """
    reader = gitgetfile_show if useshow else gitgetfile
    return reader("git-annex","uuid.log")
|
||
|
|
||
|
#https://gist.github.com/giomasce/a7802bda1417521c5b30
|
||
|
def hashdirlower(key):
    """Build the two-level directory prefix from the hex MD5 of ``key``.

    ``key`` is a string and is hashed as UTF-8. The result is
    ``"xxx/yyy/"``, made of the first six hex digits of the digest.
    """
    hexdigest = hashlib.md5(key.encode("utf-8")).hexdigest()
    first, second = hexdigest[:3], hexdigest[3:6]
    return first + "/" + second + "/"
|
||
|
|
||
|
#https://gist.github.com/giomasce/a7802bda1417521c5b30
|
||
|
def hashdirmixed(key):
    """Build the mixed-case two-level directory prefix from the MD5 of ``key``.

    The first four digest bytes are read as a little-endian 32-bit word. Four
    5-bit values are taken from it at 6-bit strides and mapped through a
    32-character alphabet; the letters are then emitted pairwise swapped.

    Args:
        key: the annex key, as ``str`` (hashed as UTF-8) or ``bytes``.

    Returns:
        A string of the form ``"ab/cd/"``.
    """
    if isinstance(key, str):
        # hashlib only accepts bytes; callers in this file pass str keys.
        key = key.encode("utf-8")
    digest = hashlib.md5(key).digest()
    first_word = struct.unpack('<I', digest[:4])[0]
    alphabet = "0123456789zqjxkmvwgpfZQJXKMVWGPF"
    letters = [alphabet[(first_word >> (6 * x)) & 31] for x in range(4)]
    return "%s%s/%s%s/" % (letters[1], letters[0], letters[3], letters[2])
|
||
|
|
||
|
def annexwhereis_bare(key):
    """Locate ``key`` in a bare repository.

    Returns a pair: the content of the key's ``.log`` file read from the
    ``git-annex`` branch, and the relative path of the object file under
    ``annex/objects``. Both use the lower-case hash directory.
    """
    # hashdirlower is used for bare git repositories, the git-annex branch,
    # and on special remotes as well.
    prefix = hashdirlower(key)
    log_name = os.path.join(prefix,key+".log")
    locations = gitgetfile("git-annex",log_name)
    object_path = os.path.join("annex","objects",prefix,key,key)
    return locations,object_path
|
||
|
|
||
|
def annexwhereis(key):
    """Locate ``key`` in a non-bare repository.

    Args:
        key: the annex key as a string.

    Returns:
        A pair: the content of the key's ``.log`` file read from the
        ``git-annex`` branch (stored under the lower-case hash directory), and
        the relative path of the object file under ``annex/objects`` (stored
        under the mixed-case hash directory).
    """
    # Non-bare repositories use hashdirmixed for the object store: the first
    # 32-bit word of the key's md5 is encoded with the 32-character alphabet
    # "0123456789zqjxkmvwgpfZQJXKMVWGPF"; the first two letters form the first
    # directory and the next two the second one.
    # The md5 is computed over bytes, so encode the string key explicitly.
    mixed_dir = hashdirmixed(key.encode("utf-8"))
    lower_dir = hashdirlower(key)
    locations = gitgetfile("git-annex",os.path.join(lower_dir,key+".log"))
    return locations,os.path.join("annex","objects",mixed_dir,key,key)
|
||
|
|
||
|
def checkbare(args):
    """Work out whether ``args.annex`` is a bare or a non-bare annex repository.

    Returns ``(False, <annex>/.git)`` when a ``.git`` directory containing an
    ``annex`` directory exists, ``(True, <annex>)`` when there is no ``.git``
    directory but an ``annex`` directory sits directly inside, and ``None``
    in every other case.
    """
    root = args.annex
    dotgit = os.path.join(root,".git")
    if os.path.isdir(dotgit):
        # Non-bare layout: the annex data must be inside .git.
        if os.path.isdir(os.path.join(dotgit,"annex")):
            return False,dotgit
        return None
    if os.path.isdir(os.path.join(root,"annex")):
        # Bare layout: the repository directory is the git dir itself.
        return True,root
    return None
|
||
|
|
||
|
def _locate_object(path,isbare,workdir):
    """Return the on-disk file holding an annexed object, or None after reporting why it is missing."""
    if os.path.isfile(path):
        return path
    if isbare:
        print("file not found",path) # or direct mode not supported
        return None
    mapfile = path+".map"
    if not os.path.isfile(mapfile):
        print("non bare file not found",path) # or direct mode not supported
        return None
    # The .map file holds a path relative to the work tree.
    with open(mapfile,"r") as handle:
        mapped = os.path.join(workdir,handle.read().strip())
    if os.path.isfile(mapped):
        return mapped
    print("mapped file not found",mapped," for ",path) # or direct mode not supported
    return None


def _symlink_entry(name,target,stat_result,make_abs):
    """Build a tar symlink member called ``name`` that points at ``target``."""
    entry = tarfile.TarInfo(name)
    entry.size = 0 # zero for links, not stat_result.st_size
    entry.mode = stat_result.st_mode
    entry.mtime = stat_result.st_mtime
    entry.type = tarfile.SYMTYPE
    entry.uid = stat_result.st_uid
    entry.gid = stat_result.st_gid
    entry.linkname = os.path.abspath(target) if make_abs else target
    return entry


def main():
    """Command-line entry point.

    Resolves each requested path of the chosen branch to the annex object file
    that backs it. With ``--tar`` a tar archive of symbolic links to those
    object files is written; without it the resolved locations are printed.
    Exits with status 1 when ``--annex`` is not a git-annex repository.
    """
    parser = argparse.ArgumentParser(description='Retrieve file from git-annex, even barebone')
    parser.add_argument('--annex', help="path to annex repository",default=".")
    parser.add_argument('path', help="file to be looked at",nargs="*")
    parser.add_argument('--all', help="list all",action="store_true")
    parser.add_argument('--verbose', help="verbose dump",action="store_true")
    parser.add_argument('--tar', help="produces a tar file with given path containing the symbolic links")
    parser.add_argument('--abs',help="makes abs files",action="store_true")
    parser.add_argument('--branch', help="branch (tree-ish) holding the annexed files",default="edge")

    args = parser.parse_args()

    # check if bare repository
    detected = checkbare(args)
    if detected is None:
        print("not a git-annex repository")
        sys.exit(1)
    isbare,gitdir = detected
    print("isbare?",isbare,gitdir)

    workdir = None if isbare else args.annex

    # Every git command below operates on this repository.
    os.environ["GIT_DIR"] = gitdir
    print("list annexes\n",annexgetremotes(useshow=False))

    ot = tarfile.open(args.tar,"w") if args.tar else None
    try:
        if args.all:
            args.path = [x[1] for x in gitgetpathinfo(args.branch,".",recurse=True)]

        for p in args.path:
            # Annexed files are symlinks in the tree, so fetch the entry as a
            # one-member tar and read the link target from it.
            ww = gitgetfile_tar(args.branch,p)
            if ww is None:
                print("not found",p)
                continue
            link = tarextraclink(ww)
            if args.verbose:
                print("aslink",link)
            if link == "":
                print("not found",p)
                continue

            key = os.path.split(link)[1] # the link contains the annex key
            if args.verbose:
                print("key is",key)
            if isbare:
                locations,path = annexwhereis_bare(key)
            else:
                locations,path = annexwhereis(key)
            # NOTE(review): indentation was lost in the reviewed copy; the join
            # is applied to both layouts so a non-default --annex resolves.
            path = os.path.join(gitdir,path)

            if args.verbose:
                print(p,"located in\n",locations)

            path = _locate_object(path,isbare,workdir)
            if path is None:
                continue

            ss = os.stat(path)
            if args.verbose:
                print(path,ss)
            if ot is None:
                # No archive requested: report where the content lives.
                print(p,"->",path)
                continue
            ot.addfile(_symlink_entry(p,path,ss,args.abs))
    finally:
        # Closing writes the end-of-archive blocks; without it the tar is truncated.
        if ot is not None:
            ot.close()
|
||
|
|
||
|
if __name__ == '__main__':
|
||
|
main()
|