From 0f51180097d5d0c6a5e72ff3f32cd0e5ac80bcc5 Mon Sep 17 00:00:00 2001 From: Antoine Martin Date: Tue, 13 Aug 2024 13:08:18 -0400 Subject: [PATCH] Add source information --- README.md | 32 ++++++++ annexgetfile.py | 210 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 242 insertions(+) create mode 100644 README.md create mode 100644 annexgetfile.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..783fdde --- /dev/null +++ b/README.md @@ -0,0 +1,32 @@ +# annexgetfile +Upstream: https://ayakael.net/forge/annexgetfile + +## Description + +Script to extract name and location information out of git-annex repo + +Based on: https://gist.github.com/eruffaldi/924f6b53a63dede6e59f + +## How to use + + +```shell +usage: annexgetfile.py [-h] [--annex ANNEX] [--all] [--verbose] [--tar TAR] [--abs] [path ...] + +Retrieve file from git-annex, even barebone + +positional arguments: + path file to be looked at + +options: + -h, --help show this help message and exit + --annex ANNEX path to annex repository + --all list all + --verbose verbose dump + --tar TAR produces a tar file with given path cotaining the symbolic links + --abs makes abs files + +``` + +## License +This readme, abuilds and support scripts are licensed under MIT License. diff --git a/annexgetfile.py b/annexgetfile.py new file mode 100644 index 0000000..b12bcb3 --- /dev/null +++ b/annexgetfile.py @@ -0,0 +1,210 @@ +#! 
#!/usr/bin/env python3

#
# Extract files from Bare git-annex repositories without git-annex
# Supports version v6
#
# See internals: http://git-annex.branchable.com/internals/
#
# Modified: added non-bare repos, added tar file (of symlinks) output for use with archivemount
#
# TODO: improve output
# TODO: use cat-files instead of archive
# TODO: export to tar WITH relative links
#
# Emanuele Ruffaldi (C) 2016
import argparse
import hashlib
import io
import os
import struct
import subprocess
import sys
import tarfile
from hashlib import md5  # noqa: F401 -- unused here, kept so the module namespace is unchanged

# Branch (tree-ish) whose files are looked up by main().
DEFAULT_BRANCH = "edge"

# Alphabet used by hashdirmixed to turn 5-bit groups into directory letters.
_MIXED_ALPHABET = "0123456789zqjxkmvwgpfZQJXKMVWGPF"


def gitgetpathinfo(branch, path, recurse=False):
    """Use ``git ls-tree`` to describe *path* inside *branch* (or any tree-ish).

    Returns a list of ``[meta, filename]`` pairs, one per output line, where
    ``meta`` is the part before the TAB (``mode SPACE type SPACE hash``).
    Raises ``subprocess.CalledProcessError`` when git fails.
    """
    cmd = ["git", "ls-tree"]
    if recurse:
        # Only add the flag when requested: an empty-string argument would be
        # taken by git as the tree-ish and make the call fail.
        cmd.append("-r")
    cmd += [branch, "--", path]
    listing = subprocess.check_output(cmd).decode("utf-8")
    return [line.split("\t") for line in listing.split("\n") if line != ""]


def tarextraclink(content):
    """Return the link target of the first member of the tar archive *content* (bytes)."""
    with tarfile.open(mode="r", fileobj=io.BytesIO(content)) as archive:
        return archive.getmembers()[0].linkname


def _git_archive(branch, path):
    """Run ``git archive`` for the single entry *path* of *branch*; return the tar bytes."""
    parent, name = os.path.split(path)
    # Argument list instead of a shell string: names with quotes, spaces or
    # "$" are passed through untouched and cannot inject shell commands.
    return subprocess.check_output(
        ["git", "archive", "--format=tar", "--prefix=", "%s:%s" % (branch, parent), name]
    )


def gitgetfile(branch, path):
    """Return the content (bytes) of *path* in *branch*, extracted via ``git archive``.

    This is better than the ``git show`` solution because it deals with diff
    automatically, but it does not work with symbolic links: link members
    carry no data, so the result is then empty.
    Raises ``subprocess.CalledProcessError`` when git fails.
    """
    with tarfile.open(mode="r", fileobj=io.BytesIO(_git_archive(branch, path))) as archive:
        chunks = []
        for member in archive.getmembers():
            if member.isfile():
                chunks.append(archive.extractfile(member).read())
    return b"".join(chunks)


def gitgetfile_tar(branch, path):
    """Return *path* of *branch* as a tar archive (bytes), or ``None`` if git cannot produce it."""
    try:
        return _git_archive(branch, path)
    except (subprocess.CalledProcessError, OSError):
        # Best effort: the caller reports the path as "not found".
        return None


def gitgetfile_show(branch, path):
    """Return the content of *path*: look up its hash with ls-tree, then ``git show`` it.

    Returns ``None`` unless ls-tree reports exactly one entry.
    """
    found = gitgetpathinfo(branch, path)
    if len(found) != 1:
        return None
    object_hash = found[0][0].split(" ")[2]  # meta is "mode type hash"
    return subprocess.check_output(["git", "show", object_hash])


def annexgetremotes(useshow):
    """Return the list of remotes, i.e. the content of ``uuid.log`` on the git-annex branch."""
    if useshow:
        return gitgetfile_show("git-annex", "uuid.log")
    # slow with bare
    return gitgetfile("git-annex", "uuid.log")


# https://gist.github.com/giomasce/a7802bda1417521c5b30
def hashdirlower(key):
    """Return the ``"xxx/yyy/"`` directory prefix: first six hex digits of md5(key)."""
    digest = hashlib.md5(key.encode("utf-8")).hexdigest()
    return "%s/%s/" % (digest[:3], digest[3:6])


# https://gist.github.com/giomasce/a7802bda1417521c5b30
def hashdirmixed(key):
    """Return the ``"XX/YY/"`` mixed-case directory prefix for *key*.

    Only the first 32-bit word of md5(key) is used. Four 5-bit groups are
    taken from it (at bit offsets 0, 6, 12 and 18) and mapped onto the
    32-character alphabet; the first two letters form the first directory
    and the next two the second one.
    """
    digest = hashlib.md5(key.encode("utf-8")).digest()
    first_word = struct.unpack("<I", digest[:4])[0]
    nums = [first_word >> (6 * x) & 31 for x in range(4)]
    letters = [_MIXED_ALPHABET[i] for i in nums]
    return "%s%s/%s%s/" % (letters[1], letters[0], letters[3], letters[2])


def annexwhereis_bare(key):
    """Return ``(location_log, object_path)`` for *key* in a bare repository.

    hashdirlower is used for bare git repositories, the git-annex branch,
    and on special remotes as well. ``object_path`` is relative to the git
    directory.
    """
    papa = hashdirlower(key)
    log = gitgetfile("git-annex", os.path.join(papa, key + ".log"))
    return log, os.path.join("annex", "objects", papa, key, key)


def annexwhereis(key):
    """Return ``(location_log, object_path)`` for *key* in a non-bare repository.

    The location log on the git-annex branch is still addressed with
    hashdirlower; the object itself lives under the hashdirmixed prefix.
    ``object_path`` is relative to the git directory.
    """
    papa_mixed = hashdirmixed(key)
    papa_lower = hashdirlower(key)
    log = gitgetfile("git-annex", os.path.join(papa_lower, key + ".log"))
    return log, os.path.join("annex", "objects", papa_mixed, key, key)


def checkbare(args):
    """Detect the repository layout under ``args.annex``.

    Returns ``(False, gitdir)`` for a working tree whose ``.git`` contains an
    ``annex`` directory, ``(True, gitdir)`` for a bare repository (an
    ``annex`` directory directly inside ``args.annex``) and ``None`` when
    neither is found.
    """
    gitdir = os.path.join(args.annex, ".git")
    if os.path.isdir(gitdir):
        if os.path.isdir(os.path.join(gitdir, "annex")):
            return False, gitdir
        return None
    if os.path.isdir(os.path.join(args.annex, "annex")):
        return True, args.annex
    return None


def _locate_object(path, isbare, workdir):
    """Return the existing on-disk file for annex object *path*, or ``None`` (after reporting)."""
    if os.path.isfile(path):
        return path
    if isbare:
        print("file not found", path)  # or direct mode not supported
        return None
    if not os.path.isfile(path + ".map"):
        print("non bare file not found", path)  # or direct mode not supported
        return None
    with open(path + ".map", "r") as mapfile:
        mpath = os.path.join(workdir, mapfile.read().strip())
    if os.path.isfile(mpath):
        return mpath
    print("mapped file not found", mpath, " for ", path)  # or direct mode not supported
    return None


def _symlink_tarinfo(name, target, ss, absolute):
    """Build a symlink ``TarInfo`` called *name* pointing at *target*, with metadata from stat *ss*."""
    ti = tarfile.TarInfo(name)
    ti.size = 0  # zero for links: ss.st_size
    ti.mode = ss.st_mode
    ti.mtime = ss.st_mtime
    ti.type = tarfile.SYMTYPE
    ti.uid = ss.st_uid
    ti.gid = ss.st_gid
    ti.linkname = os.path.abspath(target) if absolute else target
    return ti


def _process_path(p, args, isbare, gitdir, workdir, ot):
    """Resolve one repository path *p* to its annex object and record it in tar *ot* (if any)."""
    ww = gitgetfile_tar(DEFAULT_BRANCH, p)  # tarred 1 file
    if ww is None:
        print("not found", p)
        return
    link = tarextraclink(ww)  # extract the link from the single file
    if args.verbose:
        print("aslink", link)
    if link == "":
        print("not found", p)
        return

    key = os.path.split(link)[1]  # the link contains the annex key
    if args.verbose:
        print("key is", key)
    locations, path = annexwhereis_bare(key) if isbare else annexwhereis(key)
    path = os.path.join(gitdir, path)
    if args.verbose:
        print(p, "located in\n", locations)

    path = _locate_object(path, isbare, workdir)
    if path is None:
        return
    ss = os.stat(path)
    if args.verbose:
        print(path, ss)
    if ot is not None:  # --tar is optional
        ot.addfile(_symlink_tarinfo(p, path, ss, args.abs))


def main(argv=None):
    """Command-line entry point.

    *argv* is the argument list to parse (``None`` means ``sys.argv[1:]``).
    Returns the process exit status: 0 on success, 1 when ``--annex`` does
    not point at a git-annex repository.
    """
    parser = argparse.ArgumentParser(description='Retrieve file from git-annex, even barebone')
    parser.add_argument('--annex', help="path to annex repository", default=".")
    parser.add_argument('path', help="file to be looked at", nargs="*")
    parser.add_argument('--all', help="list all", action="store_true")
    parser.add_argument('--verbose', help="verbose dump", action="store_true")
    parser.add_argument('--tar', help="produces a tar file with given path cotaining the symbolic links")
    parser.add_argument('--abs', help="makes abs files", action="store_true")

    args = parser.parse_args(argv)

    # check if bare repository
    repo = checkbare(args)
    if repo is None:
        print("not a git-annex repisitory")
        return 1  # nothing to unpack: stop instead of failing on the tuple unpacking
    isbare, gitdir = repo
    print("isbare?", isbare, gitdir)

    workdir = None if isbare else args.annex

    # Every git subprocess started below inherits this.
    os.environ["GIT_DIR"] = gitdir
    print("list annexes\n", annexgetremotes(useshow=False))

    ot = tarfile.open(args.tar, "w") if args.tar else None
    try:
        if args.all:
            args.path = [x[1] for x in gitgetpathinfo(DEFAULT_BRANCH, ".", recurse=True)]
        for p in args.path:
            _process_path(p, args, isbare, gitdir, workdir, ot)
    finally:
        if ot is not None:
            ot.close()  # writes the end-of-archive blocks
    return 0


if __name__ == '__main__':
    sys.exit(main())