Merge branch 'master' of ssh://git-annex.branchable.com

This commit is contained in:
Joey Hess 2022-12-05 13:48:40 -04:00
commit 6eb0943d95
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
9 changed files with 386 additions and 0 deletions

View file

@ -0,0 +1,164 @@
This is a patch that seems to work for my personal use.
BLAKE3 does support variable lengths, but my code does not implement support for anything other than 256-bit (32-byte) digests.
I'm not familiar enough with the codebase to be sure whether adding variable length support later is a backwards compatibility hazard or not.
[[!format patch """
From efa115d94d1a5a52574d5760c6e951ed3c518667 Mon Sep 17 00:00:00 2001
From: edef <edef@edef.eu>
Date: Fri, 2 Dec 2022 12:16:44 +0000
Subject: [PATCH] support BLAKE3
This uses the blake3 package from Hackage, since cryptonite does not
have BLAKE3 support yet.
diff --git a/Backend/Hash.hs b/Backend/Hash.hs
index 550d8fc6c..809a82599 100644
--- a/Backend/Hash.hs
+++ b/Backend/Hash.hs
@@ -27,8 +27,11 @@ import qualified Data.ByteString as S
import qualified Data.ByteString.Short as S (toShort, fromShort)
import qualified Data.ByteString.Char8 as S8
import qualified Data.ByteString.Lazy as L
+import Data.IORef
+import Control.Arrow
import Control.DeepSeq
import Control.Exception (evaluate)
+import qualified BLAKE3
data Hash
= MD5Hash
@@ -40,6 +43,7 @@ data Hash
| Blake2bpHash HashSize
| Blake2sHash HashSize
| Blake2spHash HashSize
+ | Blake3Hash
cryptographicallySecure :: Hash -> Bool
cryptographicallySecure (SHA2Hash _) = True
@@ -49,6 +53,7 @@ cryptographicallySecure (Blake2bHash _) = True
cryptographicallySecure (Blake2bpHash _) = True
cryptographicallySecure (Blake2sHash _) = True
cryptographicallySecure (Blake2spHash _) = True
+cryptographicallySecure Blake3Hash = True
cryptographicallySecure SHA1Hash = False
cryptographicallySecure MD5Hash = False
@@ -63,6 +68,7 @@ hashes = concat
, map (Blake2bpHash . HashSize) [512]
, map (Blake2sHash . HashSize) [256, 160, 224]
, map (Blake2spHash . HashSize) [256, 224]
+ , [Blake3Hash]
, [SHA1Hash]
, [MD5Hash]
]
@@ -99,6 +105,7 @@ hashKeyVariety (Blake2bHash size) he = Blake2bKey size he
hashKeyVariety (Blake2bpHash size) he = Blake2bpKey size he
hashKeyVariety (Blake2sHash size) he = Blake2sKey size he
hashKeyVariety (Blake2spHash size) he = Blake2spKey size he
+hashKeyVariety Blake3Hash he = Blake3Key he
{- A key is a hash of its contents. -}
keyValue :: Hash -> KeySource -> MeterUpdate -> Annex Key
@@ -219,6 +226,7 @@ hasher (Blake2bHash hashsize) = blake2bHasher hashsize
hasher (Blake2bpHash hashsize) = blake2bpHasher hashsize
hasher (Blake2sHash hashsize) = blake2sHasher hashsize
hasher (Blake2spHash hashsize) = blake2spHasher hashsize
+hasher Blake3Hash = blake3Hasher
mkHasher :: HashAlgorithm h => (L.ByteString -> Digest h) -> Context h -> Hasher
mkHasher h c = (show . h, mkIncrementalVerifier c descChecksum . sameCheckSum)
@@ -272,6 +280,27 @@ blake2spHasher (HashSize hashsize)
| hashsize == 224 = mkHasher blake2sp_224 blake2sp_224_context
| otherwise = error $ "unsupported BLAKE2SP size " ++ show hashsize
+blake3Hasher :: Hasher
+blake3Hasher = (hash, incremental) where
+ finalize :: BLAKE3.Hasher -> BLAKE3.Digest BLAKE3.DEFAULT_DIGEST_LEN
+ finalize = BLAKE3.finalize
+
+ hash :: L.ByteString -> String
+ hash = show . finalize . L.foldlChunks ((. pure) . BLAKE3.update) BLAKE3.hasher
+
+ incremental :: Key -> IO IncrementalVerifier
+ incremental k = do
+ v <- newIORef (Just (BLAKE3.hasher, 0))
+ return $ IncrementalVerifier
+ { updateIncrementalVerifier = \b ->
+ modifyIORef' v . fmap $ flip BLAKE3.update [b] *** (fromIntegral (S.length b) +)
+ , finalizeIncrementalVerifier =
+ fmap (sameCheckSum k . show . finalize . fst) <$> readIORef v
+ , unableIncrementalVerifier = writeIORef v Nothing
+ , positionIncrementalVerifier = fmap snd <$> readIORef v
+ , descIncrementalVerifier = descChecksum
+ }
+
sha1Hasher :: Hasher
sha1Hasher = mkHasher sha1 sha1_context
diff --git a/Types/Key.hs b/Types/Key.hs
index 271723982..ea71f85ed 100644
--- a/Types/Key.hs
+++ b/Types/Key.hs
@@ -214,6 +214,7 @@ data KeyVariety
| Blake2bpKey HashSize HasExt
| Blake2sKey HashSize HasExt
| Blake2spKey HashSize HasExt
+ | Blake3Key HasExt
| SHA1Key HasExt
| MD5Key HasExt
| WORMKey
@@ -247,6 +248,7 @@ hasExt (Blake2bKey _ (HasExt b)) = b
hasExt (Blake2bpKey _ (HasExt b)) = b
hasExt (Blake2sKey _ (HasExt b)) = b
hasExt (Blake2spKey _ (HasExt b)) = b
+hasExt (Blake3Key (HasExt b)) = b
hasExt (SHA1Key (HasExt b)) = b
hasExt (MD5Key (HasExt b)) = b
hasExt WORMKey = False
@@ -262,6 +264,7 @@ sameExceptExt (Blake2bKey sz1 _) (Blake2bKey sz2 _) = sz1 == sz2
sameExceptExt (Blake2bpKey sz1 _) (Blake2bpKey sz2 _) = sz1 == sz2
sameExceptExt (Blake2sKey sz1 _) (Blake2sKey sz2 _) = sz1 == sz2
sameExceptExt (Blake2spKey sz1 _) (Blake2spKey sz2 _) = sz1 == sz2
+sameExceptExt (Blake3Key _) (Blake3Key _) = True
sameExceptExt (SHA1Key _) (SHA1Key _) = True
sameExceptExt (MD5Key _) (MD5Key _) = True
sameExceptExt _ _ = False
@@ -275,6 +278,7 @@ formatKeyVariety v = case v of
Blake2bpKey sz e -> adde e (addsz sz "BLAKE2BP")
Blake2sKey sz e -> adde e (addsz sz "BLAKE2S")
Blake2spKey sz e -> adde e (addsz sz "BLAKE2SP")
+ Blake3Key e -> adde e "BLAKE3"
SHA1Key e -> adde e "SHA1"
MD5Key e -> adde e "MD5"
WORMKey -> "WORM"
@@ -337,6 +341,8 @@ parseKeyVariety "BLAKE2SP224" = Blake2spKey (HashSize 224) (HasExt False)
parseKeyVariety "BLAKE2SP224E" = Blake2spKey (HashSize 224) (HasExt True)
parseKeyVariety "BLAKE2SP256" = Blake2spKey (HashSize 256) (HasExt False)
parseKeyVariety "BLAKE2SP256E" = Blake2spKey (HashSize 256) (HasExt True)
+parseKeyVariety "BLAKE3" = Blake3Key (HasExt False)
+parseKeyVariety "BLAKE3E" = Blake3Key (HasExt True)
parseKeyVariety "SHA1" = SHA1Key (HasExt False)
parseKeyVariety "SHA1E" = SHA1Key (HasExt True)
parseKeyVariety "MD5" = MD5Key (HasExt False)
diff --git a/git-annex.cabal b/git-annex.cabal
index cd58a4ca3..7c251e33b 100644
--- a/git-annex.cabal
+++ b/git-annex.cabal
@@ -362,6 +362,7 @@ Executable git-annex
securemem,
crypto-api,
cryptonite (>= 0.23),
+ blake3,
memory,
deepseq,
split,
diff --git a/stack.yaml b/stack.yaml
index 7dbfb657a..936ee841b 100644
--- a/stack.yaml
+++ b/stack.yaml
@@ -25,3 +25,4 @@ extra-deps:
- base64-bytestring-1.0.0.3
- bencode-0.6.1.1
- http-client-0.7.9
+- blake3-0.2@sha256:d1146b9a51ccfbb0532780778b6d016a614e3d44c05d8c1923dde9a8be869045,2448
"""]]

View file

@ -0,0 +1,52 @@
### Please describe the problem.
I ran git annex fsck --json. I have some files that are lacking numcopies. I get output in stderr, but no error details in the json body.
For example:
{"command":"fsck","success":false,"key":"SHA256E-s165540--ddcf7ce58593667e1b836e2a7f28a9f5227f3d9ba46cf8f98c7ab9dd26ef1896.jpg","error-messages":[],"file":"2022/12/04/chandrian_10:06:41.jpg","dead":[],"untrusted":[],"input":["2022/12/04/chandrian_10:06:41.jpg"]}
Only 2 of 5 trustworthy copies exist of 2022/12/04/chandrian_11:05:20.jpg
Back it up with git-annex copy.
It would be great if error-messages contained all the details for failures. Thank you
### What steps will reproduce the problem?
* create a new repo
* annex a file
* set numcopies to 2 or whatever
* git annex fsck --json
### What version of git-annex are you using? On what operating system?
I'm on Fedora 37.
git annex version
git-annex version: 10.20221103
build flags: Assistant Webapp Pairing Inotify DBus DesktopNotify TorrentParser MagicMime Feeds Testsuite S3 WebDAV
dependency versions: aws-0.22 bloomfilter-2.0.1.0 cryptonite-0.29 DAV-1.3.4 feed-1.3.2.0 ghc-8.10.7 http-client-0.6.4.1 persistent-sqlite-2.13.1.0 torrent-10000.1.1 uuid-1.3.15 yesod-1.6.2
key/value backends: SHA256E SHA256 SHA512E SHA512 SHA224E SHA224 SHA384E SHA384 SHA3_256E SHA3_256 SHA3_512E SHA3_512 SHA3_224E SHA3_224 SHA3_384E SHA3_384 SKEIN256E SKEIN256 SKEIN512E SKEIN512 BLAKE2B256E BLAKE2B256 BLAKE2B512E BLAKE2B512 BLAKE2B160E BLAKE2B160 BLAKE2B224E BLAKE2B224 BLAKE2B384E BLAKE2B384 BLAKE2BP512E BLAKE2BP512 BLAKE2S256E BLAKE2S256 BLAKE2S160E BLAKE2S160 BLAKE2S224E BLAKE2S224 BLAKE2SP256E BLAKE2SP256 BLAKE2SP224E BLAKE2SP224 SHA1E SHA1 MD5E MD5 WORM URL X*
remote types: git gcrypt p2p S3 bup directory rsync web bittorrent webdav adb tahoe glacier ddar git-lfs httpalso borg hook external
operating system: linux x86_64
supported repository versions: 8 9 10
upgrade supported from repository versions: 0 1 2 3 4 5 6 7 8 9 10
local repository version: 10
### Please provide any additional information below.
[[!format sh """
# If you can, paste a complete transcript of the problem occurring here.
# If the problem is with the git-annex assistant, paste in .git/annex/daemon.log
# End of transcript or log.
"""]]
### Have you had any luck using git-annex before? (Sometimes we get tired of reading bug reports all day and a lil' positive end note does wonders)
Yes! Git annex is amazing and is managing over 10 TB of data across 5 git annexes and around 10 hard drives. No data loss this entire time -- over 8 years.

View file

@ -0,0 +1,17 @@
[[!comment format=mdwn
username="kanak@3c4f6e7d832d88751c617b25bdbac896417eb93b"
nickname="kanak"
avatar="http://cdn.libravatar.org/avatar/708121dfec06e554300b2a3a73a26818"
subject="comment 1"
date="2022-12-04T17:40:38Z"
content="""
Not just limited to numcopies:
{\"command\":\"fsck\",\"success\":true,\"key\":\"SHA256E-s119046--239da5a85ddf8c4071d8803a864a896d13e2a2fd65fd5684fc2f6dcaf264e875.jpg\",\"error-messages\":[],\"file\":\"12/03/chandrian_22:37:41.jpg\",\"note\":\"checksum...\",\"input\":[\"12/03/chandrian_22:37:41.jpg\"]}
** Based on the location log, 12/03/chandrian_23:05:41.jpg
** was expected to be present, but its content is missing.
"""]]

View file

@ -0,0 +1,10 @@
[[!comment format=mdwn
username="yarikoptic"
avatar="http://cdn.libravatar.org/avatar/f11e9c84cb18d26a1748c33b48c924b4"
subject="comment 3"
date="2022-11-29T22:25:42Z"
content="""
> I have re-ran the command to see if the bug replicates..
note: it took a considerable amount of time (days?) to get there ;) I made a copy `test_fs_testonly.py` where I removed all other \"benchmarks\" prior to running `annex test` -- it might get there faster if it works, so feel free to interrupt and rerun that one. That code is old (circa 2014 ;)), and I should give it a face lift but haven't had a chance yet :-/
"""]]

View file

@ -0,0 +1,26 @@
Hey everyone,
I made a Python script that launches your `$EDITOR` (or `$VISUAL`) to conveniently edit git-annex metadata.
Code is [on Gitlab](https://gitlab.com/nobodyinperson/git-annex-metadata-edit).
## 📥 Installation
```bash
# Installation
pip install git+https://gitlab.com/nobodyinperson/git-annex-metadata-edit
```
## ✨ Features
- Operate on multiple files (recursively)
- overwrite/remove a metadata field
- add/remove specific values from a field in one go
## 📟 📹 Screencast
[![asciicast](https://asciinema.org/a/541576.svg)](https://asciinema.org/a/541576?autoplay=1)
Cheers, 👍
Yann

View file

@ -0,0 +1,8 @@
Hi,
Is there a way to make a "view" that only shows me files that are locally existing (meaning: where the binary is present in the repo I am working on)?
I had a look at documentation and the forum but I did not find anything fitting (I honestly assume that I overlooked it somewhere). To me views, vfilter and such do not appear to meet my needs since they are only working on metadata.
Why do I need this?
I am starting to manage my music collection with git annex. On most devices I only have a fraction of the globally available collection. I would like to see only those symlinks that actually lead to binary files. This way I would not confuse music players with broken symlinks and would know at a glance what I can actually listen to right now.

View file

@ -0,0 +1,8 @@
[[!comment format=mdwn
username="Lukey"
avatar="http://cdn.libravatar.org/avatar/c7c08e2efd29c692cc017c4a4ca3406b"
subject="comment 1"
date="2022-12-01T20:49:35Z"
content="""
`git annex adjust --hide-missing`
"""]]

View file

@ -0,0 +1,8 @@
[[!comment format=mdwn
username="agschaid"
avatar="http://cdn.libravatar.org/avatar/7789d7511c5da25d71021be4ddb7fe18"
subject="comment 2"
date="2022-12-01T21:12:11Z"
content="""
Thank you! That is simply perfect.
"""]]

View file

@ -0,0 +1,93 @@
Thank you for `git-annex`, it's awesome!
I recently figured I could add `git-annex metadata` to my research data files, recording the start and end date of the timeseries data inside each file, so that a quick lookup by date range (“which files contain data in that time range”) is possible.
This is possible when using numeric timestamps (e.g. unix timestamp like `1669981463`) but not with stringy dates (e.g. `2022-11-12T20:10:14+0200`) as `--metadata fieldname>=VALUE` does _numeric_ comparison.
## Proposal: How about `--metadata fieldname>=VALUE` falling back to string comparison when `VALUE` can't be parsed as a number?
## Test case
Consider this script `make-git-annex-dir-with-timestamps.sh`:
```sh
#!/bin/sh
# Test case: build a scratch git-annex repository whose files carry
# time-start/time-end metadata, then query it by time range with
# `git annex find --metadata`.
#
# Usage: make-git-annex-dir-with-timestamps.sh [DATE_FORMAT]
#   DATE_FORMAT  date(1) output format used for the stored timestamps
#                (default: %FT%T%z)
fmt="$1"
test -n "$fmt" || fmt="%FT%T%z"

# make a new git annex repository, discarding any left over from a previous run
d=git-annex-with-times-"$fmt"
if test -d "$d"; then
	# restore write permission first so that rm can delete everything
	# (presumably needed because annexed content is write-protected)
	chmod -R +w "$d"
	rm -rf "$d"
fi
# stop here if the directory is unusable, rather than running the git
# commands below in the caller's current directory
mkdir "$d" && cd "$d" || exit 1
git init
git annex init

# create some files
for i in $(seq 1 9); do
	echo "File $i" > "file$i"
done
git annex add .
git commit -m "Add files"

# add metadata to files: file$i spans from (20 - i) hours ago to (10 - i) hours ago
for i in $(seq 1 9); do
	time_start="$(date -d"$((i - 20)) hours" +"$fmt")"
	(set -x; git annex metadata --set time-start="$time_start" "file$i")
	time_end="$(date -d"$((i - 10)) hours" +"$fmt")"
	(set -x; git annex metadata --set time-end="$time_end" "file$i")
done

# query window: from 16h05m ago until 11h55m ago
timerange_start="$(date -d "-16 hours -5 minutes" +"$fmt")"
timerange_end="$(date -d "-12 hours +5 minutes" +"$fmt")"

# find files whose start OR end timestamp lies inside the window;
# set -x echoes the exact command line that is being run
(
	set -x
	git annex find \
		"-(" --metadata "time-start>=$timerange_start" --and --metadata "time-start<=$timerange_end" "-)" \
		--or \
		"-(" --metadata "time-end>=$timerange_start" --and --metadata "time-end<=$timerange_end" "-)"
)
echo "⬆⬆⬆ This should only output file4 through file8 ⬆⬆⬆"
```
Invoked with the unix timestamp format (`%s`), it works as expected:
```sh
> ./make-git-annex-dir-with-timestamps.sh '%s'
# ...
+ git annex find '-(' --metadata 'time-start>=1669923315' --and --metadata 'time-start<=1669938315' '-)' --or '-(' --metadata 'time-end>=1669923315' --and --metadata 'time-end<=1669938315' '-)'
file4
file5
file6
file7
file8
⬆⬆⬆ This should only output file4 through file8 ⬆⬆⬆
```
However, other stringy date formats match all files:
```bash
# typical ISO-ish time format
> ./make-git-annex-dir-with-timestamps.sh "%FT%T%z"
# ...
+ git annex find '-(' --metadata 'time-start>=2022-12-01T20:49:37+0100' --and --metadata 'time-start<=2022-12-02T00:59:37+0100' '-)' --or '-(' --metadata 'time-end>=2022-12-01T20:49:37+0100' --and --metadata 'time-end<=2022-12-02T00:59:37+0100' '-)'
file1
file2
file3
file4
file5
file6
file7
file8
file9
⬆⬆⬆ This should only output file4 through file8 ⬆⬆⬆
```
```sh
# git-annex's own time format for 'FIELDNAME-lastchanged'
> ./make-git-annex-dir-with-timestamps.sh "%Y-%m-%d@%H-%M-%S"
# ...
+ git annex find '-(' --metadata 'time-start>=2022-12-01@20-38-04' --and --metadata 'time-start<=2022-12-02@00-38-04' '-)' --or '-(' --metadata 'time-end>=2022-12-01@20-38-04' --and --metadata 'time-end<=2022-12-02@00-38-04' '-)'
file1
file2
file3
file4
file5
file6
file7
file8
file9
⬆⬆⬆ This should only output file4 through file8 ⬆⬆⬆
```
Yann / @nobodyinperson