From b96ff828718ae10b7b28c1b807c8d7027c3ef669 Mon Sep 17 00:00:00 2001 From: yarikoptic Date: Tue, 11 Jun 2024 17:36:51 +0000 Subject: [PATCH 1/9] Added a comment --- ..._5dfa78ee6436020596f4b2efe678f05b._comment | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 doc/bugs/assistant___40__webapp__41___commited_unlocked_link_to_annex/comment_2_5dfa78ee6436020596f4b2efe678f05b._comment diff --git a/doc/bugs/assistant___40__webapp__41___commited_unlocked_link_to_annex/comment_2_5dfa78ee6436020596f4b2efe678f05b._comment b/doc/bugs/assistant___40__webapp__41___commited_unlocked_link_to_annex/comment_2_5dfa78ee6436020596f4b2efe678f05b._comment new file mode 100644 index 0000000000..1801ec2d2a --- /dev/null +++ b/doc/bugs/assistant___40__webapp__41___commited_unlocked_link_to_annex/comment_2_5dfa78ee6436020596f4b2efe678f05b._comment @@ -0,0 +1,88 @@ +[[!comment format=mdwn + username="yarikoptic" + avatar="http://cdn.libravatar.org/avatar/f11e9c84cb18d26a1748c33b48c924b4" + subject="comment 2" + date="2024-06-11T17:36:51Z" + content=""" +interestingly on the client `git restore --staged PATH` managed to recover the link to become \"proper\". And `git-annex restage` did nothing to fix situation with `Modified` file: + +``` +[bids@rolando VIDS] > git merge --ff-only synced/master +Updating b4f3af57..263dad67 +Updating files: 100% (871/871), done. +Fast-forward + .gitattributes | 1 + + .gitignore +... +create mode 100644 logs/2024-05-24T07:35-04:00.log + create mode 100644 logs/2024-05-24T07:35-04:00.logpwd + + + +git-annex: git status will show Videos/2024/03/2024.03.17.14.09.12.550_2024.03.17.14.09.18.818.mkv.log to be modified, since content availability has changed and git-annex was unable to update the index. This is only a cosmetic problem affecting git status; git add, git commit, etc won't be affected. 
To fix the git status display, you can run: git-annex restage +[bids@rolando VIDS] > +[bids@rolando VIDS] > +[bids@rolando VIDS] > +[bids@rolando VIDS] > git-annex restage +restage ok +[bids@rolando VIDS] > git status +On branch master +Changes to be committed: + (use \"git restore --staged ...\" to unstage) + modified: Videos/2024/03/2024.03.17.14.09.12.550_2024.03.17.14.09.18.818.mkv.log + +[bids@rolando VIDS] > git-annex restage +restage ok +[bids@rolando VIDS] > git status +On branch master +Changes to be committed: + (use \"git restore --staged ...\" to unstage) + modified: Videos/2024/03/2024.03.17.14.09.12.550_2024.03.17.14.09.18.818.mkv.log + +[bids@rolando VIDS] > git-annex restage Videos/2024/03/2024.03.17.14.09.12.550_2024.03.17.14.09.18.818.mkv.log +git-annex: This command takes no parameters. +[bids@rolando VIDS] > git status +On branch master +Changes to be committed: + (use \"git restore --staged ...\" to unstage) + modified: Videos/2024/03/2024.03.17.14.09.12.550_2024.03.17.14.09.18.818.mkv.log + +[bids@rolando VIDS] > git restore --staged Videos/2024/03/2024.03.17.14.09.12.550_2024.03.17.14.09.18.818.mkv.log +[bids@rolando VIDS] > git status +On branch master +Changes not staged for commit: + (use \"git add ...\" to update what will be committed) + (use \"git restore ...\" to discard changes in working directory) + modified: Videos/2024/03/2024.03.17.14.09.12.550_2024.03.17.14.09.18.818.mkv.log + +no changes added to commit (use \"git add\" and/or \"git commit -a\") +[bids@rolando VIDS] > git diff +diff --git a/Videos/2024/03/2024.03.17.14.09.12.550_2024.03.17.14.09.18.818.mkv.log b/Videos/2024/03/2024.03.17.14.09.12.550_2024.03.17.14.09.18.818.mkv.log +index 92b79020..fc930f54 100644 +--- a/Videos/2024/03/2024.03.17.14.09.12.550_2024.03.17.14.09.18.818.mkv.log ++++ b/Videos/2024/03/2024.03.17.14.09.12.550_2024.03.17.14.09.18.818.mkv.log +@@ -1 +1 @@ +-/annex/objects/MD5E-s69--08983cc11522233e5d4815e4ef62275a.mkv.log 
++/annex/objects/MD5E-s68799--29541299bea3691f430d855d2fb432fb.mkv.log +diff --git a/Videos/2024/04/2024.04.04.06.01.22.647_.mkv.log b/Videos/2024/04/2024.04.04.06.01.22.647_.mkv.log +--- a/Videos/2024/04/2024.04.04.06.01.22.647_.mkv.log ++++ b/Videos/2024/04/2024.04.04.06.01.22.647_.mkv.log +@@ -1 +0,0 @@ +-/annex/objects/MD5E-s0--d41d8cd98f00b204e9800998ecf8427e.mkv.log +[bids@rolando VIDS] > git log Videos/2024/03/2024.03.17.14.09.12.550_2024.03.17.14.09.18.818.mkv.log +commit ef5549f74dfea19c11bf963a7ec9789bce0d925d +Author: ReproStim User +Date: Wed Apr 17 09:38:23 2024 -0400 + + Move files under subfolders + +``` + + +``` +[bids@rolando VIDS] > git --version +git version 2.39.2 +[bids@rolando VIDS] > git annex version --raw +10.20231129+git83-g86dbe9a825-1~ndall+1 +``` +"""]] From c6f2a5d372ab80d4c0f25f6722f9dc6c89515901 Mon Sep 17 00:00:00 2001 From: yarikoptic Date: Wed, 12 Jun 2024 13:20:29 +0000 Subject: [PATCH 2/9] TODO for log --key --- doc/todo/add_--key_to___34__annex_log__34__.mdwn | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 doc/todo/add_--key_to___34__annex_log__34__.mdwn diff --git a/doc/todo/add_--key_to___34__annex_log__34__.mdwn b/doc/todo/add_--key_to___34__annex_log__34__.mdwn new file mode 100644 index 0000000000..fdeb6062b6 --- /dev/null +++ b/doc/todo/add_--key_to___34__annex_log__34__.mdwn @@ -0,0 +1,15 @@ +``` +NAME + git-annex-log - shows location log information + +SYNOPSIS + git annex log [path ...] + +``` + +although quite often desired to check by the key which might not even be in the tree. `whereis` ( a sister command for similar investigations ) has `--key`, so I thought it would be great to get it here too. + +In my case -- doing archaeology on AFNI's test data in [https://github.com/afni/afni/pull/656](https://github.com/afni/afni/pull/656). 
+ +[[!meta author=yoh]] +[[!tag projects/repronim]] From c855b50f042377ff4eb3286d69c82a2fedeff495 Mon Sep 17 00:00:00 2001 From: "m.risse@77eac2c22d673d5f10305c0bade738ad74055f92" Date: Wed, 12 Jun 2024 15:42:42 +0000 Subject: [PATCH 3/9] --- ...erification_failure_on_first_download.mdwn | 93 +++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 doc/bugs/VURL_verification_failure_on_first_download.mdwn diff --git a/doc/bugs/VURL_verification_failure_on_first_download.mdwn b/doc/bugs/VURL_verification_failure_on_first_download.mdwn new file mode 100644 index 0000000000..7cadb61389 --- /dev/null +++ b/doc/bugs/VURL_verification_failure_on_first_download.mdwn @@ -0,0 +1,93 @@ +### Please describe the problem. + +With an external special remote that handles a custom URL scheme, I receive a "Verification of content failed" on the first `git annex get` of a file (i.e. when git-annex cannot know a checksum for the file, yet). + +Sorry that this is hidden in a bit of indirection in a datalad extension, what it does is effectively just implement an external special remote that handles `cds:` URLs and then `git annex addurl --fast --verifiable` those URLs. I get the same verification error even with `--relaxed` instead of `--fast` (though I would like to have the semantics of `--fast`, i.e. record checksum on first download and then always check against that). + +### What steps will reproduce the problem? + +Install datalad, and datalad-cds from this PR: . Then: +[[!format sh """ +datalad create test-ds +cd test-ds/ +datalad download-cds --lazy --path download.grib '{ + "dataset": "reanalysis-era5-pressure-levels", + "sub-selection": { + "variable": "temperature", + "pressure_level": "1000", + "product_type": "reanalysis", + "date": "2017-12-01/2017-12-31", + "time": "12:00", + "format": "grib" + } +}' +git annex get download.grib +"""]] + + +### What version of git-annex are you using? On what operating system? 
+ +``` +git-annex version: 10.20240430 +build flags: Assistant Webapp Pairing Inotify DBus DesktopNotify TorrentParser MagicMime Feeds Testsuite S3 WebDAV +dependency versions: aws-0.24.1 bloomfilter-2.0.1.2 crypton-0.34 DAV-1.3.4 feed-1.3.2.1 ghc-9.6.5 http-client-0.7.17 persistent-sqlite-2.13.3.0 torrent-10000.1.3 uuid-1.3.15 yesod-1.6.2.1 +key/value backends: SHA256E SHA256 SHA512E SHA512 SHA224E SHA224 SHA384E SHA384 SHA3_256E SHA3_256 SHA3_512E SHA3_512 SHA3_224E SHA3_224 SHA3_384E SHA3_384 SKEIN256E SKEIN256 SKEIN512E SKEIN512 BLAKE2B256E BLAKE2B256 BLAKE2B512E BLAKE2B512 BLAKE2B160E BLAKE2B160 BLAKE2B224E BLAKE2B224 BLAKE2B384E BLAKE2B384 BLAKE2BP512E BLAKE2BP512 BLAKE2S256E BLAKE2S256 BLAKE2S160E BLAKE2S160 BLAKE2S224E BLAKE2S224 BLAKE2SP256E BLAKE2SP256 BLAKE2SP224E BLAKE2SP224 SHA1E SHA1 MD5E MD5 WORM URL VURL X* +remote types: git gcrypt p2p S3 bup directory rsync web bittorrent webdav adb tahoe glacier ddar git-lfs httpalso borg rclone hook external +operating system: linux x86_64 +supported repository versions: 8 9 10 +upgrade supported from repository versions: 0 1 2 3 4 5 6 7 8 9 10 +``` + +on Ubuntu, installed from a recent version of nixpkgs. Also happens in CI (see PR in datalad-cds) where git-annex is installed from NeuroDebian. + + +### Please provide any additional information below. + +[[!format sh """ +# If you can, paste a complete transcript of the problem occurring here. +# If the problem is with the git-annex assistant, paste in .git/annex/daemon.log + +$ datalad create test-ds +create(ok): <...> (dataset) +$ cd test-ds/ +$ datalad download-cds --lazy --path download.grib '{ + "dataset": "reanalysis-era5-pressure-levels", + "sub-selection": { + "variable": "temperature", + "pressure_level": "1000", + "product_type": "reanalysis", + "date": "2017-12-01/2017-12-31", + "time": "12:00", + "format": "grib" + } +}' +save(ok): . 
(dataset) +cds(ok): <...> (dataset) +$ git annex info download.grib +file: download.grib +size: 0 bytes (+ 1 unknown size) +key: VURL--cds:v1-eyJkYXRhc2V0IjoicmVhbmFs-77566133ebfe9220aefbeed5a58b6972 +present: false +$ git annex get download.grib +get download.grib (from cds...) + + CDS request is submitted + + CDS request is completed + + Starting download from CDS +(checksum...) + Verification of content failed + + Unable to access these remotes: cds + + No other repository is known to contain the file. +failed +get: 1 failed + +# End of transcript or log. +"""]] + +### Have you had any luck using git-annex before? (Sometimes we get tired of reading bug reports all day and a lil' positive end note does wonders) + + From 22a329c57e721d80a4461d6e5488d6e0af22d1eb Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 13 Jun 2024 06:43:59 -0400 Subject: [PATCH 4/9] copied over some changes from proxy branch --- doc/design/passthrough_proxy.mdwn | 209 ++++++++++++++++++++++++------ doc/todo/git-annex_proxies.mdwn | 63 ++++----- 2 files changed, 198 insertions(+), 74 deletions(-) diff --git a/doc/design/passthrough_proxy.mdwn b/doc/design/passthrough_proxy.mdwn index 9742cb3686..4b86037471 100644 --- a/doc/design/passthrough_proxy.mdwn +++ b/doc/design/passthrough_proxy.mdwn @@ -15,7 +15,7 @@ existing remotes to keep up with changes are made on the server side. A proxy would avoid this complexity. It also allows limiting network ingress to a single point. -Ideally a proxy would look like any other git-annex remote. All the files +A proxy can be the frontend to a cluster. All the files stored anywhere in the cluster would be available to retrieve from the proxy. When a file is sent to the proxy, it would store it somewhere in the cluster. @@ -108,55 +108,169 @@ The only real difference seems to be that the UUID of a remote is cached, so A could only do this the first time we accessed it, and not later. With UUID discovery, A can do that at any time. 
-## user interface +## proxied remote names What to name the instantiated remotes? Probably the best that could be done is to use the proxy's own remote names as suffixes on the client. Eg, the proxy's "node1" remote is "proxy-node1". -But the user probably doesn't want to pick which node to send content to. -They don't necessarily know anything about the nodes. Ideally the user -would `git-annex copy --to proxy` or `git-annex push` and let it pick -which instantiated remote(s) to send to. +But, the user might have their own "proxy-node1" remote configured that +points to something else. To avoid a proxy changing the configuration of +the user's remote to point to its remote, git-annex must avoid +instantiating a proxied remote when there's already a configuration for a +remote with that same name. -To make `git-annex copy --to proxy` work, `storeKey` could be changed to -allow returning a UUID (or UUIDs) where the content was actually stored. -That would also allow a single upload to the proxy to fan out and be stored -in multiple nodes. The proxy would use preferred content to pick which of -its nodes to store on. +That does mean that, if a user wants to set a git config for a proxy +remote, they will need to manually set its annex-uuid and its url. +Which is awkward. Many git configs of the proxy remote can be inherited by +the instantiated remotes, so users won't often need to do that. -Instantiated remotes would still be needed for `git-annex get` and similar -to work. +A user can also set up a remote with another name that they +prefer, that points at a remote behind a proxy. They just need to set +its annex-uuid and its url. Perhaps there should be a git-annex command +that eases setting up a remote like that? -To make `git-annex copy --from proxy` work, the proxy would need to pick -a node and stream content from it. That's doable, but how to handle a case -where a node gets corrupted? 
The best it could do is mark that node as no -longer containing the content (as if a fsck failed) and try another one -next time. This complication might not be necessary. Consider that -while `git-annex copy --to foo` followed later by `git-annex copy --from foo` -will usually work, it doesn't work when eg first copying to a transfer -remote, which then sends the content elsewhere and drops its copy. +## proxied remotes in git remote list -What about dropping? `git-annex drop --from proxy` could be made to work, -by having `removeKey` return a list of UUIDs that the content was dropped -from. What should that do if it's able to drop from some nodes but not -others? Perhaps it would need to be able to return a list of UUIDs that -content was dropped from but still indicate it overall failed to drop. -(Note that it's entirely possible that dropping from one node of the proxy -involves lockContent on another node of the proxy in order to satisfy -numcopies.) +Should instantiated remotes have enough configured in git so that +`git remote list` will list them? This would make things like tab +completion of proxied remotes work, and would generally let the user +discover that there *are* proxied remotes. + +This could be done by a config like remote.name.annex-proxied = true. +That makes other configs of the remote not prevent it being used as an +instantiated remote. So remote.name.annex-uuid can be changed when +the uuid behind a proxy changes. And it allows updating remote.name.url +to keep it the same as the proxy remote's url. (Or possibly to set it to +something else?) + +Configuring the instantiated remotes like that would let anyone who can +write to the git-annex branch flood other people's repos with configs +for any number of git remotes. Which might be obnoxious. + +## single upload with fanout + +If we want to send a file to multiple repositories that are behind the same +proxy, it would be wasteful to upload it through the proxy repeatedly. 
+ +Perhaps a good user interface to this is `git-annex copy --to proxy`. +The proxy could fan out the upload and store it in one or more nodes behind +it. Using preferred content to select which nodes to use. +This would need `storeKey` to be changed to allow returning a UUID (or UUIDs) +where the content was actually stored. + +Alternatively, `git-annex copy --to proxy-foo` could notice that proxy-bar +also wants the content, and fan out a copy to there. Then it could +record in its git-annex branch that the content is present in proxy-bar. +If the user later does `git-annex copy --to proxy-bar`, it would avoid +another upload (and the user would learn at that point that it was in +proxy-bar). This avoids needing to change the `storeKey` interface. + +Should a proxy always fanout? if `git-annex copy --to proxy` is what does +fanout, and `git-annex copy --to proxy-foo` doesn't, then the user has +content. But if the latter does fanout, that might be annoying to users who +want to use proxies, but want full control over what lands where, and don't +want to use preferred content to do it. So probably fanout should be +configurable. But it can't be configured client side, because the fanout +happens on the proxy. Seems like remote.name.annex-fanout could be set to +false to prevent fanout to a specific remote. (This is analogous to a +remote having `git-annex assistant` running on it, it might fan out uploads +to it to other repos, and only the owner of that repo can control it.) A command like `git-annex push` would see all the instantiated remotes and -would pick one to send content to. Seems like the proxy might choose to -`storeKey` the content on other node(s) than the requested one. Which would -be fine. But, `git-annex push` would still do considerable extra work in -iterating over all the instantiated remotes. So it might be better to make -such commands not operate on instantiated remotes for sending content but -only on the proxy. 
+would pick ones to send content to. If the proxy does fanout, this would +lead to `git-annex push` doing extra work iterating over instantiated +remotes that have already received content via fanout. Could this extra +work be avoided? -Commands like `git-annex push` and `git-annex pull` -should also skip the instantiated remotes when pushing or pulling the git -repo, because that would be extra work that accomplishes nothing. +## clusters + +One way to use a proxy is just as a convenient way to access a group of +remotes that are behind it. Some remotes may only be reachable by the +proxy, but you still know what the individual remotes are. Eg, one might be +a S3 bucket that can only be written via the proxy, but is globally +readable without going through the proxy. Another might be a drive that is +sometimes located behind the proxy, but other times connected directly. +Using a proxy this way just involves using the instantiated proxied remotes. + +Or a proxy can be the frontend for a cluster. In this situation, the user +doesn't know anything much about the nodes in the cluster, perhaps not even +that they exist, or perhaps what keys are stored on which nodes. + +In the cluster case, the user would like to not need to pick a specific +node to send content to. While they could use preferred content to pick a +node, or nodes, they would prefer to be able to say `git-annex copy --to cluster` +and let it pick which nodes to send to. And similarly, +`git-annex drop --from cluster` should drop the content from every node in +the cluster. + +For this we need a UUID for the cluster. But it is not like a usual UUID. +It does not need to actually be recorded in the location tracking logs, and +it is not counted as a copy for numcopies purposes. The only point of this +UUID is to make commands like `git-annex drop --from cluster` and +`git-annex get --from cluster` talk to the cluster's frontend proxy, which +has as its UUID the cluster's UUID. 
+ +The cluster UUID is recorded in the git-annex branch, along with a list of +the UUIDs of nodes of the cluster (which can change at any time). + +When reading a location log, if any UUID where content is present is part +of the cluster, the cluster's UUID is added to the list of UUIDs. + +When writing a location log, the cluster's UUID is filtered out of the list +of UUIDs. + +The cluster's frontend proxy fans out uploads to nodes according to +preferred content. And `storeKey` is extended to be able to return a list +of additional UUIDs where the content was stored. So an upload to the +cluster will end up writing to the location log the actual nodes that it +was fanned out to. + +Note that to support clusters that are nodes of clusters, when a cluster's +frontend proxy fans out an upload to a node, and `storeKey` returns +additional UUIDs, it should pass those UUIDs along. Of course, no cluster +can be a node of itself, and cycles have to be broken (as described in a +section below). + +When a file is requested from the cluster's frontend proxy, it can send its +own local copy if it has one, but otherwise it will proxy to one of its +nodes. (How to pick which node to use? Load balancing?) This behavior will +need to be added to git-annex-shell, and to Remote.Git for local paths to a +cluster. + +The cluster's frontend proxy also fans out drops to all nodes, attempting +to drop content from the whole cluster, and only indicating success if it +can. Also needs changes to git-annex-shell and Remote.Git. + +It does not fan out lockcontent, instead the client will lock content +on specific nodes. In fact, the cluster UUID should probably be omitted +when constructing a drop proof, since trying to lockcontent on it will +usually fail. 
+ +Some commands like `git-annex whereis` will list content as being stored in +the cluster, as well as on whichever of its nodes, and whereis currently +says "n copies", but since the cluster doesn't count as a copy, that +display should probably be counted using the numcopies logic that excludes +cluster UUIDs. + +No other protocol extensions or special cases should be needed. Except for +the strange case of content stored in the cluster's frontend proxy. + +Running `git-annex fsck --fast` on the cluster's frontend proxy will look +weird: For each file, it will read the location log, and if the file is +present on any node it will add the frontend proxy's UUID. So fsck will +expect the content to be present. But it probably won't be. So it will fix +the location log... which will make no changes since the proxy's UUID will +be filtered out on write. So probably fsck will need a special case to +avoid this behavior. (Also for `git-annex fsck --from cluster --fast`) + +And if a key does get stored on the cluster's frontend proxy, it will not +be possible to tell from looking at the location log that the content is +really present there. So that won't be counted as a copy. In some cases, +a cluster's frontend proxy may want to keep files, perhaps some files are +worth caching there for speed. But if a file is stored only on the +cluster's frontend proxy and not in any of its nodes, clients will not +consider the cluster to contain the file at all. ## speed @@ -246,6 +360,23 @@ in front of the proxy. ## cycles +A repo can advertise that it proxies for a repo which has the same uuid as +itself. Or there can be a larger cycle involving a proxy that proxies to a +proxy, etc. + +Since the proxied repo uuid is communicated to git-annex-shell via +--uuid, a repo that advertises proxying for itself will be connected to +with its own uuid. No proxying is done in this case. Same happens with a +larger cycle. + +Instantiating remotes needs to identify cycles and break them. 
Otherwise +it would construct an infinite number of proxied remotes with names +like "foo-foo-foo-foo-..." or "foo-bar-foo-bar-..." + +Once `git-annex copy --to proxy` is implemented, and the proxy decides +where to send content that is being sent directly to it, cycles will +become an issue with that as well. + What if repo A is a proxy and has repo B as a remote. Meanwhile, repo B is a proxy and has repo A as a remote? @@ -259,7 +390,7 @@ remote that is not part of a cycle, they could deposit the upload there and the upload still succeed. Otherwise the upload would fail, which is probably the best that can be done with such a broken configuration. -So, it seems like proxies will need to take transfer locks for uploads, +So, it seems like proxies would need to take transfer locks for uploads, even though the content is being proxied to elsewhere. Dropping could have similar cycles with content presence locking, which diff --git a/doc/todo/git-annex_proxies.mdwn b/doc/todo/git-annex_proxies.mdwn index 69257fcb9e..b63fc865ae 100644 --- a/doc/todo/git-annex_proxies.mdwn +++ b/doc/todo/git-annex_proxies.mdwn @@ -26,52 +26,45 @@ In development on the `proxy` branch. For June's work on [[design/passthrough_proxy]], implementation plan: -1. UUID discovery via git-annex branch. Add a log file listing UUIDs - accessible via proxy UUIDs. It also will contain the names - of the remotes that the proxy is a proxy for, - from the perspective of the proxy. (done) +* UUID discovery via git-annex branch. Add a log file listing UUIDs + accessible via proxy UUIDs. It also will contain the names + of the remotes that the proxy is a proxy for, + from the perspective of the proxy. (done) -1. Add `git-annex updateproxy` command and remote.name.annex-proxy - configuration. (done) +* Add `git-annex updateproxy` command and remote.name.annex-proxy + configuration. (done) -2. Remote instantiation for proxies. (done) +* Remote instantiation for proxies. (done) -2. 
Bug: In a repo cloned with ssh from a proxy repo, - running `git-annex init` sets annex-uuid for the instantiated remotes. - This prevents them being used, because instanatiation is not done - when there's any config set for a remote. +* Implement git-annex-shell proxying to git remotes. (done) -3. Implement proxying in git-annex-shell. - (Partly done, still need it for GET, PUT, CONNECT, and NOTIFYCHANGES - messages.) +* Proxy should update location tracking information for proxied remotes, + so it is available to other users who sync with it. (done) -4. Either implement proxying for local path remotes, or prevent - listProxied from operating on them. +* Consider getting instantiated remotes into git remote list. + See design. -4. Either implement proxying for tor-annex remotes, or prevent - listProxied from operating on them. +* Implement single upload with fanout to proxied remotes. -4. Let `storeKey` return a list of UUIDs where content was stored, - and make proxies accept uploads directed at them, rather than a specific - instantiated remote, and fan out the upload to whatever nodes behind - the proxy want it. This will need P2P protocol extensions. +* Implement clusters. -5. Make `git-annex copy --from $proxy` pick a node that contains each - file, and use the instantiated remote for getting the file. Same for - similar commands. +* Support proxies-of-proxies better, eg foo-bar-baz. + Currently, it does work, but have to run `git-annex updateproxy` + on foo in order for it to notice the bar-baz proxied remote exists, + and record it as foo-bar-baz. Make it skip recording proxies of + proxies like that, and instead automatically generate those from the log. + (With cycle prevention there of course.) -6. Make `git-annex drop --from $proxy` drop, when possible, from every - remote accessible by the proxy. Communicate partial drops somehow. +* Cycle prevention. See design. -7. 
Make commands like `git-annex push` not iterate over instantiate - remotes, and instead just send content to the proxy for fanout. +* Optimise proxy speed. See design for ideas. -8. Optimise proxy speed. See design for idea. +* Use `sendfile()` to avoid data copying overhead when + `receiveBytes` is being fed right into `sendBytes`. -9. Encryption and chunking. See design for issues. +* Encryption and chunking. See design for issues. -10. Cycle prevention. See design. +* Indirect uploads (to be considered). See design. -11. indirect uploads (to be considered). See design. - -12. Support using a proxy when its url is a P2P address. +* Support using a proxy when its url is a P2P address. + (Eg tor-annex remotes.) From 6ea78ec86706bcc020b1a77dfdb2b73bb9aba0d6 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 13 Jun 2024 13:03:38 -0400 Subject: [PATCH 5/9] partial reproducer --- ..._c68cdec52b134a775cc9d84daa75b4f8._comment | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 doc/bugs/assistant___40__webapp__41___commited_unlocked_link_to_annex/comment_3_c68cdec52b134a775cc9d84daa75b4f8._comment diff --git a/doc/bugs/assistant___40__webapp__41___commited_unlocked_link_to_annex/comment_3_c68cdec52b134a775cc9d84daa75b4f8._comment b/doc/bugs/assistant___40__webapp__41___commited_unlocked_link_to_annex/comment_3_c68cdec52b134a775cc9d84daa75b4f8._comment new file mode 100644 index 0000000000..4ea531db32 --- /dev/null +++ b/doc/bugs/assistant___40__webapp__41___commited_unlocked_link_to_annex/comment_3_c68cdec52b134a775cc9d84daa75b4f8._comment @@ -0,0 +1,66 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 3""" + date="2024-06-13T16:31:57Z" + content=""" +First I wanted to see if I could get this to happen without the assistant. 
+ + joey@darkstar:~/tmp/y>echo '/annex/objects/SHA256E-s30--93c16dbf65b7b66e479bd484398c09c920338e4a1df1fe352b245078d04645f4' > new + joey@darkstar:~/tmp/y>git annex add new + add new ok + joey@darkstar:~/tmp/y>git annex find --format='${key}\n' new + SHA256E-s30--93c16dbf65b7b66e479bd484398c09c920338e4a1df1fe352b245078d04645f4 + + joey@darkstar:~/tmp/y>git config annex.largefiles anything + joey@darkstar:~/tmp/y>echo '/annex/objects/SHA256E-s30--93c16dbf65b7b66e479bd484398c09c920338e4a1df1fe352b245078d04645f4' > new2 + joey@darkstar:~/tmp/y>git add new2 + joey@darkstar:~/tmp/y>git annex find --format='${key}\n' new2 + SHA256E-s30--93c16dbf65b7b66e479bd484398c09c920338e4a1df1fe352b245078d04645f4 + +So no, it must be only the assistant that can mess up and add an annexed +link to the annex. + +Secondly, here's a way to manually create a repository with this behavior +w/o using the assistant. + + joey@darkstar:~/tmp/y>git remote add z ../z + joey@darkstar:~/tmp/y>git-annex move --key SHA256E-s30--93c16dbf65b7b66e479bd484398c09c920338e4a1df1fe352b245078d04645f4 --to z + joey@darkstar:~/tmp/y>echo '/annex/objects/SHA256E-s30--93c16dbf65b7b66e479bd484398c09c920338e4a1df1fe352b245078d04645f4' > funkyobj + joey@darkstar:~/tmp/y>git-annex setkey WORM--foo funkyobj + setkey funkyobj ok + joey@darkstar:~/tmp/y>echo '/annex/objects/WORM--foo' > funky + joey@darkstar:~/tmp/y>git add funky + git-annex: git status will show funky to be modified, since content availability has changed and git-annex was unable to update the index. This is only a cosmetic problem affecting git status; git add, git commit, etc won't be affected. 
To fix the git status display, you can run: git-annex restage + joey@darkstar:~/tmp/y>git commit -m add funky + joey@darkstar:~/tmp/y>git annex find --format='${key}\n' funky + WORM--foo + joey@darkstar:~/tmp/y>cat funky + /annex/objects/SHA256E-s30--93c16dbf65b7b66e479bd484398c09c920338e4a1df1fe352b245078d04645f4 + joey@darkstar:~/tmp/y>git-annex get funky + joey@darkstar:~/tmp/y> + +Nothing has gone wrong yet, funky is an unlocked file and it happens to have +the content of an annex pointer file, but git-annex is not treating that +content *as* an annex pointer file. If it were, the `git-annex get funky` above +would get the SHA256 key from remote x. + +But in a fresh clone, it's another story: + + joey@darkstar:~/tmp>git clone y x + joey@darkstar:~/tmp>cd x + joey@darkstar:~/tmp/x>git remote add z ../z + joey@darkstar:~/tmp/x>cat funky + /annex/objects/WORM--foo + joey@darkstar:~/tmp/x>git-annex get funky + get funky (from origin...) + ok + (recording state in git...) + joey@darkstar:~/tmp/x>git-annex get funky + get funky (from z...) + ok + (recording state in git...) + joey@darkstar:~/tmp/x>cat funky + Thu Jun 13 12:30:17 JEST 2024 + +Which reproduces what you showed. I think this on its own is a bug, leaving aside whatever caused the assistant to generate this. 
+"""]] From ebebc04273207428196f3ed98c3acf344c2c76df Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 13 Jun 2024 13:40:04 -0400 Subject: [PATCH 6/9] comment --- ..._5ec1ab77318889c1545f4881ab6e44e9._comment | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 doc/bugs/assistant___40__webapp__41___commited_unlocked_link_to_annex/comment_4_5ec1ab77318889c1545f4881ab6e44e9._comment diff --git a/doc/bugs/assistant___40__webapp__41___commited_unlocked_link_to_annex/comment_4_5ec1ab77318889c1545f4881ab6e44e9._comment b/doc/bugs/assistant___40__webapp__41___commited_unlocked_link_to_annex/comment_4_5ec1ab77318889c1545f4881ab6e44e9._comment new file mode 100644 index 0000000000..17387f0088 --- /dev/null +++ b/doc/bugs/assistant___40__webapp__41___commited_unlocked_link_to_annex/comment_4_5ec1ab77318889c1545f4881ab6e44e9._comment @@ -0,0 +1,38 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 4""" + date="2024-06-13T17:07:02Z" + content=""" +`git-annex add` (and smudge) use `isPointerFile` to check if a file that is +being added is an annex pointer file. And in that case they stage the +pointer file, rather than injecting it into the annex. + +The assistant also checks `isPointerFile` though. And in the simple case, +it also commits a newly added pointer file correctly: + + joey@darkstar:~/tmp/b2/a>git-annex assistant + joey@darkstar:~/tmp/b2/a>echo '/annex/objects/SHA256E-s30--93c16dbf65b7b66e479bd484398c09c920338e4a1df1fe352b245078d04645f4' > new + joey@darkstar:~/tmp/b2/a>git show|tail -n 1 + +/annex/objects/SHA256E-s30--93c16dbf65b7b66e479bd484398c09c920338e4a1df1fe352b245078d04645f4 + +So this makes me think of a race condition. What if the file is not a pointer +file when the assistant checks `isPointerFile`. But then it gets turned into +one before it ingests it. 
+ +In `git-annex add`, it first stats the file before checking if it's a pointer +file, and later it checks if the file has changed while it was being added, +which should avoid such races. + +Looking at the assistant, I'm not at all confident it handles such a race. + +It might even be another thread of the assistant that triggered the race. +Could be that something caused the assistant to drop the file, +then get it again, then drop it again. (Eg something wrong with +configuration causing a non-stable state... like "not present" in preferred +content). + +I've tried running a get/drop/get/drop loop while the assistant is running, +and have not seen this happen to a file yet. But the race window is probably small. +An interesting thing I did notice is that sometimes when such a loop runs for a while, +the file will be left as a pointer file after `git-annex get`. +"""]] From d16e19b8ca8785600c05c303da789f1f8d619cbd Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 13 Jun 2024 14:30:32 -0400 Subject: [PATCH 7/9] comment --- ..._7a14589a7ca4957ae758e342cc7b4596._comment | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 doc/bugs/assistant___40__webapp__41___commited_unlocked_link_to_annex/comment_5_7a14589a7ca4957ae758e342cc7b4596._comment diff --git a/doc/bugs/assistant___40__webapp__41___commited_unlocked_link_to_annex/comment_5_7a14589a7ca4957ae758e342cc7b4596._comment b/doc/bugs/assistant___40__webapp__41___commited_unlocked_link_to_annex/comment_5_7a14589a7ca4957ae758e342cc7b4596._comment new file mode 100644 index 0000000000..f9abe7780e --- /dev/null +++ b/doc/bugs/assistant___40__webapp__41___commited_unlocked_link_to_annex/comment_5_7a14589a7ca4957ae758e342cc7b4596._comment @@ -0,0 +1,91 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 5""" + date="2024-06-13T18:01:01Z" + content=""" +Looking at the behavior of `git-annex get`, the first one leaves the index +in a diff state: + + 
joey@darkstar:~/tmp/b2/x>git-annex get funky + get funky (from origin...) + ok + (recording state in git...) + joey@darkstar:~/tmp/b2/x>git diff --cached + diff --git a/funky b/funky + index a8813f1..9488a18 100644 + --- a/funky + +++ b/funky + @@ -1 +1 @@ + -/annex/objects/WORM--foo + +/annex/objects/SHA256E-s30--93c16dbf65b7b66e479bd484398c09c920338e4a1df1fe352b245078d04645f4 + +To the second `git-annex get`, this is indistinguishable from a different +unlocked file having been moved over top of funky. So the behavior of the +second one is fine. + +The problem is with the first `git-annex get` leaving the index in that state. + +What's happening is, it doesn't restage the index, because the restage +itself can't tell the difference between this state and an unlocked file having +been moved over top of funky. In particular, `git update-index --refresh --stdin` +when run after the first `git-annex get`, and fed "funky", leaves the index in diff state. + + joey@darkstar:~/tmp/b2/x>touch funky + joey@darkstar:~/tmp/b2/x>echo funky | GIT_TRACE=1 git update-index --refresh --stdin + 14:14:33.911458 git.c:465 trace: built-in: git update-index --refresh --stdin + 14:14:33.911759 run-command.c:657 trace: run_command: 'git-annex filter-process' + 14:14:33.917118 git.c:465 trace: built-in: git config --null --list + 14:14:33.919641 git.c:465 trace: built-in: git show-ref git-annex + 14:14:33.921390 git.c:465 trace: built-in: git show-ref --hash refs/heads/git-annex + 14:14:33.925579 git.c:465 trace: built-in: git cat-file --batch + 14:14:33.927011 run-command.c:50 trace: run_command: running exit handler for pid 1164525 + joey@darkstar:~/tmp/b2/x>git status --short + M funky + +So git update-index is running `git-annex filter-process`, which is doing +the same as `git-annex smudge --clean funky` in this case. +And in Command.Smudge.clean, there is a `parseLinkTargetOrPointerLazy'` call +which is intended to avoid storing a pointer file in the annex... 
The very +thing that the assistant is somehow incorrectly doing. In this case +though, that notices that funky's content looks like an annex pointer file, +so it outputs that pointer. So git stages that pointer. + +To avoid this, the first `git-annex get` would need to notice that the +content it got looks like a pointer file. And it would need to communicate +that through the `git update-index` somehow to `git-annex filter-process`. Then +when that saw the same pointer file, it could output the original key, and +this situation would be avoided. Also bear in mind that the +`git update-index` can be interrupted and get restarted later and +it would still need to remember that it was dealing with this case then. +This seems... doable, but it will not be easy. + +PS, Full script to synthesize a repository with this situation follows: + + git init z + cd z + git-annex init + git commit --allow-empty -m created + cd .. + git clone z y + cd y + git-annex init + echo 'Thu Jun 13 12:30:17 JEST 2024' > foo + git-annex add foo + git commit -m added + git-annex move foo --to origin + git rm foo + git commit -m removed + echo '/annex/objects/SHA256E-s30--93c16dbf65b7b66e479bd484398c09c920338e4a1df1fe352b245078d04645f4' > funkyobj + git-annex setkey WORM--foo funkyobj + echo '/annex/objects/WORM--foo' > funky + git add funky + git commit -m add\ funky + git annex find --format='${key}\n' funky + git-annex get funky + cd .. 
+ git clone y x + cd x + git remote add z ../z + git-annex get funky + git-annex get funky +"""]] From 9895e6659d58e2869a960fa83dfa0486aee874e0 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 13 Jun 2024 19:08:04 -0400 Subject: [PATCH 8/9] update --- doc/thanks/list | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/thanks/list b/doc/thanks/list index 9089bb87af..5682043b13 100644 --- a/doc/thanks/list +++ b/doc/thanks/list @@ -118,3 +118,5 @@ Stephen Seo, Antoine Balaine, mycroft, Lerrr, +Eve, +Marco, From af79728ac310b327876900c05564e3df6abd452a Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 17 Jun 2024 09:26:03 -0400 Subject: [PATCH 9/9] tab complete special remotes An oversight.. And with the work in progress proxy and cluster, there can be additional remotes that are not listed in .git/config, but are available. Making those more discoverable is another big benefit of this. --- CHANGELOG | 1 + CmdLine/GitAnnex/Options.hs | 33 +++++++++++++++++++++++++-------- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index e42f967b5b..6c5864aa3f 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -3,6 +3,7 @@ git-annex (10.20240532) UNRELEASED; urgency=medium * Fix a bug where interrupting git-annex while it is updating the git-annex branch for an export could later lead to git fsck complaining about missing tree objects. + * Tab completion of options like --from now includes special remotes. * Fix Windows build with Win32 2.13.4+ Thanks, Oleg Tolmatcev diff --git a/CmdLine/GitAnnex/Options.hs b/CmdLine/GitAnnex/Options.hs index ed6bca99c7..80ed6c2575 100644 --- a/CmdLine/GitAnnex/Options.hs +++ b/CmdLine/GitAnnex/Options.hs @@ -40,6 +40,7 @@ import qualified Types.Backend as Backend import Utility.HumanTime import Utility.DataUnits import Annex.Concurrent +import Remote.List -- Options that are accepted by all git-annex sub-commands, -- although not always used. 
@@ -569,14 +570,34 @@ parseDaemonOptions canstop ) completeRemotes :: HasCompleter f => Mod f a -completeRemotes = completer $ mkCompleter $ \input -> do - r <- maybe (pure Nothing) (Just <$$> Git.Config.read) - =<< Git.Construct.fromCwd - return $ filter (input `isPrefixOf`) $ - mapMaybe remoteKeyToRemoteName $ - filter isRemoteUrlKey $ - maybe [] (M.keys . config) r - +completeRemotes = completer $ mkCompleter $ \input -> + Git.Construct.fromCwd >>= \case + Nothing -> return [] + Just g -> completeRemotes' g input + +-- Completes remote names that have input as a prefix. When the repo is +-- initialized for git-annex, the names come from remoteList. Otherwise, +-- they come from the config keys of g', the repo after Git.Config.read +-- (g is the repo before its config was read). +completeRemotes' :: Repo -> [Char] -> IO [[Char]] +completeRemotes' g input = do + g' <- Git.Config.read g + state <- Annex.new g' + Annex.eval state $ do + Annex.setOutput QuietOutput + gc <- Annex.getGitConfig + if isinitialized gc + then do + rs <- remoteList + matches $ map Remote.name rs + else matches $ + mapMaybe remoteKeyToRemoteName $ + filter isRemoteUrlKey $ + M.keys $ config g' + where + isinitialized gc = annexUUID gc /= NoUUID && isJust (annexVersion gc) + matches = return . filter (input `isPrefixOf`) + completeBackends :: HasCompleter f => Mod f a completeBackends = completeWith $ map (decodeBS . formatKeyVariety . Backend.backendVariety) Backend.builtinList