diff --git a/Assistant/TransferQueue.hs b/Assistant/TransferQueue.hs index f1df845f42..6a44732622 100644 --- a/Assistant/TransferQueue.hs +++ b/Assistant/TransferQueue.hs @@ -92,7 +92,7 @@ queueTransfersMatching matching reason schedule k f direction filter (\r -> not (inset s r || Remote.readonly r)) (syncDataRemotes st) where - locs = S.fromList <$> Remote.keyLocations k + locs = S.fromList . map Remote.uuid <$> Remote.keyPossibilities k inset s r = S.member (Remote.uuid r) s gentransfer r = Transfer { transferDirection = direction diff --git a/CHANGELOG b/CHANGELOG index 54a244cc11..dd1870c254 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -3,6 +3,8 @@ git-annex (6.20180720) UNRELEASED; urgency=medium * S3: Support credential-less download from remotes configured with public=yes exporttree=yes. * Fix reversion in display of http 404 errors. + * Added remote.name.annex-speculate-present config that can be used to + make cache remotes. -- Joey Hess Tue, 31 Jul 2018 12:14:11 -0400 diff --git a/Command/Sync.hs b/Command/Sync.hs index 0fb3bdc3fa..4442ed499f 100644 --- a/Command/Sync.hs +++ b/Command/Sync.hs @@ -616,7 +616,7 @@ seekSyncContent o rs = do -} syncFile :: Either (Maybe (Bloom Key)) (Key -> Annex ()) -> [Remote] -> AssociatedFile -> Key -> Annex Bool syncFile ebloom rs af k = onlyActionOn' k $ do - locs <- Remote.keyLocations k + locs <- map Remote.uuid <$> Remote.keyPossibilities k let (have, lack) = partition (\r -> Remote.uuid r `elem` locs) rs got <- anyM id =<< handleget have diff --git a/Remote.hs b/Remote.hs index ff891962a1..842c3bc606 100644 --- a/Remote.hs +++ b/Remote.hs @@ -1,6 +1,6 @@ {- git-annex remotes - - - Copyright 2011 Joey Hess + - Copyright 2011-2018 Joey Hess - - Licensed under the GNU GPL version 3 or higher. -} @@ -278,13 +278,21 @@ keyLocations key = trustExclude DeadTrusted =<< loggedLocations key {- Cost ordered lists of remotes that the location log indicates - may have a key. 
+ - + - Also includes remotes with remoteAnnexSpeculatePresent set. -} keyPossibilities :: Key -> Annex [Remote] keyPossibilities key = do u <- getUUID -- uuids of all remotes that are recorded to have the key locations <- filter (/= u) <$> keyLocations key - fst <$> remoteLocations locations [] + speclocations <- map uuid + . filter (remoteAnnexSpeculatePresent . gitconfig) + <$> remoteList + -- there are unlikely to be many speclocations, so building a Set + -- is not worth the expense + let locations' = speclocations ++ filter (`notElem` speclocations) locations + fst <$> remoteLocations locations' [] {- Given a list of locations of a key, and a list of all - trusted repositories, generates a cost-ordered list of diff --git a/Types/GitConfig.hs b/Types/GitConfig.hs index 26ad354c8d..4475abf58b 100644 --- a/Types/GitConfig.hs +++ b/Types/GitConfig.hs @@ -226,6 +226,7 @@ data RemoteGitConfig = RemoteGitConfig , remoteAnnexStartCommand :: Maybe String , remoteAnnexStopCommand :: Maybe String , remoteAnnexAvailability :: Maybe Availability + , remoteAnnexSpeculatePresent :: Bool , remoteAnnexBare :: Maybe Bool , remoteAnnexRetry :: Maybe Integer , remoteAnnexRetryDelay :: Maybe Seconds @@ -281,6 +282,7 @@ extractRemoteGitConfig r remotename = do , remoteAnnexStartCommand = notempty $ getmaybe "start-command" , remoteAnnexStopCommand = notempty $ getmaybe "stop-command" , remoteAnnexAvailability = getmayberead "availability" + , remoteAnnexSpeculatePresent = getbool "speculate-present" False , remoteAnnexBare = getmaybebool "bare" , remoteAnnexRetry = getmayberead "retry" , remoteAnnexRetryDelay = Seconds diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index 163a628c15..3d2f92f323 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -1283,6 +1283,13 @@ Here are all the supported configuration settings. Can be used to tell git-annex whether a remote is LocallyAvailable or GloballyAvailable. Normally, git-annex determines this automatically. 
+* `remote.<name>.annex-speculate-present` + + Make git-annex speculate that this remote may contain the content of any + file, even though its normal location tracking does not indicate that it + does. This will cause git-annex to try to get all file contents from the + remote. Can be useful in setting up a caching remote. + * `remote.<name>.annex-bare` Can be used to tell git-annex if a remote is a bare repository diff --git a/doc/tips/local_caching_of_annexed_files.mdwn b/doc/tips/local_caching_of_annexed_files.mdwn new file mode 100644 index 0000000000..b7ddb545b9 --- /dev/null +++ b/doc/tips/local_caching_of_annexed_files.mdwn @@ -0,0 +1,91 @@ +Here's how to set up a local cache of annexed files, that can be used +to avoid repeated downloads. + +An example use case: Your CI system is operating on a git-annex repository, +so every time it runs it makes a fresh clone of the repository and uses +`git-annex get` to download a lot of data into it. + +We'll create a cache repository, set it as a remote of the other git-annex +repositories, and configure git-annex to check the cache first before other +more expensive ways of retrieving content. The cache can be cleaned out +whenever you like with simple unix commands. + +Some other nice properties -- When used on a system like BTRFS with COW +support, content from the cache can populate multiple other repositories +without using any additional disk space. And, git-annex repositories that +are otherwise unrelated can share use of the cache if they happen to +contain a common file. + +You'll need git-annex 6.20180802 or newer to follow these instructions. + +## creating the cache + +First let's create a new, empty git-annex repository. It will be put in +~/.annex-cache in the example, but for best results, put it in the same +filesystem as your other git-annex repositories.
+ + git init ~/.annex-cache + cd ~/.annex-cache + git annex init + git config annex.hardlink true + git annex untrust here + +The cache does not need to be a git annex repository; any kind of special +remote can be used as a cache too. But, using a git repository lets +annex.hardlink be used to make hard links between the cache and +repositories using it. + +The cache is made untrusted, because its contents can be cleaned at any +time; other repositories should not trust it to retain content. + +## making repositories use the cache + +Now in each git-annex repository that you want to use the cache, add it as +a remote, and configure it as follows: + + cd my-repository + git remote add cache ~/.annex-cache + git config remote.cache.annex-speculate-present true + git config remote.cache.annex-cost 10 + git config remote.cache.annex-pull false + git config remote.cache.annex-push false + +The annex-speculate-present setting is the essential part. It makes +git-annex know that the cache repository may contain the content of any +annexed file. So, when getting a file, git-annex will try the cache +repository first. + +The low annex-cost makes git-annex try to get content from the cache remote +before any other remotes. + +The annex-pull and annex-push settings prevent `git-annex sync` from +pulling and pushing to the remote. The cache repository will remain an +empty git repository (except for the content of annexed files). This means +that the same cache can be used with multiple different git-annex +repositories, without intermingling their git data. You should also avoid +manual `git pull` and `git push` to the cache remote. + +## populating the cache + +For the cache to be used, you need to get file contents into it somehow. +A simple way to do that is, in a git-annex repository that already +contains the content of files: + + git annex copy --to cache + +You could run that anytime after you get content. 
There are also ways to +automate it, but getting some files into the cache manually is a good +enough start. + +## cleaning the cache + +XXX find + +## automatically populating the cache + +XXX + +## more caches + +The example above used a local cache on the same system. However, it's also +possible to have a cache repository shared among computers on a LAN.