2019-02-21 21:32:59 +00:00
|
|
|
{- git-annex import from remotes
|
|
|
|
-
|
2024-01-02 18:10:52 +00:00
|
|
|
- Copyright 2019-2024 Joey Hess <id@joeyh.name>
|
2019-02-21 21:32:59 +00:00
|
|
|
-
|
|
|
|
- Licensed under the GNU AGPL version 3 or higher.
|
|
|
|
-}
|
|
|
|
|
2019-02-27 17:15:02 +00:00
|
|
|
{-# LANGUAGE OverloadedStrings #-}
|
2019-02-26 19:25:28 +00:00
|
|
|
|
2019-02-23 19:47:55 +00:00
|
|
|
module Annex.Import (
|
|
|
|
ImportTreeConfig(..),
|
|
|
|
ImportCommitConfig(..),
|
|
|
|
buildImportCommit,
|
2019-02-26 19:25:28 +00:00
|
|
|
buildImportTrees,
|
add thirdPartyPopulated interface
This is to support, eg a borg repo as a special remote, which is
populated not by running git-annex commands, but by using borg. Then
git-annex sync lists the content of the remote, learns which files are
annex objects, and treats those as present in the remote.
So, most of the import machinery is reused, to a new purpose. While
normally importtree maintains a remote tracking branch, this does not,
because the files stored in the remote are annex object files, not
user-visible filenames. But, internally, a git tree is still generated,
of the files on the remote that are annex objects. This tree is used
by retrieveExportWithContentIdentifier, etc. As with other import/export
remotes, that the tree is recorded in the export log, and gets grafted
into the git-annex branch.
importKey changed to be able to return Nothing, to indicate when an
ImportLocation is not an annex object and so should be skipped from
being included in the tree.
It did not seem to make sense to have git-annex import do this, since
from the user's perspective, it's not like other imports. So only
git-annex sync does it.
Note that, git-annex sync does not yet download objects from such
remotes that are preferred content. importKeys is run with
content downloading disabled, to avoid getting the content of all
objects. Perhaps what's needed is for seekSyncContent to be run with these
remotes, but I don't know if it will just work (in particular, it needs
to avoid trying to transfer objects to them), so I skipped that for now.
(Untested and unused as of yet.)
This commit was sponsored by Jochen Bartl on Patreon.
2020-12-18 18:52:57 +00:00
|
|
|
recordImportTree,
|
2020-09-28 19:29:08 +00:00
|
|
|
canImportKeys,
|
2023-05-31 19:45:23 +00:00
|
|
|
ImportResult(..),
|
2023-06-01 17:46:16 +00:00
|
|
|
Imported,
|
2023-05-31 19:45:23 +00:00
|
|
|
importChanges,
|
2020-07-03 17:41:57 +00:00
|
|
|
importKeys,
|
2019-05-21 18:38:00 +00:00
|
|
|
makeImportMatcher,
|
2020-09-30 14:10:03 +00:00
|
|
|
getImportableContents,
|
2019-02-23 19:47:55 +00:00
|
|
|
) where
|
2019-02-21 21:32:59 +00:00
|
|
|
|
|
|
|
import Annex.Common
|
|
|
|
import Types.Import
|
2019-02-26 17:11:25 +00:00
|
|
|
import qualified Types.Remote as Remote
|
2019-02-21 21:32:59 +00:00
|
|
|
import Git.Types
|
|
|
|
import Git.Tree
|
2019-02-22 16:41:17 +00:00
|
|
|
import Git.Sha
|
2019-02-21 21:32:59 +00:00
|
|
|
import Git.FilePath
|
2019-04-24 19:13:07 +00:00
|
|
|
import Git.History
|
2023-05-31 19:45:23 +00:00
|
|
|
import qualified Git.DiffTree
|
2019-02-22 16:41:17 +00:00
|
|
|
import qualified Git.Ref
|
|
|
|
import qualified Git.Branch
|
|
|
|
import qualified Annex
|
2019-02-21 21:32:59 +00:00
|
|
|
import Annex.Link
|
2019-02-22 16:41:17 +00:00
|
|
|
import Annex.LockFile
|
2019-02-27 17:15:02 +00:00
|
|
|
import Annex.Content
|
2019-03-01 17:26:15 +00:00
|
|
|
import Annex.Export
|
2019-05-01 17:13:00 +00:00
|
|
|
import Annex.RemoteTrackingBranch
|
2020-06-23 20:07:18 +00:00
|
|
|
import Annex.HashObject
|
2020-09-04 17:49:57 +00:00
|
|
|
import Annex.Transfer
|
2020-09-30 14:41:59 +00:00
|
|
|
import Annex.CheckIgnore
|
2023-06-01 17:46:16 +00:00
|
|
|
import Annex.CatFile
|
2020-12-23 19:21:33 +00:00
|
|
|
import Annex.VectorClock
|
2024-08-08 19:13:12 +00:00
|
|
|
import Annex.SpecialRemote.Config
|
2019-03-08 16:33:44 +00:00
|
|
|
import Command
|
2019-02-27 17:15:02 +00:00
|
|
|
import Backend
|
|
|
|
import Types.Key
|
|
|
|
import Types.KeySource
|
2019-03-08 16:43:03 +00:00
|
|
|
import Messages.Progress
|
2019-02-27 17:15:02 +00:00
|
|
|
import Utility.DataUnits
|
2019-06-25 15:37:52 +00:00
|
|
|
import Utility.Metered
|
2023-05-31 16:31:14 +00:00
|
|
|
import Utility.Hash (sha1s)
|
2023-05-31 19:45:23 +00:00
|
|
|
import Logs.Import
|
2019-02-22 16:41:17 +00:00
|
|
|
import Logs.Export
|
2019-02-27 17:58:03 +00:00
|
|
|
import Logs.Location
|
2019-05-21 18:38:00 +00:00
|
|
|
import Logs.PreferredContent
|
|
|
|
import Types.FileMatcher
|
|
|
|
import Annex.FileMatcher
|
2020-09-28 17:22:16 +00:00
|
|
|
import qualified Utility.Matcher
|
2019-02-26 19:25:28 +00:00
|
|
|
import qualified Database.Export as Export
|
2019-03-06 22:04:30 +00:00
|
|
|
import qualified Database.ContentIdentifier as CIDDb
|
|
|
|
import qualified Logs.ContentIdentifier as CIDLog
|
2020-06-11 20:07:36 +00:00
|
|
|
import Backend.Utilities
|
2019-02-26 19:25:28 +00:00
|
|
|
|
|
|
|
import Control.Concurrent.STM
|
|
|
|
import qualified Data.Map.Strict as M
|
2019-03-08 16:33:44 +00:00
|
|
|
import qualified Data.Set as S
|
2020-09-30 14:10:03 +00:00
|
|
|
import qualified System.FilePath.Posix.ByteString as Posix
|
2019-12-09 17:49:05 +00:00
|
|
|
import qualified System.FilePath.ByteString as P
|
2023-05-31 16:31:14 +00:00
|
|
|
import qualified Data.ByteArray.Encoding as BA
|
2019-03-04 20:02:56 +00:00
|
|
|
|
2019-02-23 19:47:55 +00:00
|
|
|
{- Selects where the tree imported from a remote ends up. -}
data ImportTreeConfig
	= ImportTree
	-- ^ The remote's tree is imported unchanged, as the whole tree.
	| ImportSubTree TopFilePath Sha
	-- ^ The remote's tree is grafted into the given subdirectory of
	-- an existing tree (identified by the Sha), replacing whatever
	-- was previously at that location.
	deriving Show
|
|
|
|
|
|
|
|
{- Settings used when building the commit for an import. -}
data ImportCommitConfig = ImportCommitConfig
	{ importCommitTracking :: Maybe Sha
	-- ^ The commit the remote tracking branch currently points to,
	-- when there is one.
	, importCommitMode :: Git.Branch.CommitMode
	, importCommitMessages :: [String]
	}
|
|
|
|
|
make import tree from remote generate a merge commit
This way no history is lost, neither what was exported to the remote,
or the history of changes that is imported from it. No complicated
correlation of two possibly very different histories is needed, just
record what we know and then git merge will do a good job.
Also, it notices when the remote tracking branch doesn't need to be updated,
and avoids doing anything, so noop remotes are super cheap.
The only catch here is that, since the commits generated for imports
from the remote don't have a stable date or author/committer, each
(non-noop) import generates different commits for the same imported
trees. So, when the imported remote tracking branch is merged into master
and then a change is imported again, there will be an extra series of
commits, which will get more and more expensive each time.
This seems to call for making stable commits for imports. Also that
seems a good idea to make importing in several repositories have the
same result.
2019-04-30 20:13:21 +00:00
|
|
|
{- Builds a commit for an import from a special remote.
|
2019-02-21 21:32:59 +00:00
|
|
|
-
|
make import tree from remote generate a merge commit
This way no history is lost, neither what was exported to the remote,
or the history of changes that is imported from it. No complicated
correlation of two possibly very different histories is needed, just
record what we know and then git merge will do a good job.
Also, it notices when the remote tracking branch doesn't need to be updated,
and avoids doing anything, so noop remotes are super cheap.
The only catch here is that, since the commits generated for imports
from the remote don't have a stable date or author/committer, each
(non-noop) import generates different commits for the same imported
trees. So, when the imported remote tracking branch is merged into master
and then a change is imported again, there will be an extra series of
commits, which will get more and more expensive each time.
This seems to call for making stable commits for imports. Also that
seems a good idea to make importing in several repositories have the
same result.
2019-04-30 20:13:21 +00:00
|
|
|
- When there are no changes to make (importCommitTracking
|
|
|
|
- already matches what was imported), returns Nothing.
|
2019-02-23 19:47:55 +00:00
|
|
|
-
|
2019-02-21 21:32:59 +00:00
|
|
|
- After importing from a remote, exporting the same thing back to the
|
2019-02-22 16:41:17 +00:00
|
|
|
- remote should be a no-op. So, the export log and database are
|
|
|
|
- updated to reflect the imported tree.
|
2019-02-21 21:32:59 +00:00
|
|
|
-
|
2019-02-23 19:47:55 +00:00
|
|
|
- This does not download any content from a remote. But since it needs the
|
2019-02-22 16:41:17 +00:00
|
|
|
- Key of imported files to be known, its caller will have to first download
|
2019-02-21 21:32:59 +00:00
|
|
|
- new files in order to generate keys for them.
|
|
|
|
-}
|
|
|
|
buildImportCommit
	:: Remote
	-> ImportTreeConfig
	-> ImportCommitConfig
	-> Imported
	-> Annex (Maybe Ref)
buildImportCommit remote importtreeconfig importcommitconfig imported =
	usabletracking >>= commitonto
  where
	-- The configured tracking commit is only built upon when git is
	-- able to resolve its tree; otherwise it is treated as absent.
	usabletracking = case importCommitTracking importcommitconfig of
		Nothing -> return Nothing
		Just trackingcommit -> do
			mtree <- inRepo (Git.Ref.tree trackingcommit)
			return (trackingcommit <$ mtree)

	-- Records the imported tree, then builds the commit. The deferred
	-- state update is run only when a commit was actually produced.
	commitonto mtracking = do
		(history, recordstate) <- recordImportTree remote importtreeconfig imported
		mcommit <- buildImportCommit' remote importcommitconfig mtracking history
		case mcommit of
			Nothing -> return Nothing
			Just _ -> do
				recordstate
				return mcommit
|
|
|
|
|
|
|
|
{- Builds the tree (with its history) for an import from a special remote.
 -
 - The second value returned is an action that has not been run yet.
 - Running it updates the export database, the export log, and the
 - location log so that they reflect the imported tree.
 -}
recordImportTree
	:: Remote
	-> ImportTreeConfig
	-> Imported
	-> Annex (History Sha, Annex ())
recordImportTree remote importtreeconfig imported = do
	history@(History toptree _) <- buildImportTrees basetree subdir imported
	return (history, updatestate toptree)
  where
	remoteuuid = Remote.uuid remote

	-- The tree to build on, and the subdirectory (if any) that the
	-- import is grafted into.
	(basetree, subdir) = case importtreeconfig of
		ImportTree -> (emptyTree, Nothing)
		ImportSubTree dir sha -> (sha, Just dir)

	-- The order matters: the location log update queries the export
	-- database, so the database has to be updated first.
	updatestate toptree = do
		importedtree <- maybe (pure toptree) (subtreeof toptree) subdir
		updateexportdb importedtree
		oldexport <- updateexportlog importedtree
		updatelocationlog oldexport importedtree

	-- Looks up the tree at the subdirectory inside the final tree,
	-- using the empty tree when it cannot be resolved.
	subtreeof toptree dir =
		let subtreeref = Ref (fromRef' toptree <> ":" <> getTopFilePath dir)
		in fromMaybe emptyTree <$> inRepo (Git.Ref.tree subtreeref)

	updateexportdb importedtree = do
		db <- Export.openDb remoteuuid
		Export.writeLockDbWhile db $ do
			prevtree <- liftIO $ fromMaybe emptyTree
				<$> Export.getExportTreeCurrent db
			unless (importedtree == prevtree) $ do
				Export.updateExportDb db prevtree importedtree
				liftIO $ Export.recordExportTreeCurrent db importedtree
		Export.closeDb db

	-- Records the imported tree in the export log, and returns the
	-- export state that was logged before this change.
	updateexportlog importedtree = do
		oldexport <- getExport remoteuuid
		let change = ExportChange
			{ oldTreeish = exportedTreeishes oldexport
			, newTreeish = importedtree
			}
		recordExport remoteuuid importedtree change
		return oldexport

	-- The location log of the local repo for downloaded keys, and of
	-- the remote for keys present in it, is handled by downloadImport.
	-- What remains is marking keys as missing from the remote once the
	-- last copy of their content has been removed from it.
	--
	-- The export database must already be updated and flushed to disk
	-- when this runs, because it is queried here.
	updatelocationlog oldexport finaltree =
		-- A versioned remote still contains keys that are not
		-- present in the new tree, so nothing is marked missing.
		unless (isVersioning (Remote.config remote)) $ do
			db <- Export.openDb remoteuuid
			forM_ (exportedTreeishes oldexport) $ \oldtree ->
				Export.runExportDiffUpdater dropmissing db oldtree finaltree
			Export.closeDb db
	  where
		dropmissing db (Just oldkey) _newkey _
			| not (isGitShaKey oldkey) =
				unlessM (stillpresent db oldkey) $
					logChange NoLiveUpdate oldkey remoteuuid InfoMissing
		dropmissing _ _ _ _ = noop

		stillpresent db k = liftIO $ not . null
			<$> Export.getExportedLocation db k
|
2019-02-21 21:32:59 +00:00
|
|
|
|
2019-05-20 20:37:04 +00:00
|
|
|
buildImportCommit' :: Remote -> ImportCommitConfig -> Maybe Sha -> History Sha -> Annex (Maybe Sha)
|
|
|
|
buildImportCommit' remote importcommitconfig mtrackingcommit imported@(History ti _) =
|
make import tree from remote generate a merge commit
This way no history is lost, neither what was exported to the remote,
or the history of changes that is imported from it. No complicated
correlation of two possibly very different histories is needed, just
record what we know and then git merge will do a good job.
Also, it notices when the remote tracking branch doesn't need to be updated,
and avoids doing anything, so noop remotes are super cheap.
The only catch here is that, since the commits generated for imports
from the remote don't have a stable date or author/committer, each
(non-noop) import generates different commits for the same imported
trees. So, when the imported remote tracking branch is merged into master
and then a change is imported again, there will be an extra series of
commits, which will get more and more expensive each time.
This seems to call for making stable commits for imports. Also that
seems a good idea to make importing in several repositories have the
same result.
2019-04-30 20:13:21 +00:00
|
|
|
case mtrackingcommit of
|
2019-05-21 15:32:54 +00:00
|
|
|
Nothing -> Just <$> mkcommitsunconnected imported
|
make import tree from remote generate a merge commit
This way no history is lost, neither what was exported to the remote,
or the history of changes that is imported from it. No complicated
correlation of two possibly very different histories is needed, just
record what we know and then git merge will do a good job.
Also, it notices when the remote tracking branch doesn't need to be updated,
and avoids doing anything, so noop remotes are super cheap.
The only catch here is that, since the commits generated for imports
from the remote don't have a stable date or author/committer, each
(non-noop) import generates different commits for the same imported
trees. So, when the imported remote tracking branch is merged into master
and then a change is imported again, there will be an extra series of
commits, which will get more and more expensive each time.
This seems to call for making stable commits for imports. Also that
seems a good idea to make importing in several repositories have the
same result.
2019-04-30 20:13:21 +00:00
|
|
|
Just trackingcommit -> do
|
|
|
|
-- Get history of tracking branch to at most
|
2019-05-01 18:20:26 +00:00
|
|
|
-- one more level deep than what was imported,
|
|
|
|
-- so we'll have enough history to compare,
|
|
|
|
-- but not spend too much time getting it.
|
2019-05-21 15:32:54 +00:00
|
|
|
let maxdepth = succ importeddepth
|
make import tree from remote generate a merge commit
This way no history is lost, neither what was exported to the remote,
or the history of changes that is imported from it. No complicated
correlation of two possibly very different histories is needed, just
record what we know and then git merge will do a good job.
Also, it notices when the remote tracking branch doesn't need to be updated,
and avoids doing anything, so noop remotes are super cheap.
The only catch here is that, since the commits generated for imports
from the remote don't have a stable date or author/committer, each
(non-noop) import generates different commits for the same imported
trees. So, when the imported remote tracking branch is merged into master
and then a change is imported again, there will be an extra series of
commits, which will get more and more expensive each time.
This seems to call for making stable commits for imports. Also that
seems a good idea to make importing in several repositories have the
same result.
2019-04-30 20:13:21 +00:00
|
|
|
inRepo (getHistoryToDepth maxdepth trackingcommit)
|
|
|
|
>>= go trackingcommit
|
2019-04-26 14:17:02 +00:00
|
|
|
where
|
2019-05-21 15:32:54 +00:00
|
|
|
go _ Nothing = Just <$> mkcommitsunconnected imported
|
make import tree from remote generate a merge commit
This way no history is lost, neither what was exported to the remote,
nor the history of changes that is imported from it. No complicated
correlation of two possibly very different histories is needed, just
record what we know and then git merge will do a good job.
Also, it notices when the remote tracking branch doesn't need to be updated,
and avoids doing anything, so noop remotes are super cheap.
The only catch here is that, since the commits generated for imports
from the remote don't have a stable date or author/committer, each
(non-noop) import generates different commits for the same imported
trees. So, when the imported remote tracking branch is merged into master
and then a change is imported again, there will be an extra series of
commits, which will get more and more expensive each time.
This seems to call for making stable commits for imports. Also that
seems a good idea to make importing in several repositories have the
same result.
2019-04-30 20:13:21 +00:00
|
|
|
go trackingcommit (Just h)
|
2019-05-01 16:37:54 +00:00
|
|
|
-- If the tracking branch head is a merge commit
|
|
|
|
-- and one side of the merge matches the history,
|
|
|
|
-- nothing new needs to be committed.
|
2019-05-21 15:32:54 +00:00
|
|
|
| t == ti && any sametodepth (S.toList s) = return Nothing
|
2019-05-01 18:20:26 +00:00
|
|
|
-- If the tracking branch matches the history,
|
|
|
|
-- nothing new needs to be committed.
|
|
|
|
-- (This is unlikely to happen.)
|
2019-05-21 15:32:54 +00:00
|
|
|
| sametodepth h' = return Nothing
|
2024-01-02 17:56:50 +00:00
|
|
|
-- If the imported tree is unchanged,
|
|
|
|
-- nothing new needs to be committed.
|
|
|
|
| otherwise = getLastImportedTree remote >>= \case
|
|
|
|
Just (LastImportedTree lasttree)
|
|
|
|
| lasttree == ti -> return Nothing
|
|
|
|
_ -> gencommit trackingcommit h
|
make import tree from remote generate a merge commit
This way no history is lost, neither what was exported to the remote,
nor the history of changes that is imported from it. No complicated
correlation of two possibly very different histories is needed, just
record what we know and then git merge will do a good job.
Also, it notices when the remote tracking branch doesn't need to be updated,
and avoids doing anything, so noop remotes are super cheap.
The only catch here is that, since the commits generated for imports
from the remote don't have a stable date or author/committer, each
(non-noop) import generates different commits for the same imported
trees. So, when the imported remote tracking branch is merged into master
and then a change is imported again, there will be an extra series of
commits, which will get more and more expensive each time.
This seems to call for making stable commits for imports. Also that
seems a good idea to make importing in several repositories have the
same result.
2019-04-30 20:13:21 +00:00
|
|
|
where
|
|
|
|
h'@(History t s) = mapHistory historyCommitTree h
|
2024-01-02 17:56:50 +00:00
|
|
|
|
|
|
|
gencommit trackingcommit h = do
|
|
|
|
importedcommit <- case getRemoteTrackingBranchImportHistory h of
|
|
|
|
Nothing -> mkcommitsunconnected imported
|
|
|
|
Just oldimported@(History oldhc _)
|
|
|
|
| importeddepth == 1 ->
|
|
|
|
mkcommitconnected imported oldimported
|
|
|
|
| otherwise -> do
|
|
|
|
let oldimportedtrees = mapHistory historyCommitTree oldimported
|
|
|
|
mknewcommits oldhc oldimportedtrees imported
|
|
|
|
ti' <- addBackExportExcluded remote ti
|
|
|
|
Just <$> makeRemoteTrackingBranchMergeCommit'
|
|
|
|
trackingcommit importedcommit ti'
|
2019-04-23 20:34:19 +00:00
|
|
|
|
2019-05-21 15:32:54 +00:00
|
|
|
importeddepth = historyDepth imported
|
|
|
|
|
|
|
|
sametodepth b = imported == truncateHistoryToDepth importeddepth b
|
2019-04-23 20:34:19 +00:00
|
|
|
|
make import tree from remote generate a merge commit
This way no history is lost, neither what was exported to the remote,
nor the history of changes that is imported from it. No complicated
correlation of two possibly very different histories is needed, just
record what we know and then git merge will do a good job.
Also, it notices when the remote tracking branch doesn't need to be updated,
and avoids doing anything, so noop remotes are super cheap.
The only catch here is that, since the commits generated for imports
from the remote don't have a stable date or author/committer, each
(non-noop) import generates different commits for the same imported
trees. So, when the imported remote tracking branch is merged into master
and then a change is imported again, there will be an extra series of
commits, which will get more and more expensive each time.
This seems to call for making stable commits for imports. Also that
seems a good idea to make importing in several repositories have the
same result.
2019-04-30 20:13:21 +00:00
|
|
|
mkcommit parents tree = inRepo $ Git.Branch.commitTree
|
|
|
|
(importCommitMode importcommitconfig)
|
2024-04-09 16:56:47 +00:00
|
|
|
(importCommitMessages importcommitconfig)
|
make import tree from remote generate a merge commit
This way no history is lost, neither what was exported to the remote,
nor the history of changes that is imported from it. No complicated
correlation of two possibly very different histories is needed, just
record what we know and then git merge will do a good job.
Also, it notices when the remote tracking branch doesn't need to be updated,
and avoids doing anything, so noop remotes are super cheap.
The only catch here is that, since the commits generated for imports
from the remote don't have a stable date or author/committer, each
(non-noop) import generates different commits for the same imported
trees. So, when the imported remote tracking branch is merged into master
and then a change is imported again, there will be an extra series of
commits, which will get more and more expensive each time.
This seems to call for making stable commits for imports. Also that
seems a good idea to make importing in several repositories have the
same result.
2019-04-30 20:13:21 +00:00
|
|
|
parents
|
|
|
|
tree
|
|
|
|
|
2019-05-21 15:32:54 +00:00
|
|
|
-- Start a new history of import commits, not connected to any
|
|
|
|
-- prior import commits.
|
|
|
|
mkcommitsunconnected (History importedtree hs) = do
|
|
|
|
parents <- mapM mkcommitsunconnected (S.toList hs)
|
|
|
|
mkcommit parents importedtree
|
|
|
|
|
|
|
|
-- Commit the new history connected with the old history.
|
|
|
|
-- Used when the import is not versioned, so the history depth is 1.
|
|
|
|
mkcommitconnected (History importedtree _) (History oldhc _) = do
|
|
|
|
let parents = [historyCommit oldhc]
|
2019-05-01 18:20:26 +00:00
|
|
|
mkcommit parents importedtree
|
|
|
|
|
2019-05-01 19:34:07 +00:00
|
|
|
-- Reuse the commits from the old imported History when possible.
|
|
|
|
mknewcommits oldhc old new@(History importedtree hs)
|
|
|
|
| old == new = return $ historyCommit oldhc
|
2019-05-01 18:20:26 +00:00
|
|
|
| otherwise = do
|
2019-05-01 19:34:07 +00:00
|
|
|
parents <- mapM (mknewcommits oldhc old) (S.toList hs)
|
2019-05-01 18:20:26 +00:00
|
|
|
mkcommit parents importedtree
|
|
|
|
|
2023-06-01 17:46:16 +00:00
|
|
|
{- Builds a history of git trees for an import.
 -
 - A full import is converted by buildImportTreesGeneric. A diff import
 - is applied on top of the last imported tree, and always results in a
 - History with no parents.
 -
 - When a subdir is provided, the imported tree is grafted into
 - the basetree at that location, replacing any object that was there.
 -}
buildImportTrees
    :: Ref
    -> Maybe TopFilePath
    -> Imported
    -> Annex (History Sha)
buildImportTrees basetree msubdir = \case
    ImportedFull imported ->
        buildImportTreesGeneric convertImportTree basetree msubdir imported
    ImportedDiff (LastImportedTree oldtree) imported -> do
        importtree <- case importableContents imported of
            -- Nothing was added, changed or removed, so the old tree
            -- is reused as-is.
            [] -> pure oldtree
            changes -> applydiff oldtree changes
        repo <- Annex.gitRepo
        t <- withMkTreeHandle repo $
            graftImportTree basetree msubdir importtree
        -- Diffing is not currently implemented when the history is not empty.
        return (History t mempty)
  where
    -- Applies the added/changed and removed locations to the old tree.
    applydiff oldtree changes = do
        newtreeitems <- sequence
            [ mkImportTreeItem msubdir loc v
            | (loc, DiffChanged v) <- changes
            ]
        let removedfiles =
                [ asTopFilePath (fromImportLocation loc)
                | (loc, DiffRemoved) <- changes
                ]
        inRepo $ adjustTree
            (pure . Just)
            -- ^ keep files that are not added/removed the same
            newtreeitems
            (\_previous added -> added)
            -- ^ prefer newly added version of file
            removedfiles
            oldtree
|
2023-05-31 16:31:14 +00:00
|
|
|
|
|
|
|
{- Converts a list of imported locations to a Tree, making one tree item
 - for each location. -}
convertImportTree :: Maybe TopFilePath -> [(ImportLocation, Either Sha Key)] -> Annex Tree
convertImportTree msubdir ls = do
    items <- forM ls $ \(loc, v) ->
        mkImportTreeItem msubdir loc v
    return (treeItemsToTree items)
|
|
|
|
|
|
|
|
{- Makes the tree item for an imported location.
 -
 - A Key results in a symlink item, whose link target is calculated
 - relative to the location of the file under the subdir (when there is
 - one). A Sha is used directly as the content of a regular file item.
 -
 - Either way, the path of the tree item does not include the subdir.
 -}
mkImportTreeItem :: Maybe TopFilePath -> ImportLocation -> Either Sha Key -> Annex TreeItem
mkImportTreeItem msubdir loc = either gitfile annexsymlink
  where
    gitfile sha =
        return $ TreeItem treepath (fromTreeItemType TreeFile) sha

    annexsymlink k = do
        relf <- fromRepo $ fromTopFilePath topf
        linksha <- hashSymlink =<< calcRepo (gitAnnexLink relf k)
        return $ TreeItem treepath (fromTreeItemType TreeSymlink) linksha

    lf = fromImportLocation loc
    treepath = asTopFilePath lf
    topf = asTopFilePath $ case msubdir of
        Nothing -> lf
        Just sd -> getTopFilePath sd P.</> lf
|
2023-05-31 16:31:14 +00:00
|
|
|
|
|
|
|
{- Builds a history of git trees using ContentIdentifiers.
 -
 - These are not the final trees that are generated by the import, which
 - use Keys. The purpose of these trees is to allow quickly determining
 - which files in the import have changed, and which are unchanged, to
 - avoid needing to look up the Keys for unchanged ContentIdentifiers.
 - When the import has a large number of files, that can be slow.
 -
 - Also returns a map from the sha used for each tree item to the
 - ContentIdentifier and size it was generated from.
 -}
buildContentIdentifierTree
    :: ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)
    -> Annex (History Sha, M.Map Sha (ContentIdentifier, ByteSize))
buildContentIdentifierTree importable = do
    cidmapvar <- liftIO (newTVarIO M.empty)
    trees <- buildImportTreesGeneric
        (convertContentIdentifierTree cidmapvar)
        emptyTree
        Nothing
        importable
    cidmap <- liftIO (atomically (readTVar cidmapvar))
    return (trees, cidmap)
|
2023-05-31 16:31:14 +00:00
|
|
|
|
|
|
|
{- For speed, and to avoid bloating the repository, the ContentIdentifiers
 - are not actually checked into git, instead a sha1 hash is calculated
 - internally.
 -
 - Each calculated sha1 is added to the map in the TVar, along with the
 - ContentIdentifier and size it stands for. The subdir is not used.
 -}
convertContentIdentifierTree
    :: TVar (M.Map Sha (ContentIdentifier, ByteSize))
    -> Maybe TopFilePath
    -> [(ImportLocation, (ContentIdentifier, ByteSize))]
    -> Annex Tree
convertContentIdentifierTree mv _ ls = do
    liftIO $ atomically $ modifyTVar' mv $
        M.union (M.fromList (map snd converted))
    return (treeItemsToTree (map fst converted))
  where
    converted = map mktreeitem ls

    mode = fromTreeItemType TreeFile

    mktreeitem (loc, v@(ContentIdentifier cid, _sz)) =
        (TreeItem p mode sha1, (sha1, v))
      where
        p = asTopFilePath (fromImportLocation loc)
        -- Note that this hardcodes sha1, even if git has started
        -- defaulting to some other checksum method. That should be
        -- ok, hopefully. This checksum never needs to be verified
        -- by git, which is why this does not bother to prefix the
        -- cid with its length, like git would.
        sha1 = Ref $ BA.convertToBase BA.Base16 $ sha1s cid
|
|
|
|
|
|
|
|
{- Builds a history of git trees from importable contents, using the
 - provided function to convert each list of imported items to a Tree.
 -
 - Chunked contents are converted and recorded one chunk at a time, with
 - each chunk becoming a subtree of the imported tree.
 -}
buildImportTreesGeneric
    :: (Maybe TopFilePath -> [(ImportLocation, v)] -> Annex Tree)
    -> Ref
    -> Maybe TopFilePath
    -> ImportableContentsChunkable Annex v
    -> Annex (History Sha)
buildImportTreesGeneric converttree basetree msubdir importable = do
    repo <- Annex.gitRepo
    withMkTreeHandle repo $ \hdl -> case importable of
        ImportableContentsComplete ic ->
            buildImportTreesGeneric' converttree basetree msubdir ic hdl
        ImportableContentsChunked {} -> do
            toptree <- chunkedtree hdl
            parents <- buildImportTreesHistory converttree basetree msubdir
                (importableHistoryComplete importable) hdl
            return (History toptree parents)
  where
    -- Records the tree containing the subtrees of all the chunks,
    -- and grafts it into the basetree.
    chunkedtree hdl = do
        subtrees <- recordchunks hdl [] (importableContentsChunk importable)
        importtree <- liftIO $ recordTree' hdl (Tree subtrees)
        graftImportTree basetree msubdir importtree hdl

    -- Walks the chunks, accumulating the recorded subtree of each.
    recordchunks hdl recorded chunk = do
        let subdir = importChunkSubDir (importableContentsSubDir chunk)
        -- Full directory prefix where the sub tree is located.
        let fullprefix = asTopFilePath $
                maybe subdir (\d -> getTopFilePath d Posix.</> subdir) msubdir
        Tree ts <- converttree (Just fullprefix)
            [ (mkImportLocation p, i)
            | (p, i) <- importableContentsSubTree chunk
            ]
        -- Record this subtree before getting next chunk, this
        -- avoids buffering all the chunks into memory.
        tc <- liftIO $ recordSubTree hdl $
            NewSubTree (asTopFilePath subdir) ts
        let recorded' = tc : recorded
        importableContentsNextChunk chunk >>= \case
            Nothing -> return recorded'
            Just next -> recordchunks hdl recorded' next
|
|
|
|
|
2023-05-31 16:31:14 +00:00
|
|
|
{- Builds the tree for the importable contents, followed by the trees
 - for its history, which become the parents in the resulting History. -}
buildImportTreesGeneric'
    :: (Maybe TopFilePath -> [(ImportLocation, v)] -> Annex Tree)
    -> Ref
    -> Maybe TopFilePath
    -> ImportableContents v
    -> MkTreeHandle
    -> Annex (History Sha)
buildImportTreesGeneric' converttree basetree msubdir importable hdl = do
    tree <- buildImportTree converttree basetree msubdir
        (importableContents importable) hdl
    parents <- buildImportTreesHistory converttree basetree msubdir
        (importableHistory importable) hdl
    return (History tree parents)
|
2021-10-06 21:05:32 +00:00
|
|
|
|
|
|
|
{- Converts the imported items to a Tree, records it, and grafts the
 - recorded tree into the basetree when a subdir is provided. -}
buildImportTree
    :: (Maybe TopFilePath -> [(ImportLocation, v)] -> Annex Tree)
    -> Ref
    -> Maybe TopFilePath
    -> [(ImportLocation, v)]
    -> MkTreeHandle
    -> Annex Sha
buildImportTree converttree basetree msubdir ls hdl = do
    converted <- converttree msubdir ls
    importtree <- liftIO $ recordTree' hdl converted
    graftImportTree basetree msubdir importtree hdl
|
|
|
|
|
|
|
|
{- When a subdir is provided, grafts the tree into the basetree at that
 - location. Otherwise, the tree is returned unchanged. -}
graftImportTree
    :: Ref
    -> Maybe TopFilePath
    -> Sha
    -> MkTreeHandle
    -> Annex Sha
graftImportTree basetree msubdir tree hdl = maybe (return tree) graft msubdir
  where
    graft subdir = inRepo $ \repo ->
        graftTree' tree subdir basetree repo hdl
|
|
|
|
|
|
|
|
{- Builds the History of each item in the list, and collects the results
 - into a set. -}
buildImportTreesHistory
    :: (Maybe TopFilePath -> [(ImportLocation, v)] -> Annex Tree)
    -> Ref
    -> Maybe TopFilePath
    -> [ImportableContents v]
    -> MkTreeHandle
    -> Annex (S.Set (History Sha))
buildImportTreesHistory converttree basetree msubdir history hdl = do
    parents <- forM history $ \ic ->
        buildImportTreesGeneric' converttree basetree msubdir ic hdl
    return (S.fromList parents)
|
2021-10-06 21:05:32 +00:00
|
|
|
|
2020-09-28 19:29:08 +00:00
|
|
|
{- Keys can always be imported when content is being downloaded.
 - Otherwise, the remote's import actions need to provide importKey. -}
canImportKeys :: Remote -> Bool -> Bool
canImportKeys remote importcontent
    | importcontent = True
    | otherwise = isJust (Remote.importKey (Remote.importActions remote))
|
|
|
|
|
2023-06-01 17:46:16 +00:00
|
|
|
-- Result of an import. ImportUnfinished indicates that some file failed to
-- be imported. Running again should resume where it left off.
data ImportResult t
    -- The import completed, and this is what it produced.
    = ImportFinished t
    -- The import did not complete, so there is no result.
    | ImportUnfinished
|
|
|
|
|
2023-05-31 19:45:23 +00:00
|
|
|
-- An entry in a diff import: either the new value for a location that
-- was added or changed, or a marker that the location was removed.
data Diffed t
    = DiffChanged t
    | DiffRemoved
    deriving (Eq)
|
|
|
|
|
|
|
|
-- What was imported: either the full contents, or only the changes
-- that were made relative to the LastImportedTree.
data Imported
    = ImportedFull (ImportableContentsChunkable Annex (Either Sha Key))
    | ImportedDiff LastImportedTree (ImportableContents (Diffed (Either Sha Key)))
|
|
|
|
|
|
|
|
-- The git tree that getLastImportedTree found for a remote.
newtype LastImportedTree = LastImportedTree Sha
|
2023-05-31 19:45:23 +00:00
|
|
|
|
2023-06-01 17:46:16 +00:00
|
|
|
{- Diffs between the previous and current ContentIdentifier trees, and
 - runs importKeys on only the changed files.
 -
 - This will download the same content as if importKeys were run on all
 - files, but this speeds it up significantly when there are a lot of files
 - and only a few have changed. importKeys has to look up each
 - ContentIdentifier to see if a Key is known for it. This avoids doing
 - that lookup on files that have not changed.
 -
 - Diffing is not currently implemented when there is a History.
 -
 - Falls back to running importKeys on all files when there is a History,
 - when no previous ContentIdentifier tree is known for the remote, or
 - when the trees needed for the diff are not available.
 -
 - When the import finishes, the current ContentIdentifier tree is passed
 - to recordContentIdentifierTree, along with the uuid of the remote.
 - That is not done when the import is unfinished.
 -
 - The two Bool parameters are importcontent and thirdpartypopulated,
 - which are passed through to importKeys.
 -}
importChanges
    :: Remote
    -> ImportTreeConfig
    -> Bool
    -> Bool
    -> ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)
    -> Annex (ImportResult Imported)
importChanges remote importtreeconfig importcontent thirdpartypopulated importablecontents = do
    ((History currcidtree currhistory), cidtreemap) <- buildContentIdentifierTree importablecontents
    -- diffimport below does not handle history, so when there is
    -- history, do a full import.
    if not (S.null currhistory)
        then fullimport currcidtree
        else do
            getContentIdentifierTree (Remote.uuid remote) >>= \case
                Nothing -> fullimport currcidtree
                Just prevcidtree -> candiffimport prevcidtree >>= \case
                    Nothing -> fullimport currcidtree
                    Just lastimportedtree -> diffimport cidtreemap prevcidtree currcidtree lastimportedtree
  where
    remember = recordContentIdentifierTree (Remote.uuid remote)

    -- In order to use a diff, the previous ContentIdentifier tree must
    -- not have been garbage collected. Which can happen since there
    -- are no git refs to it.
    --
    -- Also, a tree must have been imported before, and that tree must
    -- also have not been garbage collected (which is less likely to
    -- happen due to the remote tracking branch).
    --
    -- Returns the last imported tree when a diff can be used.
    candiffimport prevcidtree =
        catObjectMetaData prevcidtree >>= \case
            Nothing -> return Nothing
            Just _ -> getLastImportedTree remote >>= \case
                Nothing -> return Nothing
                Just lastimported@(LastImportedTree t) ->
                    ifM (isJust <$> catObjectMetaData t)
                        ( return (Just lastimported)
                        , return Nothing
                        )

    -- Runs importKeys on all the importable contents.
    fullimport currcidtree =
        importKeys remote importtreeconfig importcontent thirdpartypopulated importablecontents >>= \case
            ImportUnfinished -> return ImportUnfinished
            ImportFinished r -> do
                remember currcidtree
                return $ ImportFinished $ ImportedFull r

    -- Runs importKeys on only the files that differ between the
    -- previous and current ContentIdentifier trees.
    diffimport cidtreemap prevcidtree currcidtree lastimportedtree = do
        (diff, cleanup) <- inRepo $ Git.DiffTree.diffTreeRecursive
            prevcidtree
            currcidtree
        let (removed, changed) = partition isremoval diff
        -- Changed items whose sha is not found in the cidtreemap
        -- are left out (by the mapMaybe below).
        let mkicchanged ti = do
                v <- M.lookup (Git.DiffTree.dstsha ti) cidtreemap
                return (mkloc ti, v)
        let ic = ImportableContentsComplete $ ImportableContents
                { importableContents = mapMaybe mkicchanged changed
                , importableHistory = []
                }
        importKeys remote importtreeconfig importcontent thirdpartypopulated ic >>= \case
            ImportUnfinished -> do
                void $ liftIO cleanup
                return ImportUnfinished
            ImportFinished (ImportableContentsComplete ic') ->
                -- When cleanup returns False, the import is treated as
                -- unfinished. NOTE(review): presumably that indicates
                -- the diff-tree failed, so the diff may be incomplete
                -- -- confirm against Git.DiffTree.
                liftIO cleanup >>= \case
                    False -> return ImportUnfinished
                    True -> do
                        remember currcidtree
                        return $ ImportFinished $
                            ImportedDiff lastimportedtree
                                (mkdiff ic' removed)
            -- importKeys is not passed ImportableContentsChunked
            -- above, so it cannot return it
            ImportFinished (ImportableContentsChunked {}) -> error "internal"

    -- A diff item is a removal when its destination sha is a null sha.
    isremoval ti = Git.DiffTree.dstsha ti `elem` nullShas

    mkloc = mkImportLocation . getTopFilePath . Git.DiffTree.file

    -- Combines the removed locations with the results of importing
    -- the changed locations.
    mkdiff ic removed = ImportableContents
        { importableContents = diffremoved ++ diffchanged
        , importableHistory = []
        }
      where
        diffchanged = map
            (\(loc, v) -> (loc, DiffChanged v))
            (importableContents ic)
        diffremoved = map
            (\ti -> (mkloc ti, DiffRemoved))
            removed
|
|
|
|
|
|
|
|
{- Gets the tree that was last imported from the remote
 - (or exported to it if an export happened after the last import).
 -
 - Returns Nothing when the export database has no current tree
 - for the remote.
 -
 - The database handle is closed even when reading from it throws an
 - exception.
 -}
getLastImportedTree :: Remote -> Annex (Maybe LastImportedTree)
getLastImportedTree remote = bracket
    (Export.openDb (Remote.uuid remote))
    Export.closeDb
    (\db -> fmap LastImportedTree <$> liftIO (Export.getExportTreeCurrent db))
|
2023-05-31 19:45:23 +00:00
|
|
|
|
2020-07-03 17:41:57 +00:00
|
|
|
{- Downloads all new ContentIdentifiers, or when importcontent is False,
|
|
|
|
- generates Keys without downloading.
|
2019-02-26 19:25:28 +00:00
|
|
|
-
|
2020-07-03 17:41:57 +00:00
|
|
|
- Generates either a Key or a git Sha, depending on annex.largefiles.
|
|
|
|
- But when importcontent is False, it cannot match on annex.largefiles
|
|
|
|
- (or generate a git Sha), so always generates Keys.
|
|
|
|
-
|
|
|
|
- Supports concurrency when enabled.
|
|
|
|
-
|
|
|
|
- Note that, when a ContentIdentifier has been imported before,
|
|
|
|
- generates the same thing that was imported before, so annex.largefiles
|
|
|
|
- is not reapplied.
|
2019-02-26 19:25:28 +00:00
|
|
|
-}
|
2020-07-03 17:41:57 +00:00
|
|
|
importKeys
|
|
|
|
:: Remote
|
|
|
|
-> ImportTreeConfig
|
|
|
|
-> Bool
|
add thirdPartyPopulated interface
This is to support, eg a borg repo as a special remote, which is
populated not by running git-annex commands, but by using borg. Then
git-annex sync lists the content of the remote, learns which files are
annex objects, and treats those as present in the remote.
So, most of the import machinery is reused, to a new purpose. While
normally importtree maintains a remote tracking branch, this does not,
because the files stored in the remote are annex object files, not
user-visible filenames. But, internally, a git tree is still generated,
of the files on the remote that are annex objects. This tree is used
by retrieveExportWithContentIdentifier, etc. As with other import/export
remotes, the tree is recorded in the export log, and gets grafted
into the git-annex branch.
importKey changed to be able to return Nothing, to indicate when an
ImportLocation is not an annex object and so should be skipped from
being included in the tree.
It did not seem to make sense to have git-annex import do this, since
from the user's perspective, it's not like other imports. So only
git-annex sync does it.
Note that, git-annex sync does not yet download objects from such
remotes that are preferred content. importKeys is run with
content downloading disabled, to avoid getting the content of all
objects. Perhaps what's needed is for seekSyncContent to be run with these
remotes, but I don't know if it will just work (in particular, it needs
to avoid trying to transfer objects to them), so I skipped that for now.
(Untested and unused as of yet.)
This commit was sponsored by Jochen Bartl on Patreon.
2020-12-18 18:52:57 +00:00
|
|
|
-> Bool
|
2021-10-06 21:05:32 +00:00
|
|
|
-> ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)
|
2023-05-31 19:45:23 +00:00
|
|
|
-> Annex (ImportResult (ImportableContentsChunkable Annex (Either Sha Key)))
|
2020-12-21 20:03:27 +00:00
|
|
|
importKeys remote importtreeconfig importcontent thirdpartypopulated importablecontents = do
|
2020-09-28 19:29:08 +00:00
|
|
|
unless (canImportKeys remote importcontent) $
|
2020-07-03 17:41:57 +00:00
|
|
|
giveup "This remote does not support importing without downloading content."
|
2019-02-26 19:25:28 +00:00
|
|
|
-- This map is used to remember content identifiers that
|
2020-07-03 17:41:57 +00:00
|
|
|
-- were just imported, before they have necessarily been
|
2019-02-26 19:25:28 +00:00
|
|
|
-- stored in the database. This way, if the same content
|
|
|
|
-- identifier appears multiple times in the
|
|
|
|
-- importablecontents (eg when it has a history),
|
2020-07-03 17:41:57 +00:00
|
|
|
-- they will only be imported once.
|
2019-02-26 19:25:28 +00:00
|
|
|
cidmap <- liftIO $ newTVarIO M.empty
|
2019-03-08 16:33:44 +00:00
|
|
|
-- When concurrency is enabled, this set is needed to
|
2020-07-03 17:41:57 +00:00
|
|
|
-- avoid two threads both importing the same content identifier.
|
|
|
|
importing <- liftIO $ newTVarIO S.empty
|
2021-10-06 21:05:32 +00:00
|
|
|
withciddb $ \db -> do
|
2023-06-02 17:30:30 +00:00
|
|
|
db' <- CIDDb.needsUpdateFromLog db
|
|
|
|
>>= maybe (pure db) (CIDDb.updateFromLog db)
|
|
|
|
(prepclock (run cidmap importing db'))
|
2019-02-26 19:25:28 +00:00
|
|
|
where
|
2020-12-23 19:21:33 +00:00
|
|
|
-- When not importing content, reuse the same vector
|
|
|
|
-- clock for all state that's recorded. This can save
|
|
|
|
-- a little bit of disk space. Individual file downloads
|
|
|
|
-- while downloading take too long for this optimisation
|
|
|
|
-- to be safe to do.
|
2021-10-06 21:05:32 +00:00
|
|
|
prepclock a
|
2020-12-23 19:21:33 +00:00
|
|
|
| importcontent = a
|
|
|
|
| otherwise = reuseVectorClockWhile a
|
|
|
|
|
2022-08-11 20:57:44 +00:00
|
|
|
withciddb a = do
|
|
|
|
cidlck <- calcRepo' gitAnnexContentIdentifierLock
|
|
|
|
withExclusiveLock cidlck $
|
|
|
|
bracket CIDDb.openDb CIDDb.closeDb a
|
2021-10-06 21:05:32 +00:00
|
|
|
|
|
|
|
run cidmap importing db = do
|
2020-06-23 20:07:18 +00:00
|
|
|
largematcher <- largeFilesMatcher
|
2021-10-06 21:05:32 +00:00
|
|
|
case importablecontents of
|
|
|
|
ImportableContentsComplete ic ->
|
|
|
|
go False largematcher cidmap importing db ic >>= return . \case
|
2023-05-31 19:45:23 +00:00
|
|
|
Nothing -> ImportUnfinished
|
|
|
|
Just v -> ImportFinished $ ImportableContentsComplete v
|
2021-10-06 21:05:32 +00:00
|
|
|
ImportableContentsChunked {} -> do
|
|
|
|
c <- gochunked db (importableContentsChunk importablecontents)
|
|
|
|
gohistory largematcher cidmap importing db (importableHistoryComplete importablecontents) >>= return . \case
|
2023-05-31 19:45:23 +00:00
|
|
|
Nothing -> ImportUnfinished
|
|
|
|
Just h -> ImportFinished $ ImportableContentsChunked
|
2021-10-06 21:05:32 +00:00
|
|
|
{ importableContentsChunk = c
|
|
|
|
, importableHistoryComplete = h
|
|
|
|
}
|
|
|
|
|
|
|
|
go oldversion largematcher cidmap importing db (ImportableContents l h) = do
|
2019-03-08 16:33:44 +00:00
|
|
|
jobs <- forM l $ \i ->
|
2020-12-21 20:03:27 +00:00
|
|
|
if thirdpartypopulated
|
2021-10-06 21:05:32 +00:00
|
|
|
then Left <$> thirdpartypopulatedimport db i
|
2020-12-21 20:03:27 +00:00
|
|
|
else startimport cidmap importing db i oldversion largematcher
|
2019-03-08 16:33:44 +00:00
|
|
|
l' <- liftIO $ forM jobs $
|
|
|
|
either pure (atomically . takeTMVar)
|
2019-02-26 19:25:28 +00:00
|
|
|
if any isNothing l'
|
|
|
|
then return Nothing
|
2021-10-06 21:05:32 +00:00
|
|
|
else gohistory largematcher cidmap importing db h >>= return . \case
|
|
|
|
Nothing -> Nothing
|
|
|
|
Just h' -> Just $ ImportableContents (catMaybes l') h'
|
|
|
|
|
|
|
|
gohistory largematcher cidmap importing db h = do
|
|
|
|
h' <- mapM (go True largematcher cidmap importing db) h
|
|
|
|
if any isNothing h'
|
|
|
|
then return Nothing
|
|
|
|
else return $ Just $ catMaybes h'
|
2019-02-26 19:25:28 +00:00
|
|
|
|
2021-10-06 21:05:32 +00:00
|
|
|
gochunked db c
|
|
|
|
-- Downloading cannot be done when chunked, since only
|
|
|
|
-- the first chunk is processed before returning.
|
2023-04-10 17:38:14 +00:00
|
|
|
| importcontent = giveup "importKeys does not support downloading chunked import"
|
2021-10-06 21:05:32 +00:00
|
|
|
-- Chunked import is currently only used by thirdpartypopulated
|
|
|
|
-- remotes.
|
2023-04-10 17:38:14 +00:00
|
|
|
| not thirdpartypopulated = giveup "importKeys does not support chunked import when not thirdpartypopulated"
|
2021-10-06 21:05:32 +00:00
|
|
|
| otherwise = do
|
|
|
|
l <- forM (importableContentsSubTree c) $ \(loc, i) -> do
|
|
|
|
let loc' = importableContentsChunkFullLocation (importableContentsSubDir c) loc
|
|
|
|
thirdpartypopulatedimport db (loc', i) >>= return . \case
|
|
|
|
Just (_loc, k) -> Just (loc, k)
|
|
|
|
Nothing -> Nothing
|
|
|
|
return $ ImportableContentsChunk
|
|
|
|
{ importableContentsSubDir = importableContentsSubDir c
|
|
|
|
, importableContentsSubTree = catMaybes l
|
|
|
|
, importableContentsNextChunk =
|
|
|
|
importableContentsNextChunk c >>= \case
|
|
|
|
Nothing -> return Nothing
|
|
|
|
Just c' -> withciddb $ \db' ->
|
|
|
|
prepclock $
|
|
|
|
Just <$> gochunked db' c'
|
|
|
|
}
|
|
|
|
|
2020-07-03 17:41:57 +00:00
|
|
|
waitstart importing cid = liftIO $ atomically $ do
|
|
|
|
s <- readTVar importing
|
2019-03-08 16:33:44 +00:00
|
|
|
if S.member cid s
|
|
|
|
then retry
|
2020-07-03 17:41:57 +00:00
|
|
|
else writeTVar importing $ S.insert cid s
|
2019-03-08 16:33:44 +00:00
|
|
|
|
2020-07-03 17:41:57 +00:00
|
|
|
signaldone importing cid = liftIO $ atomically $ do
|
|
|
|
s <- readTVar importing
|
|
|
|
writeTVar importing $ S.delete cid s
|
2019-03-08 16:33:44 +00:00
|
|
|
|
2020-07-03 17:41:57 +00:00
|
|
|
-- Starts the import of one item listed on the remote.
--
-- When getcidkey already yields keys for the item's content identifier,
-- nothing is run, and the result is Left, holding the location paired
-- with either a git sha or a key.
--
-- Otherwise an import action is handed to commandAction, and the result
-- is Right, holding a TMVar. The action fills the TMVar when it is done:
-- with the result of importordownload, or with Nothing when that threw
-- an exception (which is displayed as a warning).
startimport cidmap importing db i@(loc, (cid, _sz)) oldversion largematcher =
	getcidkey cidmap db cid >>= \case
		[] -> do
			job <- liftIO newEmptyTMVarIO
			let ai = ActionItemOther (Just (QuotedPath (fromImportLocation loc)))
			let si = SeekInput []
			-- Publishes the outcome in the job's TMVar, and
			-- reports whether the import succeeded.
			let finish outcome ok = do
				liftIO $ atomically $ putTMVar job outcome
				return ok
			let failed e = next $ do
				warning (UnquotedString (show e))
				finish Nothing False
			let importaction = starting ("import " ++ Remote.name remote) ai si $ do
				when oldversion $
					showNote "old version"
				tryNonAsync (importordownload cidmap i largematcher)
					>>= either failed (\r -> next (finish r True))
			-- waitstart blocks while the same content identifier
			-- is in the importing set; signaldone takes it back
			-- out of the set once the action is done.
			commandAction $ bracket_
				(waitstart importing cid)
				(signaldone importing cid)
				importaction
			return (Right job)
		ks@(firstkey:_) -> return $ Left $ Just $
			-- The same content may have been imported before
			-- under several different keys. There is no clear
			-- best choice among them, so the first one is used.
			-- Except, a git sha always wins, because then the
			-- content must be included in the git repo.
			case mapMaybe keyGitSha ks of
				(sha:_) -> (loc, Left sha)
				[] -> (loc, Right firstkey)
|
|
|
|
|
2021-10-06 21:05:32 +00:00
|
|
|
-- Asks the remote's importKey action (when it has one) for the key of
-- the file at loc, without displaying any progress.
--
-- When a key is returned, it is recorded for the content identifier with
-- recordcidkeyindb, logChange is run with InfoPresent for the remote's
-- UUID, and the result is the location paired with the key.
--
-- The result is Nothing when the remote has no importKey, when importKey
-- returns no key, or when it throws an exception (which is displayed as
-- a warning).
thirdpartypopulatedimport db (loc, (cid, sz)) =
	maybe (return Nothing) viaimportkey (Remote.importKey ia)
  where
	viaimportkey importkey =
		tryNonAsync (importkey loc cid sz nullMeterUpdate)
			>>= either failed (maybe (return Nothing) imported)

	imported k = do
		recordcidkeyindb db cid k
		logChange NoLiveUpdate k (Remote.uuid remote) InfoPresent
		return $ Just (loc, Right k)

	failed e = do
		warning (UnquotedString (show e))
		return Nothing
|
2020-12-21 20:03:27 +00:00
|
|
|
|
2023-07-26 18:34:21 +00:00
|
|
|
-- Picks between doimport and dodownload for one item, and runs the
-- chosen action on it.
--
-- The work tree file for the location is looked up with locworktreefile,
-- and largematcher is run on it to get the matcher that both actions
-- are passed.
importordownload cidmap item@(loc, (_cid, _sz)) largematcher = do
	f <- locworktreefile loc
	matcher <- largematcher f
	let needscontent = Utility.Matcher.introspect matchNeedsFileContent (fst matcher)
	-- Importing a key is always preferred over downloading and then
	-- generating a key, whenever it is supported, so that trees
	-- with different keys for the same content are not generated.
	-- Downloading is only used, when content is being imported, if
	-- the remote has no importKey, or the matcher needs to look at
	-- the content of the file.
	let act
		| not importcontent = doimport
		| Nothing <- Remote.importKey ia = dodownload
		| needscontent = dodownload
		| otherwise = doimport
	act cidmap item f matcher
|
2020-07-03 17:41:57 +00:00
|
|
|
|
avoid import writing to cidsdb initially
Speed up importing trees from special remotes somewhat by avoiding
redundant writes to sqlite database.
Before, import would write to both the git-annex branch and also to the
sqlite database. But then the next time it was run, needsUpdateFromLog
would see the branch had changed, so run updateFromLog, which would make
the same writes to the sqlite database a second time.
Now import writes only to the git-annex branch. The next time it's run,
needsUpdateFromLog sees that the branch has changed and so calls
updateFromLog, which updates the sqlite database.
Why defer the write to the sqlite database like this? It seems that it
could write to the database as it goes, and at the end call
recordAnnexBranchTree to indicate that the information in the git-annex
branch has all been written to the cidsdb. That would avoid the second
import doing extra work.
But, there could be other processes running at the same time, and one of
them may update the git-annex branch, eg merging a remote git-annex branch
into it. Any cids logs on that merged git-annex branch would not be
reflected in the cidsdb yet. If the import then called
recordAnnexBranchTree, the cidsdb would never get updated with that merged
information.
I don't think there's a good way to prevent, or to detect that situation.
So, it can't call recordAnnexBranchTree at the end. So it might as well
wait until the next run and do updateFromLog then. It could instead do
updateFromLog at the end, but it's going to check needsUpdateFromLog
at the beginning anyway.
Note that the database writes were queued, so there is already a cidmap
that is used to remember changes that the current process has made.
So, omitting database writes can't change the behavior of the current
process.
Also note that thirdpartypopulatedimport uses recordcidkeyindb, which
reflects what it already did. That code path does not use the cidmap,
but does not need to query it either. It might be possible to make that
code path also only update the git-annex branch and not the db, but I
haven't checked.
Sponsored-by: Noam Kremen on Patreon
2023-05-30 21:05:28 +00:00
|
|
|
-- Imports an item by way of the remote's importKey action.
--
-- Gives up when the matcher needs to examine file content. Otherwise,
-- the matcher is checked against what is known about the file (its path
-- and size), and depending on the result, doimportlarge or doimportsmall
-- is run inside metered.
doimport cidmap (loc, (cid, sz)) f matcher = case Remote.importKey ia of
	Just importkey -> do
		when needscontent $
			giveup "annex.largefiles configuration examines file contents, so cannot import without content."
		islargefile <- checkMatcher' matcher matchinfo NoLiveUpdate mempty
		let importer
			| islargefile = doimportlarge importkey cidmap loc cid sz f
			| otherwise = doimportsmall cidmap loc cid sz
		metered Nothing sz bwlimit (const importer)
	-- Callers are expected to have checked earlier that the remote
	-- supports importKey.
	Nothing -> error "internal"
  where
	needscontent = Utility.Matcher.introspect matchNeedsFileContent (fst matcher)

	-- Only the path and the size are provided to the matcher.
	matchinfo = MatchingInfo ProvidedInfo
		{ providedFilePath = Just f
		, providedKey = Nothing
		, providedFileSize = Just sz
		, providedMimeType = Nothing
		, providedMimeEncoding = Nothing
		, providedLinkType = Nothing
		}
|
2020-09-28 17:22:16 +00:00
|
|
|
|
avoid import writing to cidsdb initially
Speed up importing trees from special remotes somewhat by avoiding
redundant writes to sqlite database.
Before, import would write to both the git-annex branch and also to the
sqlite database. But then the next time it was run, needsUpdateFromLog
would see the branch had changed, so run updateFromLog, which would make
the same writes to the sqlite database a second time.
Now import writes only to the git-annex branch. The next time it's run,
needsUpdateFromLog sees that the branch has changed and so calls
updateFromLog, which updates the sqlite database.
Why defer the write to the sqlite database like this? It seems that it
could write to the database as it goes, and at the end call
recordAnnexBranchTree to indicate that the information in the git-annex
branch has all been written to the cidsdb. That would avoid the second
import doing extra work.
But, there could be other processes running at the same time, and one of
them may update the git-annex branch, eg merging a remote git-annex branch
into it. Any cids logs on that merged git-annex branch would not be
reflected in the cidsdb yet. If the import then called
recordAnnexBranchTree, the cidsdb would never get updated with that merged
information.
I don't think there's a good way to prevent, or to detect that situation.
So, it can't call recordAnnexBranchTree at the end. So it might as well
wait until the next run and do updateFromLog then. It could instead do
updateFromLog at the end, but it's going to check needsUpdateFromLog
at the beginning anyway.
Note that the database writes were queued, so there is already a cidmap
that is used to remember changes that the current process has made.
So, omitting database writes can't change the behavior of the current
process.
Also note that thirdpartypopulatedimport uses recordcidkeyindb, which
reflects what it already did. That code path does not use the cidmap,
but does not need to query it either. It might be possible to make that
code path also only update the git-annex branch and not the db, but I
haven't checked.
Sponsored-by: Noam Kremen on Patreon
2023-05-30 21:05:28 +00:00
|
|
|
-- Imports an item that the matcher treats as a large file, using
-- importkey to get its key.
--
-- The result is the location paired with the key when the import
-- succeeded. It is Nothing when importkey returned no key, when the
-- content was wanted but did not get moved into the annex, or when an
-- exception was thrown (which is displayed as a warning).
doimportlarge importkey cidmap loc cid sz f p = do
	outcome <- tryNonAsync importer
	case outcome of
		Left e -> do
			warning (UnquotedString (show e))
			return Nothing
		Right (Just (k, True)) -> return $ Just (loc, Right k)
		Right _ -> return Nothing
  where
	af = AssociatedFile (Just f)

	-- When the content is going to be downloaded later, progress is
	-- not displayed while the key is generated, since the download
	-- is generally the more expensive operation.
	keymeter
		| importcontent = nullMeterUpdate
		| otherwise = p

	importer = importkey loc cid sz keymeter >>= \case
		Nothing -> return Nothing
		Just k -> do
			-- Gives up when checkSecureHashes has a
			-- complaint about the key.
			checkSecureHashes k >>= \case
				Just msg -> giveup (msg ++ " to import")
				Nothing -> return ()
			recordcidkey cidmap cid k
			logChange NoLiveUpdate k (Remote.uuid remote) InfoPresent
			if importcontent
				then getcontent k
				else return (Just (k, True))

	-- Retrieves the content of the key from the remote to a temp
	-- file, and moves it into the annex. The Bool in the result is
	-- what moveAnnex returned.
	getcontent :: Key -> Annex (Maybe (Key, Bool))
	getcontent k =
		checkDiskSpaceToGet k Nothing Nothing $
			notifyTransfer Download af $
				download' (Remote.uuid remote) k af Nothing stdRetry $ \transferp ->
					withTmp k $ \tmpfile -> do
						_ <- Remote.retrieveExportWithContentIdentifier
							ia loc [cid] (fromRawFilePath tmpfile)
							(Left k)
							(combineMeterUpdate transferp p)
						ok <- moveAnnex k af tmpfile
						when ok $
							logStatus NoLiveUpdate k InfoPresent
						return (Just (k, ok))
|
2020-09-28 17:22:16 +00:00
|
|
|
|
|
|
|
-- The file is small, so is added to git, so while importing
-- without content does not retrieve annexed files, it does
-- need to retrieve this file.
--
-- The content at loc is retrieved into a temporary file, passing cid as
-- the only allowed content identifier. The key is generated by mkkey
-- (gitShaKey applied to the result of hashFile on the temporary file).
--
-- On success, the cid to key mapping is recorded with recordcidkey, and
-- the result is Just (loc, Left sha). When tryNonAsync returns an
-- exception, it is displayed as a warning and the result is Nothing.
doimportsmall cidmap loc cid sz p =
    checkDiskSpaceToGet tmpkey Nothing Nothing $
        withTmp tmpkey $ \tmpfile ->
            tryNonAsync (downloader tmpfile) >>= \case
                Right sha -> return $ Just (loc, Left sha)
                Left e -> do
                    warning (UnquotedString (show e))
                    return Nothing
  where
    -- Key built from the content identifier and size; used for the
    -- disk space check and for the temporary file.
    tmpkey = importKey cid sz

    mkkey tmpfile = gitShaKey <$> hashFile tmpfile

    downloader tmpfile = do
        (k, _) <- Remote.retrieveExportWithContentIdentifier
            ia loc [cid] (fromRawFilePath tmpfile)
            (Right (mkkey tmpfile))
            p
        case keyGitSha k of
            Just sha -> do
                recordcidkey cidmap cid k
                return sha
            -- mkkey only ever produces git sha keys, so this is not
            -- expected to happen. The message is descriptive because
            -- it is what the warning above would display.
            Nothing -> error "internal error: doimportsmall: retrieved key is not a git sha key"
|
2020-07-03 17:41:57 +00:00
|
|
|
|
avoid import writing to cidsdb initially
Speed up importing trees from special remotes somewhat by avoiding
redundant writes to sqlite database.
Before, import would write to both the git-annex branch and also to the
sqlite database. But then the next time it was run, needsUpdateFromLog
would see the branch had changed, so run updateFromLog, which would make
the same writes to the sqlite database a second time.
Now import writes only to the git-annex branch. The next time it's run,
needsUpdateFromLog sees that the branch has changed and so calls
updateFromLog, which updates the sqlite database.
Why defer the write to the sqlite database like this? It seems that it
could write to the database as it goes, and at the end call
recordAnnexBranchTree to indicate that the information in the git-annex
branch has all been written to the cidsdb. That would avoid the second
import doing extra work.
But, there could be other processes running at the same time, and one of
them may update the git-annex branch, eg merging a remote git-annex branch
into it. Any cids logs on that merged git-annex branch would not be
reflected in the cidsdb yet. If the import then called
recordAnnexBranchTree, the cidsdb would never get updated with that merged
information.
I don't think there's a good way to prevent, or to detect that situation.
So, it can't call recordAnnexBranchTree at the end. So it might as well
wait until the next run and do updateFromLog then. It could instead do
updateFromLog at the end, but it's going to check needsUpdateFromLog
at the beginning anyway.
Note that the database writes were queued, so there is already a cidmap
that is used to remember changes that the current process has made.
So, omitting database writes can't change the behavior of the current
process.
Also note that thirdpartypopulatedimport uses recordcidkeyindb, which
reflects what it already did. That code path does not use the cidmap,
but does not need to query it either. It might be possible to make that
code path also only update the git-annex branch and not the db, but I
haven't checked.
Sponsored-by: Noam Kremen on Patreon
2023-05-30 21:05:28 +00:00
|
|
|
-- Retrieves the content at loc from the remote into a temporary file,
-- passing cid as the only allowed content identifier, and generates a
-- key for it with mkkey.
--
-- Results:
--   * Just (loc, Right k) when the key is not a git sha key and
--     moveAnnex succeeded; the cid is then recorded, and the key is
--     logged as present both locally and in the remote.
--   * Just (loc, Left sha) when the key is a git sha key; the cid is
--     recorded.
--   * Nothing when moveAnnex failed, or when tryNonAsync returned an
--     exception (which is displayed as a warning).
dodownload cidmap (loc, (cid, sz)) f matcher =
    checkDiskSpaceToGet tmpkey Nothing Nothing $
        notifyTransfer Download af $
            download' (Remote.uuid remote) tmpkey af Nothing stdRetry $ \progress ->
                withTmp tmpkey $ \tmpfile ->
                    metered (Just progress) tmpkey bwlimit $ \_ ->
                        rundownload tmpfile
  where
    af = AssociatedFile (Just f)

    -- Key built from the content identifier and size; used while the
    -- real key is not yet known.
    tmpkey = importKey cid sz

    rundownload tmpfile p = do
        result <- tryNonAsync (downloader tmpfile p)
        case result of
            Left e -> do
                warning (UnquotedString (show e))
                return Nothing
            Right (_, False) -> return Nothing
            Right (v, True) -> return $ Just (loc, v)

    downloader tmpfile p = do
        (k, _) <- Remote.retrieveExportWithContentIdentifier
            ia loc [cid] (fromRawFilePath tmpfile)
            (Right (mkkey tmpfile))
            p
        case keyGitSha k of
            Just sha -> do
                recordcidkey cidmap cid k
                return (Left sha, True)
            Nothing -> do
                ok <- moveAnnex k af tmpfile
                when ok $ do
                    recordcidkey cidmap cid k
                    logStatus NoLiveUpdate k InfoPresent
                    logChange NoLiveUpdate k (Remote.uuid remote) InfoPresent
                return (Right k, ok)

    -- The matcher decides how the key is generated: when it matches,
    -- genKey is used with the backend chosen for f; otherwise the key
    -- is a git sha key made from hashing the temporary file.
    mkkey tmpfile = do
        islargefile <- checkMatcher' matcher matchinfo NoLiveUpdate mempty
        if islargefile
            then do
                backend <- chooseBackend f
                fst <$> genKey keysource nullMeterUpdate backend
            else gitShaKey <$> hashFile tmpfile
      where
        matchinfo = MatchingFile FileInfo
            { matchFile = f
            , contentFile = tmpfile
            , matchKey = Nothing
            }
        keysource = KeySource
            { keyFilename = f
            , contentLocation = tmpfile
            , inodeCache = Nothing
            }
|
2019-02-26 19:25:28 +00:00
|
|
|
|
2020-09-28 17:22:16 +00:00
|
|
|
-- The import actions of the remote; passed as the first argument to
-- Remote.retrieveExportWithContentIdentifier by the download helpers.
ia = Remote.importActions remote
|
2024-01-19 19:14:26 +00:00
|
|
|
|
|
|
|
-- Bandwidth limit handed to metered when downloading: the remote's
-- download-specific setting is tried first, with the remote's general
-- bandwidth limit setting as the alternative.
bwlimit = remoteAnnexBwLimitDownload gc <|> remoteAnnexBwLimit gc
  where
    gc = Remote.gitconfig remote
|
2019-02-27 17:15:02 +00:00
|
|
|
|
2020-09-04 17:49:57 +00:00
|
|
|
-- Converts an import location into a file path, by way of
-- asTopFilePath and fromTopFilePath. When importing into a subtree,
-- the location is placed underneath the subtree's directory.
locworktreefile loc = fromRepo (fromTopFilePath (asTopFilePath topfile))
  where
    topfile = case importtreeconfig of
        ImportSubTree subdir _ ->
            getTopFilePath subdir P.</> fromImportLocation loc
        ImportTree -> fromImportLocation loc
|
2019-02-26 19:25:28 +00:00
|
|
|
|
|
|
|
-- Looks up the keys known for a content identifier. The database is
-- consulted first (unless it is empty), and the in-memory cidmap is
-- used when the database has nothing recorded for the cid.
getcidkey cidmap db cid = liftIO fromdb
  where
    frommap = getcidkeymap cidmap cid
    fromdb
        -- Avoiding querying the database when it's empty speeds up
        -- the initial import.
        | CIDDb.databaseIsEmpty db = frommap
        | otherwise = do
            found <- CIDDb.getContentIdentifierKeys db rs cid
            if null found
                then frommap
                else return found
|
|
|
|
|
|
|
|
-- Looks up a content identifier in the in-memory map held in the TVar.
-- The result is a list with the key when there is an entry, and an
-- empty list otherwise.
getcidkeymap cidmap cid = atomically $ do
    m <- readTVar cidmap
    return (maybeToList (M.lookup cid m))
|
2019-02-26 19:25:28 +00:00
|
|
|
|
avoid import writing to cidsdb initially
Speed up importing trees from special remotes somewhat by avoiding
redundant writes to sqlite database.
Before, import would write to both the git-annex branch and also to the
sqlite database. But then the next time it was run, needsUpdateFromLog
would see the branch had changed, so run updateFromLog, which would make
the same writes to the sqlite database a second time.
Now import writes only to the git-annex branch. The next time it's run,
needsUpdateFromLog sees that the branch has changed and so calls
updateFromLog, which updates the sqlite database.
Why defer the write to the sqlite database like this? It seems that it
could write to the database as it goes, and at the end call
recordAnnexBranchTree to indicate that the information in the git-annex
branch has all been written to the cidsdb. That would avoid the second
import doing extra work.
But, there could be other processes running at the same time, and one of
them may update the git-annex branch, eg merging a remote git-annex branch
into it. Any cids logs on that merged git-annex branch would not be
reflected in the cidsdb yet. If the import then called
recordAnnexBranchTree, the cidsdb would never get updated with that merged
information.
I don't think there's a good way to prevent, or to detect that situation.
So, it can't call recordAnnexBranchTree at the end. So it might as well
wait until the next run and do updateFromLog then. It could instead do
updateFromLog at the end, but it's going to check needsUpdateFromLog
at the beginning anyway.
Note that the database writes were queued, so there is already a cidmap
that is used to remember changes that the current process has made.
So, omitting database writes can't change the behavior of the current
process.
Also note that thirdpartypopulatedimport uses recordcidkeyindb, which
reflects what it already did. That code path does not use the cidmap,
but does not need to query it either. It might be possible to make that
code path also only update the git-annex branch and not the db, but I
haven't checked.
Sponsored-by: Noam Kremen on Patreon
2023-05-30 21:05:28 +00:00
|
|
|
-- Remembers that a content identifier corresponds to a key, both in
-- the in-memory cidmap and in the log.
recordcidkey cidmap cid k = do
    liftIO (atomically (modifyTVar' cidmap (M.insert cid k)))
    -- Only record in log now; the database will be updated
    -- later from the log, and the cidmap will be used for now.
    recordcidkeyinlog cid k
|
|
|
|
|
|
|
|
-- Records that a content identifier corresponds to a key in the
-- database, and then also in the log. Unlike recordcidkey, this
-- writes to the database immediately and does not touch the cidmap.
recordcidkeyindb db cid k =
    liftIO (CIDDb.recordContentIdentifier db rs cid k)
        >> recordcidkeyinlog cid k
|
|
|
|
|
|
|
|
recordcidkeyinlog cid k =
|
add RemoteStateHandle
This solves the problem of sameas remotes trampling over per-remote
state. Used for:
* per-remote state, of course
* per-remote metadata, also of course
* per-remote content identifiers, because two remote implementations
could in theory generate the same content identifier for two different
pieces of content
While chunk logs are per-remote data, they don't use this, because the
number and size of chunks stored is a common property across sameas
remotes.
External special remote had a complication, where it was theoretically
possible for a remote to send SETSTATE or GETSTATE during INITREMOTE or
EXPORTSUPPORTED. Since the uuid of the remote is typically generated in
Remote.setup, it would only be possible to pass a Maybe
RemoteStateHandle into it, and it would otherwise have to construct its
own. Rather than go that route, I decided to send an ERROR in this case.
It seems unlikely that any existing external special remote will be
affected. They would have to make up a git-annex key, and set state for
some reason during INITREMOTE. I can imagine such a hack, but it doesn't
seem worth complicating the code in such an ugly way to support it.
Unfortunately, both TestRemote and Annex.Import needed the Remote
to have a new field added that holds its RemoteStateHandle.
2019-10-14 16:33:27 +00:00
|
|
|
CIDLog.recordContentIdentifier rs cid k
|
|
|
|
|
|
|
|
rs = Remote.remoteStateHandle remote
|
2019-02-27 17:15:02 +00:00
|
|
|
|
|
|
|
{- Builds the placeholder key that stands in for a ContentIdentifier
 - while its content is being downloaded, before the real key has been
 - generated.
 -
 - The key name is derived from the content identifier, the variety is
 - always "CID", and the size is the one passed in. -}
importKey :: ContentIdentifier -> Integer -> Key
importKey (ContentIdentifier cid) size = mkKey fillin
  where
    fillin keydata = keydata
        { keyName = genKeyName (decodeBS cid)
        , keyVariety = OtherKey "CID"
        , keySize = Just size
        }
|
2019-05-20 20:37:04 +00:00
|
|
|
|
|
|
|
{- Export omits non-preferred content from the tree stored on the
 - remote. So the import will normally have that content omitted
 - (unless something else added files with the same names to the
 - special remote).
 -
 - That presents a problem: Merging the imported tree would result
 - in deletion of the files that were excluded from export.
 - To avoid that happening, this adds them back to the imported tree.
 -
 - When nothing was excluded, the imported tree is returned unchanged.
 -}
addBackExportExcluded :: Remote -> Sha -> Annex Sha
addBackExportExcluded remote importtree = do
    excluded <- getExportExcluded (Remote.uuid remote)
    case excluded of
        [] -> return importtree
        _ -> inRepo $
            adjustTree keepall excluded preferimported [] importtree
  where
    -- Don't remove any item from the imported tree.
    keepall item = pure (Just item)
    -- If something was imported with the same name as a file that was
    -- previously excluded, use what was imported.
    preferimported imported _excluded = imported
|
2019-05-21 18:38:00 +00:00
|
|
|
|
|
|
|
{- Builds the matcher for the preferred content of the remote, for use
 - at import time.
 -
 - Keys are not known until an imported file is downloaded, which is too
 - late to bother excluding it from an import. So any tokens in the
 - preferred content expression that need keys are pruned (see
 - pruneImportMatcher).
 -
 - A remote with no entry in the loaded preferred content map gets a
 - matcher that matches everything. A Left loaded for the remote is
 - returned as-is.
 -}
makeImportMatcher :: Remote -> Annex (Either String (FileMatcher Annex))
makeImportMatcher r = do
    loaded <- preferredRequiredMapsLoad' pruneImportMatcher preferredContentTokens
    return $ case M.lookup (Remote.uuid r) (fst loaded) of
        Nothing -> Right (matchAll, matcherdesc)
        Just (Right v) -> Right (v, matcherdesc)
        Just (Left err) -> Left err
  where
    matcherdesc = MatcherDesc "preferred content"
|
2019-05-21 18:38:00 +00:00
|
|
|
|
importtree: support preferred content expressions needing keys
When importing from a special remote, support preferred content expressions
that use terms that match on keys (eg "present", "copies=1"). Such terms
are ignored when importing, since the key is not known yet.
When "standard" or "groupwanted" is used, the terms in those
expressions also get pruned accordingly.
This does allow setting preferred content to "not (copies=1)" to make a
special remote into a "source" type of repository. Importing from it will
import all files. Then exporting to it will drop all files from it.
In the case of setting preferred content to "present", it's pruned on
import, so everything gets imported from it. Then on export, it's applied,
and everything in it is left on it, and no new content is exported to it.
Since the old behavior on these preferred content expressions was for
importtree to error out, there's no backwards compatibility to worry about.
Except that sync/pull/etc will now import where before it errored out.
2023-12-18 20:27:26 +00:00
|
|
|
{- Prunes from a matcher the terms selected by matchNeedsKey, since the
 - key is not known yet when importing. -}
pruneImportMatcher :: Utility.Matcher.Matcher (MatchFiles a) -> Utility.Matcher.Matcher (MatchFiles a)
pruneImportMatcher matcher = Utility.Matcher.pruneMatcher matchNeedsKey matcher
|
|
|
|
|
2020-09-30 14:10:03 +00:00
|
|
|
{- Lists the ImportableContents of the remote, with unwanted items
 - filtered out.
 -
 - Any path that includes a ".git" component is dropped, because git does
 - not allow storing ".git" in a git repository. While it is possible to
 - write a git tree that contains that, git will complain and refuse to
 - check it out.
 -
 - New things that do not match the FileMatcher, or that are gitignored,
 - are dropped too. However, files that are already known get imported
 - regardless. (Similar to how git add behaves on gitignored files.)
 - This avoids creating a remote tracking branch that, when merged,
 - would delete the files.
 -
 - Throws exception if unable to contact the remote.
 - Returns Nothing when there is no change since last time.
 -}
getImportableContents :: Remote -> ImportTreeConfig -> CheckGitIgnore -> FileMatcher Annex -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
getImportableContents r importtreeconfig ci matcher = do
    listed <- Remote.listImportableContents (Remote.importActions r)
    case listed of
        Nothing -> return Nothing
        Just (ImportableContentsComplete ic) -> do
            dbhandle <- opendbhandle
            filtered <- filterunwanted dbhandle ic
            return $ Just $ ImportableContentsComplete filtered
        Just c@(ImportableContentsChunked {}) -> do
            dbhandle <- opendbhandle
            Just <$> filterunwantedchunked dbhandle c
  where
    -- Filters the current contents, and recursively each history entry.
    filterunwanted dbhandle ic = do
        contents <- filterM (wanted dbhandle) (importableContents ic)
        history <- mapM (filterunwanted dbhandle) (importableHistory ic)
        return $ ImportableContents contents history

    filterunwantedchunked dbhandle c = do
        chunk <- filterunwantedchunk dbhandle (importableContentsChunk c)
        history <- mapM (filterunwanted dbhandle) (importableHistoryComplete c)
        return $ ImportableContentsChunked chunk history

    filterunwantedchunk dbhandle c = do
        let subdir = importableContentsSubDir c
        subtree <- filterM (wantedunder dbhandle subdir)
            (importableContentsSubTree c)
        -- The next chunk is not run here; its action is wrapped so that
        -- whatever it yields gets filtered the same way.
        let nextchunk = importableContentsNextChunk c
                >>= mapM (filterunwantedchunk dbhandle)
        return $ ImportableContentsChunk subdir subtree nextchunk

    -- Opens the export database for the remote and updates its export
    -- tree from the log before handing it back.
    opendbhandle = do
        h <- Export.openDb (Remote.uuid r)
        _ <- Export.updateExportTreeFromLog h
        return h

    -- Any known file is wanted; otherwise it's down to what matches and
    -- is not ignored. Checking isknown first means that a known file
    -- needs neither the matcher nor the gitignore check to be run.
    wanted dbhandle (loc, (_cid, sz)) =
        if ingitdir
            then pure False
            else isknown <||> (matches <&&> notignored)
      where
        -- Checks, from least to most expensive.
        ingitdir = ".git" `elem` Posix.splitDirectories (fromImportLocation loc)
        matches = matchesImportLocation matcher loc sz
        isknown = isKnownImportLocation dbhandle loc
        notignored = notIgnoredImportLocation importtreeconfig ci loc

    -- Same as wanted, for a location given relative to a chunk's subdir.
    wantedunder dbhandle root (loc, v) =
        wanted dbhandle (importableContentsChunkFullLocation root loc, v)
|
2019-05-21 18:38:00 +00:00
|
|
|
|
2020-09-30 14:41:59 +00:00
|
|
|
{- Checks whether the export database has any key recorded for the
 - import location. -}
isKnownImportLocation :: Export.ExportHandle -> ImportLocation -> Annex Bool
isKnownImportLocation dbhandle loc = do
    keys <- liftIO $ Export.getExportTreeKey dbhandle loc
    return $ not (null keys)
|
|
|
|
|
avoid git check-ignore overhead on importing known files
isKnownImportLocation does a database lookup and there's an index
to make that lookup fast, so it's probably faster than talking to git
check-ignore. Checking the matcher is faster still.
While before the gitignore check was added it did not need to always
check isknown, now it does, because it's that or the more expensive
notignored. But at least we can skip notignored when a file is known,
which will often be the common case: Importing from a remote that's been
exported to, and/or imported from before, only new files will not be
known, so only those will need to check notignored.
At first, I had this:
(matches <&&> (isknown <||> notignored)) <||> isknown
Notice that checks isknown every time, whether it matches or not.
So, it's no slower to instead do this:
isknown <||> (matches <&&> notignored)
That has the benefit that, when it's known, it doesn't need to run
matches, which while faster than isknown, is still going to use some CPU.
And it perhaps more clearly expresses the condition: Any known file is
wanted, otherwise it's down to what matches and is not ignored.
This commit was sponsored by Jack Hill on Patreon.
2020-09-30 15:09:09 +00:00
|
|
|
{- Runs the matcher against an import location and its size.
 -
 - Only the file path and file size are provided to the matcher; the
 - key, mime type, mime encoding and link type are all left unset. -}
matchesImportLocation :: FileMatcher Annex -> ImportLocation -> Integer -> Annex Bool
matchesImportLocation matcher loc sz =
    checkMatcher' matcher (MatchingInfo provided) NoLiveUpdate mempty
  where
    provided = ProvidedInfo
        { providedFilePath = Just (fromImportLocation loc)
        , providedKey = Nothing
        , providedFileSize = Just sz
        , providedMimeType = Nothing
        , providedMimeEncoding = Nothing
        , providedLinkType = Nothing
        }
|
avoid git check-ignore overhead on importing known files
isKnownImportLocation does a database lookup and there's an index
to make that lookup fast, so it's probably faster than talking to git
check-ignore. Checking the matcher is faster still.
While before the gitignore check was added it did not need to always
check isknown, now it does, because it's that or the more expensive
notignored. But at least we can skip notignored when a file is known,
which will often be the common case: Importing from a remote that's been
exported to, and/or imported from before, only new files will not be
known, so only those will need to check notignored.
At first, I had this:
(matches <&&> (isknown <||> notignored)) <||> isknown
Notice that checks isknown every time, whether it matches or not.
So, it's no slower to instead do this:
isknown <||> (matches <&&> notignored)
That has the benefit that, when it's known, it doesn't need to run
matches, which while faster than isknown, is still going to use some CPU.
And it perhaps more clearly expresses the condition: Any known file is
wanted, otherwise it's down to what matches and is not ignored.
This commit was sponsored by Jack Hill on Patreon.
2020-09-30 15:09:09 +00:00
|
|
|
|
|
|
|
{- Checks that the file an import location would be imported to is not
 - ignored, according to checkIgnored.
 -
 - When importing into a subtree, the path checked is the location
 - placed under the subtree's directory; otherwise it is the location
 - itself. -}
notIgnoredImportLocation :: ImportTreeConfig -> CheckGitIgnore -> ImportLocation -> Annex Bool
notIgnoredImportLocation importtreeconfig ci loc =
    not <$> checkIgnored ci (treepath importtreeconfig)
  where
    relpath = fromImportLocation loc
    treepath ImportTree = relpath
    treepath (ImportSubTree dir _) = getTopFilePath dir P.</> relpath
|