git-annex/Command/Import.hs

321 lines
11 KiB
Haskell
Raw Normal View History

{- git-annex command
-
2019-02-26 16:06:19 +00:00
- Copyright 2012-2019 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
2019-02-26 16:06:19 +00:00
{-# LANGUAGE ApplicativeDo #-}
module Command.Import where
import Command
import qualified Git
import qualified Annex
import qualified Command.Add
import qualified Command.Reinject
2019-02-26 16:06:19 +00:00
import qualified Types.Remote as Remote
import qualified Git.Ref
import Utility.CopyFile
import Backend
import Types.KeySource
import Annex.CheckIgnore
import Annex.NumCopies
import Annex.FileMatcher
import Annex.Ingest
import Annex.InodeSentinal
2019-02-26 16:06:19 +00:00
import Annex.Import
import Annex.RemoteTrackingBranch
import Utility.InodeCache
import Logs.Location
2019-02-26 16:06:19 +00:00
import Git.FilePath
import Git.Types
import Types.Import
import Utility.Metered
import Control.Concurrent.STM
cmd :: Command
cmd = notBareRepo $
withGlobalOptions [jobsOption, jsonOptions, fileMatchingOptions] $
command "import" SectionCommon
"add a tree of files to the repository"
(paramPaths ++ "|BRANCH[:SUBDIR]")
(seek <$$> optParser)
2019-02-26 16:06:19 +00:00
data ImportOptions
= LocalImportOptions
{ importFiles :: CmdParams
, duplicateMode :: DuplicateMode
}
| RemoteImportOptions
{ importFromRemote :: DeferredParse Remote
, importToBranch :: Branch
, importToSubDir :: Maybe FilePath
}
2015-07-13 15:15:21 +00:00
optParser :: CmdParamsDesc -> Parser ImportOptions
optParser desc = do
ps <- cmdParams desc
mfromremote <- optional $ parseRemoteOption <$> parseFromOption
dupmode <- fromMaybe Default <$> optional duplicateModeParser
return $ case mfromremote of
Nothing -> LocalImportOptions ps dupmode
Just r -> case ps of
[bs] ->
let (branch, subdir) = separate (== ':') bs
in RemoteImportOptions r
(Ref branch)
(if null subdir then Nothing else Just subdir)
_ -> giveup "expected BRANCH[:SUBDIR]"
2019-02-26 16:06:19 +00:00
data DuplicateMode = Default | Duplicate | DeDuplicate | CleanDuplicates | SkipDuplicates | ReinjectDuplicates
deriving (Eq)
2015-07-13 15:15:21 +00:00
duplicateModeParser :: Parser DuplicateMode
duplicateModeParser =
flag' Duplicate
( long "duplicate"
<> help "do not delete source files"
)
<|> flag' DeDuplicate
( long "deduplicate"
<> help "delete source files whose content was imported before"
)
<|> flag' CleanDuplicates
( long "clean-duplicates"
<> help "delete duplicate source files (import nothing)"
)
<|> flag' SkipDuplicates
( long "skip-duplicates"
<> help "import only new files (do not delete source files)"
)
<|> flag' ReinjectDuplicates
( long "reinject-duplicates"
<> help "import new files, and reinject the content of files that were imported before"
2015-07-13 15:15:21 +00:00
)
2015-07-13 15:15:21 +00:00
seek :: ImportOptions -> CommandSeek
seek o@(LocalImportOptions {}) = startConcurrency commandStages $ do
repopath <- liftIO . absPath =<< fromRepo Git.repoPath
2015-07-13 15:15:21 +00:00
inrepops <- liftIO $ filter (dirContains repopath) <$> mapM absPath (importFiles o)
unless (null inrepops) $ do
giveup $ "cannot import files from inside the working tree (use git annex add instead): " ++ unwords inrepops
largematcher <- largeFilesMatcher
2019-02-26 16:06:19 +00:00
(commandAction . startLocal largematcher (duplicateMode o))
`withPathContents` importFiles o
seek o@(RemoteImportOptions {}) = startConcurrency commandStages $ do
2019-02-26 16:06:19 +00:00
r <- getParsed (importFromRemote o)
unlessM (Remote.isImportSupported r) $
giveup "That remote does not support imports."
2019-02-26 16:06:19 +00:00
subdir <- maybe
(pure Nothing)
(Just <$$> inRepo . toTopFilePath)
(importToSubDir o)
2019-02-26 18:22:08 +00:00
seekRemote r (importToBranch o) subdir
2019-02-26 16:06:19 +00:00
startLocal :: GetFileMatcher -> DuplicateMode -> (FilePath, FilePath) -> CommandStart
startLocal largematcher mode (srcfile, destfile) =
ifM (liftIO $ isRegularFile <$> getSymbolicLinkStatus srcfile)
( starting "import" (ActionItemWorkTreeFile (toRawFilePath destfile))
make CommandStart return a StartMessage The goal is to be able to run CommandStart in the main thread when -J is used, rather than unncessarily passing it off to a worker thread, which incurs overhead that is signficant when the CommandStart is going to quickly decide to stop. To do that, the message it displays needs to be displayed in the worker thread, after the CommandStart has run. Also, the change will mean that CommandStart will no longer necessarily run with the same Annex state as CommandPerform. While its docs already said it should avoid modifying Annex state, I audited all the CommandStart code as part of the conversion. (Note that CommandSeek already sometimes runs with a different Annex state, and that has not been a source of any problems, so I am not too worried that this change will lead to breakage going forward.) The only modification of Annex state I found was it calling allowMessages in some Commands that default to noMessages. Dealt with that by adding a startCustomOutput and a startingUsualMessages. This lets a command start with noMessages and then select the output it wants for each CommandStart. One bit of breakage: onlyActionOn has been removed from commands that used it. The plan is that, since a StartMessage contains an ActionItem, when a Key can be extracted from that, the parallel job runner can run onlyActionOn' automatically. Then commands won't need to worry about this detail. Future work. Otherwise, this was a fairly straightforward process of making each CommandStart compile again. Hopefully other behavior changes were mostly avoided. In a few cases, a command had a CommandStart that called a CommandPerform that then called showStart multiple times. I have collapsed those down to a single start action. The main command to perhaps suffer from it is Command.Direct, which used to show a start for each file, and no longer does. Another minor behavior change is that some commands used showStart before, but had an associated file and a Key available, so were changed to ShowStart with an ActionItemAssociatedFile. That will not change the normal output or behavior, but --json output will now include the key. This should not break it for anyone using a real json parser.
2019-06-06 19:42:30 +00:00
pickaction
, stop
)
where
deletedup k = do
showNote $ "duplicate of " ++ serializeKey k
2015-10-09 15:09:46 +00:00
verifyExisting k destfile
( do
liftIO $ removeFile srcfile
next $ return True
, do
2015-04-30 18:10:28 +00:00
warning "Could not verify that the content is still present in the annex; not removing from the import location."
stop
)
reinject k = do
showNote "reinjecting"
Command.Reinject.perform srcfile k
importfile ld k = checkdestdir $ do
ignored <- not <$> Annex.getState Annex.force <&&> checkIgnored destfile
if ignored
then do
warning $ "not importing " ++ destfile ++ " which is .gitignored (use --force to override)"
stop
else do
existing <- liftIO (catchMaybeIO $ getSymbolicLinkStatus destfile)
case existing of
Nothing -> importfilechecked ld k
Just s
| isDirectory s -> notoverwriting "(is a directory)"
| isSymbolicLink s -> ifM (Annex.getState Annex.force)
( do
liftIO $ nukeFile destfile
importfilechecked ld k
, notoverwriting "(is a symlink)"
)
| otherwise -> ifM (Annex.getState Annex.force)
( do
liftIO $ nukeFile destfile
importfilechecked ld k
, notoverwriting "(use --force to override, or a duplication option such as --deduplicate to clean up)"
)
checkdestdir cont = do
let destdir = parentDir destfile
existing <- liftIO (catchMaybeIO $ getSymbolicLinkStatus destdir)
case existing of
Nothing -> cont
Just s
| isDirectory s -> cont
| otherwise -> do
warning $ "not importing " ++ destfile ++ " because " ++ destdir ++ " is not a directory"
stop
importfilechecked ld k = do
-- Move or copy the src file to the dest file.
-- The dest file is what will be ingested.
liftIO $ createDirectoryIfMissing True (parentDir destfile)
liftIO $ if mode == Duplicate || mode == SkipDuplicates
then void $ copyFileExternal CopyAllMetaData srcfile destfile
else moveFile srcfile destfile
-- Get the inode cache of the dest file. It should be
-- weakly the same as the origianlly locked down file's
-- inode cache. (Since the file may have been copied,
-- its inodes may not be the same.)
newcache <- withTSDelta $ liftIO . genInodeCache destfile
let unchanged = case (newcache, inodeCache (keySource ld)) of
(_, Nothing) -> True
(Just newc, Just c) | compareWeak c newc -> True
_ -> False
unless unchanged $
giveup "changed while it was being added"
-- The LockedDown needs to be adjusted, since the destfile
-- is what will be ingested.
let ld' = ld
{ keySource = KeySource
{ keyFilename = destfile
, contentLocation = destfile
, inodeCache = newcache
}
}
ifM (checkFileMatcher largematcher destfile)
( ingestAdd' nullMeterUpdate (Just ld') (Just k)
>>= maybe
stop
(\addedk -> next $ Command.Add.cleanup addedk True)
, next $ Command.Add.addSmall $ toRawFilePath destfile
)
notoverwriting why = do
warning $ "not overwriting existing " ++ destfile ++ " " ++ why
stop
lockdown a = do
lockingfile <- not <$> addUnlocked
-- Minimal lock down with no hard linking so nothing
-- has to be done to clean up from it.
let cfg = LockDownConfig
{ lockingFile = lockingfile
, hardlinkFileTmpDir = Nothing
}
v <- lockDown cfg srcfile
case v of
Just ld -> do
backend <- chooseBackend destfile
v' <- genKey (keySource ld) nullMeterUpdate backend
case v' of
Just (k, _) -> a (ld, k)
Nothing -> giveup "failed to generate a key"
Nothing -> stop
checkdup k dupa notdupa = ifM (isKnownKey k)
( dupa
, notdupa
)
pickaction = lockdown $ \(ld, k) -> case mode of
DeDuplicate -> checkdup k (deletedup k) (importfile ld k)
CleanDuplicates -> checkdup k
(deletedup k)
(skipbecause "not duplicate")
SkipDuplicates -> checkdup k
(skipbecause "duplicate")
(importfile ld k)
ReinjectDuplicates -> checkdup k
(reinject k)
(importfile ld k)
_ -> importfile ld k
skipbecause s = showNote (s ++ "; skipping") >> next (return True)
2015-10-09 15:09:46 +00:00
verifyExisting :: Key -> FilePath -> (CommandPerform, CommandPerform) -> CommandPerform
verifyExisting key destfile (yes, no) = do
-- Look up the numcopies setting for the file that it would be
-- imported to, if it were imported.
need <- getFileNumCopies destfile
(tocheck, preverified) <- verifiableCopies key []
verifyEnoughCopiesToDrop [] key Nothing need [] preverified tocheck
2015-10-09 15:09:46 +00:00
(const yes) no
2019-02-26 16:06:19 +00:00
2019-02-26 18:22:08 +00:00
seekRemote :: Remote -> Branch -> Maybe TopFilePath -> CommandSeek
2019-03-09 17:34:57 +00:00
seekRemote remote branch msubdir = do
importtreeconfig <- case msubdir of
Nothing -> return ImportTree
Just subdir ->
let mk tree = pure $ ImportSubTree subdir tree
in fromtrackingbranch Git.Ref.tree >>= \case
Just tree -> mk tree
Nothing -> inRepo (Git.Ref.tree branch) >>= \case
Just tree -> mk tree
Nothing -> giveup $ "Unable to find base tree for branch " ++ fromRef branch
trackingcommit <- fromtrackingbranch Git.Ref.sha
cmode <- annexCommitMode <$> Annex.getGitConfig
let importcommitconfig = ImportCommitConfig trackingcommit cmode importmessage
let commitimport = commitRemote remote branch tb trackingcommit importtreeconfig importcommitconfig
2019-02-26 18:22:08 +00:00
importabletvar <- liftIO $ newTVarIO Nothing
void $ includeCommandAction (listContents remote importabletvar)
liftIO (atomically (readTVar importabletvar)) >>= \case
Nothing -> return ()
Just importable -> downloadImport remote importtreeconfig importable >>= \case
Nothing -> warning $ concat
[ "Failed to import some files from "
, Remote.name remote
, ". Re-run command to resume import."
]
Just imported -> void $
includeCommandAction $
commitimport imported
where
importmessage = "import from " ++ Remote.name remote
tb = mkRemoteTrackingBranch remote branch
fromtrackingbranch a = inRepo $ a (fromRemoteTrackingBranch tb)
2019-02-26 18:22:08 +00:00
listContents :: Remote -> TVar (Maybe (ImportableContents (ContentIdentifier, Remote.ByteSize))) -> CommandStart
make CommandStart return a StartMessage The goal is to be able to run CommandStart in the main thread when -J is used, rather than unncessarily passing it off to a worker thread, which incurs overhead that is signficant when the CommandStart is going to quickly decide to stop. To do that, the message it displays needs to be displayed in the worker thread, after the CommandStart has run. Also, the change will mean that CommandStart will no longer necessarily run with the same Annex state as CommandPerform. While its docs already said it should avoid modifying Annex state, I audited all the CommandStart code as part of the conversion. (Note that CommandSeek already sometimes runs with a different Annex state, and that has not been a source of any problems, so I am not too worried that this change will lead to breakage going forward.) The only modification of Annex state I found was it calling allowMessages in some Commands that default to noMessages. Dealt with that by adding a startCustomOutput and a startingUsualMessages. This lets a command start with noMessages and then select the output it wants for each CommandStart. One bit of breakage: onlyActionOn has been removed from commands that used it. The plan is that, since a StartMessage contains an ActionItem, when a Key can be extracted from that, the parallel job runner can run onlyActionOn' automatically. Then commands won't need to worry about this detail. Future work. Otherwise, this was a fairly straightforward process of making each CommandStart compile again. Hopefully other behavior changes were mostly avoided. In a few cases, a command had a CommandStart that called a CommandPerform that then called showStart multiple times. I have collapsed those down to a single start action. The main command to perhaps suffer from it is Command.Direct, which used to show a start for each file, and no longer does. Another minor behavior change is that some commands used showStart before, but had an associated file and a Key available, so were changed to ShowStart with an ActionItemAssociatedFile. That will not change the normal output or behavior, but --json output will now include the key. This should not break it for anyone using a real json parser.
2019-06-06 19:42:30 +00:00
listContents remote tvar = starting "list" (ActionItemOther (Just (Remote.name remote))) $
listImportableContents remote >>= \case
Nothing -> giveup $ "Unable to list contents of " ++ Remote.name remote
Just importable -> do
importable' <- makeImportMatcher remote >>= \case
Right matcher -> filterImportableContents remote matcher importable
Left err -> giveup $ "Cannot import from " ++ Remote.name remote ++ " because of a problem with its configuration: " ++ err
next $ do
liftIO $ atomically $ writeTVar tvar (Just importable')
return True
2019-02-26 18:22:08 +00:00
commitRemote :: Remote -> Branch -> RemoteTrackingBranch -> Maybe Sha -> ImportTreeConfig -> ImportCommitConfig -> ImportableContents Key -> CommandStart
make CommandStart return a StartMessage The goal is to be able to run CommandStart in the main thread when -J is used, rather than unncessarily passing it off to a worker thread, which incurs overhead that is signficant when the CommandStart is going to quickly decide to stop. To do that, the message it displays needs to be displayed in the worker thread, after the CommandStart has run. Also, the change will mean that CommandStart will no longer necessarily run with the same Annex state as CommandPerform. While its docs already said it should avoid modifying Annex state, I audited all the CommandStart code as part of the conversion. (Note that CommandSeek already sometimes runs with a different Annex state, and that has not been a source of any problems, so I am not too worried that this change will lead to breakage going forward.) The only modification of Annex state I found was it calling allowMessages in some Commands that default to noMessages. Dealt with that by adding a startCustomOutput and a startingUsualMessages. This lets a command start with noMessages and then select the output it wants for each CommandStart. One bit of breakage: onlyActionOn has been removed from commands that used it. The plan is that, since a StartMessage contains an ActionItem, when a Key can be extracted from that, the parallel job runner can run onlyActionOn' automatically. Then commands won't need to worry about this detail. Future work. Otherwise, this was a fairly straightforward process of making each CommandStart compile again. Hopefully other behavior changes were mostly avoided. In a few cases, a command had a CommandStart that called a CommandPerform that then called showStart multiple times. I have collapsed those down to a single start action. The main command to perhaps suffer from it is Command.Direct, which used to show a start for each file, and no longer does. Another minor behavior change is that some commands used showStart before, but had an associated file and a Key available, so were changed to ShowStart with an ActionItemAssociatedFile. That will not change the normal output or behavior, but --json output will now include the key. This should not break it for anyone using a real json parser.
2019-06-06 19:42:30 +00:00
commitRemote remote branch tb trackingcommit importtreeconfig importcommitconfig importable =
starting "update" (ActionItemOther (Just $ fromRef $ fromRemoteTrackingBranch tb)) $ do
2019-02-26 18:22:08 +00:00
importcommit <- buildImportCommit remote importtreeconfig importcommitconfig importable
next $ updateremotetrackingbranch importcommit
where
-- Update the tracking branch. Done even when there
-- is nothing new to import, to make sure it exists.
updateremotetrackingbranch importcommit =
case importcommit <|> trackingcommit of
2019-02-26 18:22:08 +00:00
Just c -> do
setRemoteTrackingBranch tb c
2019-02-26 18:22:08 +00:00
return True
Nothing -> do
warning $ "Nothing to import and " ++ fromRef branch ++ " does not exist."
return False