Make import --deduplicate and --skip-duplicates only hash once, not twice
import: --deduplicate and --skip-duplicates were implemented inneficiently; they unncessarily hashed each file twice. They have been improved to only hash once. The new approach is to lock down (minimally) and hash files, and then reuse that information when importing them. This was rather tricky, especially in detecting changes to files while they are being imported. The output of import changed slightly. While before it silently skipped over files with eg --skip-duplicates, now it shows each file as it starts to act on it. Since every file is hashed first thing, it would otherwise not be clear what file import is chewing on. (Actually, it wasn't clear before when any of the duplicates switches were used.) This commit was sponsored by Alexander Thompson on Patreon.
This commit is contained in:
parent
30ab4ecc4b
commit
f617988a29
5 changed files with 90 additions and 41 deletions
|
@ -1,6 +1,6 @@
|
|||
{- git-annex content ingestion
|
||||
-
|
||||
- Copyright 2010-2016 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2010-2017 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU GPL version 3 or higher.
|
||||
-}
|
||||
|
@ -10,6 +10,7 @@ module Annex.Ingest (
|
|||
LockDownConfig(..),
|
||||
lockDown,
|
||||
ingestAdd,
|
||||
ingestAdd',
|
||||
ingest,
|
||||
ingest',
|
||||
finishIngestDirect,
|
||||
|
@ -116,10 +117,13 @@ lockDown' cfg file = ifM (pure (not (hardlinkFileTmp cfg)) <||> crippledFileSyst
|
|||
{- Ingests a locked down file into the annex. Updates the work tree and
|
||||
- index. -}
|
||||
ingestAdd :: Maybe LockedDown -> Annex (Maybe Key)
|
||||
ingestAdd Nothing = return Nothing
|
||||
ingestAdd ld@(Just (LockedDown cfg source)) = do
|
||||
(mk, mic) <- ingest ld
|
||||
case mk of
|
||||
ingestAdd ld = ingestAdd' ld Nothing
|
||||
|
||||
ingestAdd' :: Maybe LockedDown -> Maybe Key -> Annex (Maybe Key)
|
||||
ingestAdd' Nothing _ = return Nothing
|
||||
ingestAdd' ld@(Just (LockedDown cfg source)) mk = do
|
||||
(mk', mic) <- ingest ld mk
|
||||
case mk' of
|
||||
Nothing -> return Nothing
|
||||
Just k -> do
|
||||
let f = keyFilename source
|
||||
|
@ -140,14 +144,17 @@ ingestAdd ld@(Just (LockedDown cfg source)) = do
|
|||
{- Ingests a locked down file into the annex. Does not update the working
|
||||
- tree or the index.
|
||||
-}
|
||||
ingest :: Maybe LockedDown -> Annex (Maybe Key, Maybe InodeCache)
|
||||
ingest :: Maybe LockedDown -> Maybe Key -> Annex (Maybe Key, Maybe InodeCache)
|
||||
ingest = ingest' Nothing
|
||||
|
||||
ingest' :: Maybe Backend -> Maybe LockedDown -> Annex (Maybe Key, Maybe InodeCache)
|
||||
ingest' _ Nothing = return (Nothing, Nothing)
|
||||
ingest' preferredbackend (Just (LockedDown cfg source)) = withTSDelta $ \delta -> do
|
||||
backend <- maybe (chooseBackend $ keyFilename source) (return . Just) preferredbackend
|
||||
k <- genKey source backend
|
||||
ingest' :: Maybe Backend -> Maybe LockedDown -> Maybe Key -> Annex (Maybe Key, Maybe InodeCache)
|
||||
ingest' _ Nothing _ = return (Nothing, Nothing)
|
||||
ingest' preferredbackend (Just (LockedDown cfg source)) mk = withTSDelta $ \delta -> do
|
||||
k <- case mk of
|
||||
Nothing -> do
|
||||
backend <- maybe (chooseBackend $ keyFilename source) (return . Just) preferredbackend
|
||||
fmap fst <$> genKey source backend
|
||||
Just k -> return (Just k)
|
||||
let src = contentLocation source
|
||||
ms <- liftIO $ catchMaybeIO $ getFileStatus src
|
||||
mcache <- maybe (pure Nothing) (liftIO . toInodeCache delta src) ms
|
||||
|
@ -156,7 +163,7 @@ ingest' preferredbackend (Just (LockedDown cfg source)) = withTSDelta $ \delta -
|
|||
(Just newc, Just c) | compareStrong c newc -> go k mcache ms
|
||||
_ -> failure "changed while it was being added"
|
||||
where
|
||||
go (Just (key, _)) mcache (Just s)
|
||||
go (Just key) mcache (Just s)
|
||||
| lockingFile cfg = golocked key mcache s
|
||||
| otherwise = ifM isDirect
|
||||
( godirect key mcache s
|
||||
|
|
|
@ -322,7 +322,7 @@ handleAdds havelsof delayadd cs = returnWhen (null incomplete) $ do
|
|||
doadd = sanitycheck ks $ do
|
||||
(mkey, mcache) <- liftAnnex $ do
|
||||
showStart "add" $ keyFilename ks
|
||||
ingest $ Just $ LockedDown lockdownconfig ks
|
||||
ingest (Just $ LockedDown lockdownconfig ks) Nothing
|
||||
maybe (failedingest change) (done change mcache $ keyFilename ks) mkey
|
||||
add _ _ = return Nothing
|
||||
|
||||
|
|
|
@ -43,6 +43,9 @@ git-annex (6.20170102) UNRELEASED; urgency=medium
|
|||
Note that --clean-duplicates and --deduplicate still check
|
||||
numcopies, so won't delete duplicate files unless there's an annexed
|
||||
copy.
|
||||
* import: --deduplicate and --skip-duplicates were implemented
|
||||
inneficiently; they unncessarily hashed each file twice. They have
|
||||
been improved to only hash once.
|
||||
|
||||
-- Joey Hess <id@joeyh.name> Fri, 06 Jan 2017 15:22:06 -0400
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{- git-annex command
|
||||
-
|
||||
- Copyright 2012-2013 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2012-2017 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU GPL version 3 or higher.
|
||||
-}
|
||||
|
@ -13,11 +13,13 @@ import qualified Annex
|
|||
import qualified Command.Add
|
||||
import Utility.CopyFile
|
||||
import Backend
|
||||
import Remote
|
||||
import Types.KeySource
|
||||
import Annex.CheckIgnore
|
||||
import Annex.NumCopies
|
||||
import Annex.FileMatcher
|
||||
import Annex.Ingest
|
||||
import Annex.InodeSentinal
|
||||
import Utility.InodeCache
|
||||
import Logs.Location
|
||||
|
||||
cmd :: Command
|
||||
|
@ -71,12 +73,8 @@ start :: GetFileMatcher -> DuplicateMode -> (FilePath, FilePath) -> CommandStart
|
|||
start largematcher mode (srcfile, destfile) =
|
||||
ifM (liftIO $ isRegularFile <$> getSymbolicLinkStatus srcfile)
|
||||
( do
|
||||
ma <- pickaction
|
||||
case ma of
|
||||
Nothing -> stop
|
||||
Just a -> do
|
||||
showStart "import" destfile
|
||||
next a
|
||||
showStart "import" destfile
|
||||
next pickaction
|
||||
, stop
|
||||
)
|
||||
where
|
||||
|
@ -90,7 +88,7 @@ start largematcher mode (srcfile, destfile) =
|
|||
warning "Could not verify that the content is still present in the annex; not removing from the import location."
|
||||
stop
|
||||
)
|
||||
importfile = checkdestdir $ do
|
||||
importfile ld k = checkdestdir $ do
|
||||
ignored <- not <$> Annex.getState Annex.force <&&> checkIgnored destfile
|
||||
if ignored
|
||||
then do
|
||||
|
@ -99,14 +97,14 @@ start largematcher mode (srcfile, destfile) =
|
|||
else do
|
||||
existing <- liftIO (catchMaybeIO $ getSymbolicLinkStatus destfile)
|
||||
case existing of
|
||||
Nothing -> importfilechecked
|
||||
Nothing -> importfilechecked ld k
|
||||
Just s
|
||||
| isDirectory s -> notoverwriting "(is a directory)"
|
||||
| isSymbolicLink s -> notoverwriting "(is a symlink)"
|
||||
| otherwise -> ifM (Annex.getState Annex.force)
|
||||
( do
|
||||
liftIO $ nukeFile destfile
|
||||
importfilechecked
|
||||
importfilechecked ld k
|
||||
, notoverwriting "(use --force to override, or a duplication option such as --deduplicate to clean up)"
|
||||
)
|
||||
checkdestdir cont = do
|
||||
|
@ -120,33 +118,74 @@ start largematcher mode (srcfile, destfile) =
|
|||
warning $ "not importing " ++ destfile ++ " because " ++ destdir ++ " is not a directory"
|
||||
stop
|
||||
|
||||
importfilechecked = do
|
||||
importfilechecked ld k = do
|
||||
-- Move or copy the src file to the dest file.
|
||||
-- The dest file is what will be ingested.
|
||||
liftIO $ createDirectoryIfMissing True (parentDir destfile)
|
||||
liftIO $ if mode == Duplicate || mode == SkipDuplicates
|
||||
then void $ copyFileExternal CopyAllMetaData srcfile destfile
|
||||
else moveFile srcfile destfile
|
||||
-- Get the inode cache of the dest file. It should be
|
||||
-- weakly the same as the origianlly locked down file's
|
||||
-- inode cache. (Since the file may have been copied,
|
||||
-- its inodes may not be the same.)
|
||||
newcache <- withTSDelta $ liftIO . genInodeCache destfile
|
||||
let unchanged = case (newcache, inodeCache (keySource ld)) of
|
||||
(_, Nothing) -> True
|
||||
(Just newc, Just c) | compareWeak c newc -> True
|
||||
_ -> False
|
||||
unless unchanged $
|
||||
giveup "changed while it was being added"
|
||||
-- The LockedDown needs to be adjusted, since the destfile
|
||||
-- is what will be ingested.
|
||||
let ld' = ld
|
||||
{ keySource = KeySource
|
||||
{ keyFilename = destfile
|
||||
, contentLocation = destfile
|
||||
, inodeCache = newcache
|
||||
}
|
||||
}
|
||||
ifM (checkFileMatcher largematcher destfile)
|
||||
( Command.Add.perform destfile
|
||||
( ingestAdd' (Just ld') (Just k)
|
||||
>>= maybe
|
||||
stop
|
||||
(\addedk -> next $ Command.Add.cleanup addedk True)
|
||||
, next $ Command.Add.addSmall destfile
|
||||
)
|
||||
notoverwriting why = do
|
||||
warning $ "not overwriting existing " ++ destfile ++ " " ++ why
|
||||
stop
|
||||
checkdup dupa notdupa = do
|
||||
backend <- chooseBackend destfile
|
||||
let ks = KeySource srcfile srcfile Nothing
|
||||
v <- genKey ks backend
|
||||
lockdown a = do
|
||||
lockingfile <- not <$> addUnlocked
|
||||
-- Minimal lock down with no hard linking so nothing
|
||||
-- has to be done to clean up from it.
|
||||
let cfg = LockDownConfig
|
||||
{ lockingFile = lockingfile
|
||||
, hardlinkFileTmp = False
|
||||
}
|
||||
v <- lockDown cfg srcfile
|
||||
case v of
|
||||
Just (k, _) -> ifM (isKnownKey k)
|
||||
( return (maybe Nothing (\a -> Just (a k)) dupa)
|
||||
, return notdupa
|
||||
)
|
||||
_ -> return notdupa
|
||||
pickaction = case mode of
|
||||
DeDuplicate -> checkdup (Just deletedup) (Just importfile)
|
||||
CleanDuplicates -> checkdup (Just deletedup) Nothing
|
||||
SkipDuplicates -> checkdup Nothing (Just importfile)
|
||||
_ -> return (Just importfile)
|
||||
Just ld -> do
|
||||
backend <- chooseBackend destfile
|
||||
v' <- genKey (keySource ld) backend
|
||||
case v' of
|
||||
Just (k, _) -> a (ld, k)
|
||||
Nothing -> giveup "failed to generate a key"
|
||||
Nothing -> stop
|
||||
checkdup k dupa notdupa = ifM (isKnownKey k)
|
||||
( dupa
|
||||
, notdupa
|
||||
)
|
||||
pickaction = lockdown $ \(ld, k) -> case mode of
|
||||
DeDuplicate -> checkdup k (deletedup k) (importfile ld k)
|
||||
CleanDuplicates -> checkdup k
|
||||
(deletedup k)
|
||||
(skipbecause "not duplicate")
|
||||
SkipDuplicates -> checkdup k
|
||||
(skipbecause "duplicate")
|
||||
(importfile ld k)
|
||||
_ -> importfile ld k
|
||||
skipbecause s = showNote (s ++ "; skipping") >> next (return True)
|
||||
|
||||
verifyExisting :: Key -> FilePath -> (CommandPerform, CommandPerform) -> CommandPerform
|
||||
verifyExisting key destfile (yes, no) = do
|
||||
|
|
|
@ -88,7 +88,7 @@ clean file = do
|
|||
<$> catKeyFile file
|
||||
liftIO . emitPointer
|
||||
=<< go
|
||||
=<< ingest' currbackend
|
||||
=<< (\ld -> ingest' currbackend ld Nothing)
|
||||
=<< lockDown cfg file
|
||||
, liftIO $ B.hPut stdout b
|
||||
)
|
||||
|
|
Loading…
Reference in a new issue