Make import --deduplicate and --skip-duplicates only hash once, not twice

import: --deduplicate and --skip-duplicates were implemented inneficiently;
they unncessarily hashed each file twice. They have been improved to only
hash once.

The new approach is to lock down (minimally) and hash files, and then
reuse that information when importing them.

This was rather tricky, especially in detecting changes to files while
they are being imported.

The output of import changed slightly. While before it silently skipped
over files with eg --skip-duplicates, now it shows each file as it starts
to act on it. Since every file is hashed first thing, it would otherwise
not be clear what file import is chewing on. (Actually, it wasn't clear
before when any of the duplicates switches were used.)

This commit was sponsored by Alexander Thompson on Patreon.
This commit is contained in:
Joey Hess 2017-02-09 15:32:22 -04:00
parent 30ab4ecc4b
commit f617988a29
No known key found for this signature in database
GPG key ID: C910D9222512E3C7
5 changed files with 90 additions and 41 deletions

View file

@ -1,6 +1,6 @@
{- git-annex content ingestion {- git-annex content ingestion
- -
- Copyright 2010-2016 Joey Hess <id@joeyh.name> - Copyright 2010-2017 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU GPL version 3 or higher. - Licensed under the GNU GPL version 3 or higher.
-} -}
@ -10,6 +10,7 @@ module Annex.Ingest (
LockDownConfig(..), LockDownConfig(..),
lockDown, lockDown,
ingestAdd, ingestAdd,
ingestAdd',
ingest, ingest,
ingest', ingest',
finishIngestDirect, finishIngestDirect,
@ -116,10 +117,13 @@ lockDown' cfg file = ifM (pure (not (hardlinkFileTmp cfg)) <||> crippledFileSyst
{- Ingests a locked down file into the annex. Updates the work tree and {- Ingests a locked down file into the annex. Updates the work tree and
- index. -} - index. -}
ingestAdd :: Maybe LockedDown -> Annex (Maybe Key) ingestAdd :: Maybe LockedDown -> Annex (Maybe Key)
ingestAdd Nothing = return Nothing ingestAdd ld = ingestAdd' ld Nothing
ingestAdd ld@(Just (LockedDown cfg source)) = do
(mk, mic) <- ingest ld ingestAdd' :: Maybe LockedDown -> Maybe Key -> Annex (Maybe Key)
case mk of ingestAdd' Nothing _ = return Nothing
ingestAdd' ld@(Just (LockedDown cfg source)) mk = do
(mk', mic) <- ingest ld mk
case mk' of
Nothing -> return Nothing Nothing -> return Nothing
Just k -> do Just k -> do
let f = keyFilename source let f = keyFilename source
@ -140,14 +144,17 @@ ingestAdd ld@(Just (LockedDown cfg source)) = do
{- Ingests a locked down file into the annex. Does not update the working {- Ingests a locked down file into the annex. Does not update the working
- tree or the index. - tree or the index.
-} -}
ingest :: Maybe LockedDown -> Annex (Maybe Key, Maybe InodeCache) ingest :: Maybe LockedDown -> Maybe Key -> Annex (Maybe Key, Maybe InodeCache)
ingest = ingest' Nothing ingest = ingest' Nothing
ingest' :: Maybe Backend -> Maybe LockedDown -> Annex (Maybe Key, Maybe InodeCache) ingest' :: Maybe Backend -> Maybe LockedDown -> Maybe Key -> Annex (Maybe Key, Maybe InodeCache)
ingest' _ Nothing = return (Nothing, Nothing) ingest' _ Nothing _ = return (Nothing, Nothing)
ingest' preferredbackend (Just (LockedDown cfg source)) = withTSDelta $ \delta -> do ingest' preferredbackend (Just (LockedDown cfg source)) mk = withTSDelta $ \delta -> do
k <- case mk of
Nothing -> do
backend <- maybe (chooseBackend $ keyFilename source) (return . Just) preferredbackend backend <- maybe (chooseBackend $ keyFilename source) (return . Just) preferredbackend
k <- genKey source backend fmap fst <$> genKey source backend
Just k -> return (Just k)
let src = contentLocation source let src = contentLocation source
ms <- liftIO $ catchMaybeIO $ getFileStatus src ms <- liftIO $ catchMaybeIO $ getFileStatus src
mcache <- maybe (pure Nothing) (liftIO . toInodeCache delta src) ms mcache <- maybe (pure Nothing) (liftIO . toInodeCache delta src) ms
@ -156,7 +163,7 @@ ingest' preferredbackend (Just (LockedDown cfg source)) = withTSDelta $ \delta -
(Just newc, Just c) | compareStrong c newc -> go k mcache ms (Just newc, Just c) | compareStrong c newc -> go k mcache ms
_ -> failure "changed while it was being added" _ -> failure "changed while it was being added"
where where
go (Just (key, _)) mcache (Just s) go (Just key) mcache (Just s)
| lockingFile cfg = golocked key mcache s | lockingFile cfg = golocked key mcache s
| otherwise = ifM isDirect | otherwise = ifM isDirect
( godirect key mcache s ( godirect key mcache s

View file

@ -322,7 +322,7 @@ handleAdds havelsof delayadd cs = returnWhen (null incomplete) $ do
doadd = sanitycheck ks $ do doadd = sanitycheck ks $ do
(mkey, mcache) <- liftAnnex $ do (mkey, mcache) <- liftAnnex $ do
showStart "add" $ keyFilename ks showStart "add" $ keyFilename ks
ingest $ Just $ LockedDown lockdownconfig ks ingest (Just $ LockedDown lockdownconfig ks) Nothing
maybe (failedingest change) (done change mcache $ keyFilename ks) mkey maybe (failedingest change) (done change mcache $ keyFilename ks) mkey
add _ _ = return Nothing add _ _ = return Nothing

View file

@ -43,6 +43,9 @@ git-annex (6.20170102) UNRELEASED; urgency=medium
Note that --clean-duplicates and --deduplicate still check Note that --clean-duplicates and --deduplicate still check
numcopies, so won't delete duplicate files unless there's an annexed numcopies, so won't delete duplicate files unless there's an annexed
copy. copy.
* import: --deduplicate and --skip-duplicates were implemented
inneficiently; they unncessarily hashed each file twice. They have
been improved to only hash once.
-- Joey Hess <id@joeyh.name> Fri, 06 Jan 2017 15:22:06 -0400 -- Joey Hess <id@joeyh.name> Fri, 06 Jan 2017 15:22:06 -0400

View file

@ -1,6 +1,6 @@
{- git-annex command {- git-annex command
- -
- Copyright 2012-2013 Joey Hess <id@joeyh.name> - Copyright 2012-2017 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU GPL version 3 or higher. - Licensed under the GNU GPL version 3 or higher.
-} -}
@ -13,11 +13,13 @@ import qualified Annex
import qualified Command.Add import qualified Command.Add
import Utility.CopyFile import Utility.CopyFile
import Backend import Backend
import Remote
import Types.KeySource import Types.KeySource
import Annex.CheckIgnore import Annex.CheckIgnore
import Annex.NumCopies import Annex.NumCopies
import Annex.FileMatcher import Annex.FileMatcher
import Annex.Ingest
import Annex.InodeSentinal
import Utility.InodeCache
import Logs.Location import Logs.Location
cmd :: Command cmd :: Command
@ -71,12 +73,8 @@ start :: GetFileMatcher -> DuplicateMode -> (FilePath, FilePath) -> CommandStart
start largematcher mode (srcfile, destfile) = start largematcher mode (srcfile, destfile) =
ifM (liftIO $ isRegularFile <$> getSymbolicLinkStatus srcfile) ifM (liftIO $ isRegularFile <$> getSymbolicLinkStatus srcfile)
( do ( do
ma <- pickaction
case ma of
Nothing -> stop
Just a -> do
showStart "import" destfile showStart "import" destfile
next a next pickaction
, stop , stop
) )
where where
@ -90,7 +88,7 @@ start largematcher mode (srcfile, destfile) =
warning "Could not verify that the content is still present in the annex; not removing from the import location." warning "Could not verify that the content is still present in the annex; not removing from the import location."
stop stop
) )
importfile = checkdestdir $ do importfile ld k = checkdestdir $ do
ignored <- not <$> Annex.getState Annex.force <&&> checkIgnored destfile ignored <- not <$> Annex.getState Annex.force <&&> checkIgnored destfile
if ignored if ignored
then do then do
@ -99,14 +97,14 @@ start largematcher mode (srcfile, destfile) =
else do else do
existing <- liftIO (catchMaybeIO $ getSymbolicLinkStatus destfile) existing <- liftIO (catchMaybeIO $ getSymbolicLinkStatus destfile)
case existing of case existing of
Nothing -> importfilechecked Nothing -> importfilechecked ld k
Just s Just s
| isDirectory s -> notoverwriting "(is a directory)" | isDirectory s -> notoverwriting "(is a directory)"
| isSymbolicLink s -> notoverwriting "(is a symlink)" | isSymbolicLink s -> notoverwriting "(is a symlink)"
| otherwise -> ifM (Annex.getState Annex.force) | otherwise -> ifM (Annex.getState Annex.force)
( do ( do
liftIO $ nukeFile destfile liftIO $ nukeFile destfile
importfilechecked importfilechecked ld k
, notoverwriting "(use --force to override, or a duplication option such as --deduplicate to clean up)" , notoverwriting "(use --force to override, or a duplication option such as --deduplicate to clean up)"
) )
checkdestdir cont = do checkdestdir cont = do
@ -120,33 +118,74 @@ start largematcher mode (srcfile, destfile) =
warning $ "not importing " ++ destfile ++ " because " ++ destdir ++ " is not a directory" warning $ "not importing " ++ destfile ++ " because " ++ destdir ++ " is not a directory"
stop stop
importfilechecked = do importfilechecked ld k = do
-- Move or copy the src file to the dest file.
-- The dest file is what will be ingested.
liftIO $ createDirectoryIfMissing True (parentDir destfile) liftIO $ createDirectoryIfMissing True (parentDir destfile)
liftIO $ if mode == Duplicate || mode == SkipDuplicates liftIO $ if mode == Duplicate || mode == SkipDuplicates
then void $ copyFileExternal CopyAllMetaData srcfile destfile then void $ copyFileExternal CopyAllMetaData srcfile destfile
else moveFile srcfile destfile else moveFile srcfile destfile
-- Get the inode cache of the dest file. It should be
-- weakly the same as the origianlly locked down file's
-- inode cache. (Since the file may have been copied,
-- its inodes may not be the same.)
newcache <- withTSDelta $ liftIO . genInodeCache destfile
let unchanged = case (newcache, inodeCache (keySource ld)) of
(_, Nothing) -> True
(Just newc, Just c) | compareWeak c newc -> True
_ -> False
unless unchanged $
giveup "changed while it was being added"
-- The LockedDown needs to be adjusted, since the destfile
-- is what will be ingested.
let ld' = ld
{ keySource = KeySource
{ keyFilename = destfile
, contentLocation = destfile
, inodeCache = newcache
}
}
ifM (checkFileMatcher largematcher destfile) ifM (checkFileMatcher largematcher destfile)
( Command.Add.perform destfile ( ingestAdd' (Just ld') (Just k)
>>= maybe
stop
(\addedk -> next $ Command.Add.cleanup addedk True)
, next $ Command.Add.addSmall destfile , next $ Command.Add.addSmall destfile
) )
notoverwriting why = do notoverwriting why = do
warning $ "not overwriting existing " ++ destfile ++ " " ++ why warning $ "not overwriting existing " ++ destfile ++ " " ++ why
stop stop
checkdup dupa notdupa = do lockdown a = do
backend <- chooseBackend destfile lockingfile <- not <$> addUnlocked
let ks = KeySource srcfile srcfile Nothing -- Minimal lock down with no hard linking so nothing
v <- genKey ks backend -- has to be done to clean up from it.
let cfg = LockDownConfig
{ lockingFile = lockingfile
, hardlinkFileTmp = False
}
v <- lockDown cfg srcfile
case v of case v of
Just (k, _) -> ifM (isKnownKey k) Just ld -> do
( return (maybe Nothing (\a -> Just (a k)) dupa) backend <- chooseBackend destfile
, return notdupa v' <- genKey (keySource ld) backend
case v' of
Just (k, _) -> a (ld, k)
Nothing -> giveup "failed to generate a key"
Nothing -> stop
checkdup k dupa notdupa = ifM (isKnownKey k)
( dupa
, notdupa
) )
_ -> return notdupa pickaction = lockdown $ \(ld, k) -> case mode of
pickaction = case mode of DeDuplicate -> checkdup k (deletedup k) (importfile ld k)
DeDuplicate -> checkdup (Just deletedup) (Just importfile) CleanDuplicates -> checkdup k
CleanDuplicates -> checkdup (Just deletedup) Nothing (deletedup k)
SkipDuplicates -> checkdup Nothing (Just importfile) (skipbecause "not duplicate")
_ -> return (Just importfile) SkipDuplicates -> checkdup k
(skipbecause "duplicate")
(importfile ld k)
_ -> importfile ld k
skipbecause s = showNote (s ++ "; skipping") >> next (return True)
verifyExisting :: Key -> FilePath -> (CommandPerform, CommandPerform) -> CommandPerform verifyExisting :: Key -> FilePath -> (CommandPerform, CommandPerform) -> CommandPerform
verifyExisting key destfile (yes, no) = do verifyExisting key destfile (yes, no) = do

View file

@ -88,7 +88,7 @@ clean file = do
<$> catKeyFile file <$> catKeyFile file
liftIO . emitPointer liftIO . emitPointer
=<< go =<< go
=<< ingest' currbackend =<< (\ld -> ingest' currbackend ld Nothing)
=<< lockDown cfg file =<< lockDown cfg file
, liftIO $ B.hPut stdout b , liftIO $ B.hPut stdout b
) )