Make import --deduplicate and --skip-duplicates only hash once, not twice

import: --deduplicate and --skip-duplicates were implemented inefficiently;
they unnecessarily hashed each file twice. They have been improved to only
hash once.

The new approach is to lock down (minimally) and hash files, and then
reuse that information when importing them.

This was rather tricky, especially in detecting changes to files while
they are being imported.

The output of import changed slightly. While before it silently skipped
over files with e.g. --skip-duplicates, now it shows each file as it starts
to act on it. Since every file is hashed first thing, it would otherwise
not be clear what file import is chewing on. (Actually, it wasn't clear
before when any of the duplicates switches were used.)

This commit was sponsored by Alexander Thompson on Patreon.
This commit is contained in:
Joey Hess 2017-02-09 15:32:22 -04:00
parent 30ab4ecc4b
commit f617988a29
No known key found for this signature in database
GPG key ID: C910D9222512E3C7
5 changed files with 90 additions and 41 deletions

View file

@ -1,6 +1,6 @@
{- git-annex content ingestion
-
- Copyright 2010-2016 Joey Hess <id@joeyh.name>
- Copyright 2010-2017 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU GPL version 3 or higher.
-}
@ -10,6 +10,7 @@ module Annex.Ingest (
LockDownConfig(..),
lockDown,
ingestAdd,
ingestAdd',
ingest,
ingest',
finishIngestDirect,
@ -116,10 +117,13 @@ lockDown' cfg file = ifM (pure (not (hardlinkFileTmp cfg)) <||> crippledFileSyst
{- Ingests a locked down file into the annex. Updates the work tree and
- index. -}
ingestAdd :: Maybe LockedDown -> Annex (Maybe Key)
ingestAdd Nothing = return Nothing
ingestAdd ld@(Just (LockedDown cfg source)) = do
(mk, mic) <- ingest ld
case mk of
ingestAdd ld = ingestAdd' ld Nothing
ingestAdd' :: Maybe LockedDown -> Maybe Key -> Annex (Maybe Key)
ingestAdd' Nothing _ = return Nothing
ingestAdd' ld@(Just (LockedDown cfg source)) mk = do
(mk', mic) <- ingest ld mk
case mk' of
Nothing -> return Nothing
Just k -> do
let f = keyFilename source
@ -140,14 +144,17 @@ ingestAdd ld@(Just (LockedDown cfg source)) = do
{- Ingests a locked down file into the annex. Does not update the working
- tree or the index.
-}
ingest :: Maybe LockedDown -> Annex (Maybe Key, Maybe InodeCache)
ingest :: Maybe LockedDown -> Maybe Key -> Annex (Maybe Key, Maybe InodeCache)
ingest = ingest' Nothing
ingest' :: Maybe Backend -> Maybe LockedDown -> Annex (Maybe Key, Maybe InodeCache)
ingest' _ Nothing = return (Nothing, Nothing)
ingest' preferredbackend (Just (LockedDown cfg source)) = withTSDelta $ \delta -> do
backend <- maybe (chooseBackend $ keyFilename source) (return . Just) preferredbackend
k <- genKey source backend
ingest' :: Maybe Backend -> Maybe LockedDown -> Maybe Key -> Annex (Maybe Key, Maybe InodeCache)
ingest' _ Nothing _ = return (Nothing, Nothing)
ingest' preferredbackend (Just (LockedDown cfg source)) mk = withTSDelta $ \delta -> do
k <- case mk of
Nothing -> do
backend <- maybe (chooseBackend $ keyFilename source) (return . Just) preferredbackend
fmap fst <$> genKey source backend
Just k -> return (Just k)
let src = contentLocation source
ms <- liftIO $ catchMaybeIO $ getFileStatus src
mcache <- maybe (pure Nothing) (liftIO . toInodeCache delta src) ms
@ -156,7 +163,7 @@ ingest' preferredbackend (Just (LockedDown cfg source)) = withTSDelta $ \delta -
(Just newc, Just c) | compareStrong c newc -> go k mcache ms
_ -> failure "changed while it was being added"
where
go (Just (key, _)) mcache (Just s)
go (Just key) mcache (Just s)
| lockingFile cfg = golocked key mcache s
| otherwise = ifM isDirect
( godirect key mcache s