use InodeCache to avoid races in import from directory special remote

This does not avoid all possible races, but it does avoid all likely
ones, and is demonstratably better than git's own handling of races
where files get modified at the same time as it's updating the working
tree.

The main thing this won't detect are not unlikely races where part
of a file gets changed while it's being copied and then the file is
restored to its original condition before the modification check.
No, it's more likely that the limitations of checking inode, size,
and mtime won't detect certian modifications, involving eg mmapped
files.
This commit is contained in:
Joey Hess 2019-03-04 13:20:58 -04:00
parent 51fc969b66
commit 3cd19fb4d0
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
3 changed files with 91 additions and 28 deletions

View file

@ -233,12 +233,14 @@ downloadImport remote importtreeconfig importablecontents = do
(k:_) -> return $ Just (loc, k)
[] -> checkDiskSpaceToGet tmpkey Nothing $
withTmp tmpkey $ \tmpfile ->
Remote.retrieveExportWithContentIdentifier ia loc cid tmpfile (ingestkey loc tmpfile) p >>= \case
Just k -> do
recordcidkey cidmap db cid k
logStatus k InfoPresent
logChange k (Remote.uuid remote) InfoPresent
return $ Just (loc, k)
Remote.retrieveExportWithContentIdentifier ia loc cid tmpfile (mkkey loc tmpfile) p >>= \case
Just k -> tryNonAsync (moveAnnex k tmpfile) >>= \case
Right True -> do
recordcidkey cidmap db cid k
logStatus k InfoPresent
logChange k (Remote.uuid remote) InfoPresent
return $ Just (loc, k)
_ -> return Nothing
Nothing -> return Nothing
where
-- TODO progress bar
@ -246,7 +248,7 @@ downloadImport remote importtreeconfig importablecontents = do
ia = Remote.importActions remote
tmpkey = importKey cid sz
ingestkey loc tmpfile = do
mkkey loc tmpfile = do
f <- fromRepo $ fromTopFilePath $ locworktreefilename loc
backend <- chooseBackend f
let ks = KeySource
@ -254,12 +256,7 @@ downloadImport remote importtreeconfig importablecontents = do
, contentLocation = tmpfile
, inodeCache = Nothing
}
genKey ks backend >>= \case
Nothing -> return Nothing
Just (k, _) ->
tryNonAsync (moveAnnex k tmpfile) >>= \case
Right True -> return (Just k)
_ -> return Nothing
fmap fst <$> genKey ks backend
locworktreefilename loc = asTopFilePath $ case importtreeconfig of
ImportTree -> fromImportLocation loc

View file

@ -33,6 +33,7 @@ import Annex.Content
import Annex.UUID
import Utility.Metered
import Utility.Tmp
import Utility.InodeCache
remote :: RemoteType
remote = RemoteType
@ -300,22 +301,90 @@ listImportableContentsM :: FilePath -> Annex (Maybe (ImportableContents (Content
listImportableContentsM dir = catchMaybeIO $ liftIO $ do
l <- dirContentsRecursive dir
l' <- mapM go l
return $ ImportableContents l' []
return $ ImportableContents (catMaybes l') []
where
go f = do
sz <- getFileSize f
loc <- mkImportLocation <$> relPathDirToFile dir f
-- TODO use inode, size etc
let cid = ContentIdentifier $ encodeBS f
return (loc, (cid, sz))
st <- getFileStatus f
mkContentIdentifier f st >>= \case
Nothing -> return Nothing
Just cid -> do
relf <- relPathDirToFile dir f
sz <- getFileSize' f st
return $ Just (mkImportLocation relf, (cid, sz))
-- Make a ContentIdentifier that contains an InodeCache.
--
-- The InodeCache is generated without checking a sentinal file.
-- So in a case when a remount etc causes all the inodes to change,
-- files may appear to be modified when they are not, which will only
-- result in extra work to re-import them.
--
-- If the file is not a regular file, this will return Nothing.
mkContentIdentifier :: FilePath -> FileStatus -> IO (Maybe ContentIdentifier)
mkContentIdentifier f st =
fmap (ContentIdentifier . encodeBS . showInodeCache)
<$> toInodeCache noTSDelta f st
retrieveExportWithContentIdentifierM :: FilePath -> ExportLocation -> ContentIdentifier -> FilePath -> Annex (Maybe Key) -> MeterUpdate -> Annex (Maybe Key)
retrieveExportWithContentIdentifierM dir loc cid dest mkkey p = catchDefaultIO Nothing $ do
-- TODO check the ContentIdentifier is valid (avoiding all races)
let f = dir </> fromExportLocation loc
h <- liftIO $ openBinaryFile f ReadMode
liftIO $ hGetContentsMetered h p >>= L.writeFile dest
mkkey
retrieveExportWithContentIdentifierM dir loc cid dest mkkey p =
catchDefaultIO Nothing $ precheck $ docopy postcheck
where
f = dir </> fromExportLocation loc
docopy cont = do
#ifndef mingw32_HOST_OS
-- Need a duplicate fd for the post check, since
-- hGetContentsMetered closes its handle.
fd <- liftIO $ openFd f ReadOnly Nothing defaultFileFlags
dupfd <- liftIO $ dup fd
h <- liftIO $ fdToHandle fd
#else
h <- liftIO $ openBinaryFile f ReadMode
#endif
liftIO $ hGetContentsMetered h p >>= L.writeFile dest
k <- mkkey
#ifndef mingw32_HOST_OS
cont dupfd (return k)
#else
cont (return k)
#endif
-- Check before copy, to avoid expensive copy of wrong file
-- content.
precheck cont = comparecid cont
=<< liftIO . mkContentIdentifier f
=<< liftIO (getFileStatus f)
-- Check after copy, in case the file was changed while it was
-- being copied.
--
-- When possible (not on Windows), check the same handle
-- Check the same handle that the file was copied from.
-- Avoids some race cases where the file is modified while
-- it's copied but then gets restored to the original content
-- afterwards.
--
-- This does not guard against every possible race, but neither
-- can InodeCaches detect every possible modification to a file.
-- It's probably as good or better than git's handling of similar
-- situations with files being modified while it's updating the
-- working tree for a merge.
#ifndef mingw32_HOST_OS
postcheck fd cont = do
#else
postcheck cont = do
#endif
currcid <- liftIO $ mkContentIdentifier f
#ifndef mingw32_HOST_OS
=<< getFdStatus fd
#else
=<< getFileStatus f
#endif
comparecid cont currcid
comparecid cont currcid
| currcid == Just cid = cont
| otherwise = return Nothing
storeExportWithContentIdentifierM :: FilePath -> FilePath -> Key -> ExportLocation -> [ContentIdentifier] -> MeterUpdate -> Annex (Maybe ContentIdentifier)
storeExportWithContentIdentifierM dir = error "TODO"

View file

@ -10,9 +10,6 @@ this.
## implementation notes
* directory special remote import is prototype, does not notice modified
files, not race safe
* Need to support annex-tracking-branch configuration, which documentation
says makes git-annex sync and assistant do imports.