stop using rsync in fileCopier

This is groundwork for calculating checksums while copying, rather than
in a separate pass, but that's not done yet. For now, avoid using rsync
(and cp on Windows), and instead read and write the file ourselves, with
resume handling.

Benchmarking vs old git-annex that used rsync, this is faster,
at least once the file size is larger than a couple of MB.
This commit is contained in:
Joey Hess 2021-02-10 14:44:35 -04:00
parent c4c9b99e22
commit 48f63c2798
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38

View file

@ -1,6 +1,6 @@
{- Standard git remotes. {- Standard git remotes.
- -
- Copyright 2011-2020 Joey Hess <id@joeyh.name> - Copyright 2011-2021 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
@ -48,6 +48,7 @@ import Utility.CopyFile
import Utility.Env import Utility.Env
import Utility.Batch import Utility.Batch
import Utility.SimpleProtocol import Utility.SimpleProtocol
import Utility.Touch
import Remote.Helper.Git import Remote.Helper.Git
import Remote.Helper.Messages import Remote.Helper.Messages
import Remote.Helper.ExportImport import Remote.Helper.ExportImport
@ -73,6 +74,7 @@ import Control.Concurrent
import Control.Concurrent.MSampleVar import Control.Concurrent.MSampleVar
import qualified Data.Map as M import qualified Data.Map as M
import qualified Data.ByteString as S import qualified Data.ByteString as S
import Data.Time.Clock.POSIX
import Network.URI import Network.URI
remote :: RemoteType remote :: RemoteType
@ -535,13 +537,12 @@ copyFromRemote'' repo forcersync r st@(State connpool _ _ _ _) key file dest met
giveup "failed to download content" giveup "failed to download content"
return UnVerified return UnVerified
| not $ Git.repoIsUrl repo = guardUsable repo (giveup "cannot access remote") $ do | not $ Git.repoIsUrl repo = guardUsable repo (giveup "cannot access remote") $ do
params <- Ssh.rsyncParams r Download
u <- getUUID u <- getUUID
hardlink <- wantHardLink hardlink <- wantHardLink
-- run copy from perspective of remote -- run copy from perspective of remote
onLocalFast st $ Annex.Content.prepSendAnnex key >>= \case onLocalFast st $ Annex.Content.prepSendAnnex key >>= \case
Just (object, checksuccess) -> do Just (object, checksuccess) -> do
copier <- mkCopier hardlink st params copier <- mkCopier hardlink st
(ok, v) <- runTransfer (Transfer Download u (fromKey id key)) (ok, v) <- runTransfer (Transfer Download u (fromKey id key))
file Nothing stdRetry $ \p -> file Nothing stdRetry $ \p ->
metered (Just (combineMeterUpdate p meterupdate)) key $ \_ p' -> metered (Just (combineMeterUpdate p meterupdate)) key $ \_ p' ->
@ -678,14 +679,13 @@ copyToRemote' repo r st@(State connpool duc _ _ _) key file meterupdate
-- the remote's Annex, but it needs access to the local -- the remote's Annex, but it needs access to the local
-- Annex monad's state. -- Annex monad's state.
checksuccessio <- Annex.withCurrentState checksuccess checksuccessio <- Annex.withCurrentState checksuccess
params <- Ssh.rsyncParams r Upload
u <- getUUID u <- getUUID
hardlink <- wantHardLink hardlink <- wantHardLink
-- run copy from perspective of remote -- run copy from perspective of remote
res <- onLocalFast st $ ifM (Annex.Content.inAnnex key) res <- onLocalFast st $ ifM (Annex.Content.inAnnex key)
( return True ( return True
, runTransfer (Transfer Download u (fromKey id key)) file Nothing stdRetry $ \p -> do , runTransfer (Transfer Download u (fromKey id key)) file Nothing stdRetry $ \p -> do
copier <- mkCopier hardlink st params copier <- mkCopier hardlink st
let verify = Annex.Content.RemoteVerify r let verify = Annex.Content.RemoteVerify r
let rsp = RetrievalAllKeysSecure let rsp = RetrievalAllKeysSecure
res <- logStatusAfter key $ Annex.Content.getViaTmp rsp verify key file $ \dest -> res <- logStatusAfter key $ Annex.Content.getViaTmp rsp verify key file $ \dest ->
@ -827,9 +827,9 @@ wantHardLink = (annexHardLink <$> Annex.getGitConfig)
-- copying it. -- copying it.
type Copier = FilePath -> FilePath -> Key -> MeterUpdate -> Annex Bool -> Annex (Bool, Verification) type Copier = FilePath -> FilePath -> Key -> MeterUpdate -> Annex Bool -> Annex (Bool, Verification)
mkCopier :: Bool -> State -> [CommandParam] -> Annex Copier mkCopier :: Bool -> State -> Annex Copier
mkCopier remotewanthardlink st rsyncparams = do mkCopier remotewanthardlink st = do
let copier = fileCopier st rsyncparams let copier = fileCopier st
localwanthardlink <- wantHardLink localwanthardlink <- wantHardLink
let linker = \src dest -> createLink src dest >> return True let linker = \src dest -> createLink src dest >> return True
if remotewanthardlink || localwanthardlink if remotewanthardlink || localwanthardlink
@ -906,16 +906,25 @@ newCopyCoWTried :: IO CopyCoWTried
newCopyCoWTried = CopyCoWTried <$> newEmptyMVar newCopyCoWTried = CopyCoWTried <$> newEmptyMVar
{- Copys a file. Uses copy-on-write if it is supported. Otherwise, {- Copys a file. Uses copy-on-write if it is supported. Otherwise,
- uses rsync, so that interrupted copies can be resumed. -} - copies the file itself. If the destination already exists,
fileCopier :: State -> [CommandParam] -> Copier - an interruped copy will resume where it left off.
-
- When copy-on-write is used, returns UnVerified, because the content of
- the file has not been verified to be correct. When the file has to be
- read to copy it, a hash is calulated at the same time.
-
- Note that, when the destination file already exists, it's read both
- to start calculating the hash, and also to verify that its content is
- the same as the start of the source file. It's possible that the
- destination file was created from some other source file,
- (eg when isStableKey is false), and doing this avoids getting a
- corrupted file in such cases.
-}
fileCopier :: State -> Copier
#ifdef mingw32_HOST_OS #ifdef mingw32_HOST_OS
rsyncOrCopyFile _st _rsyncparams src dest k p check = fileCopier _st src dest k meterupdate check = docopy
-- rsync is only available on Windows in some installation methods,
-- and is not strictly needed here, so don't use it.
unVerified $ docopywith copyFileExternal <&&> check
where
#else #else
fileCopier st rsyncparams src dest _k p check = fileCopier st src dest k meterupdate check =
-- If multiple threads reach this at the same time, they -- If multiple threads reach this at the same time, they
-- will both try CoW, which is acceptable. -- will both try CoW, which is acceptable.
ifM (liftIO $ isEmptyMVar copycowtried) ifM (liftIO $ isEmptyMVar copycowtried)
@ -924,26 +933,83 @@ fileCopier st rsyncparams src dest _k p check =
void $ liftIO $ tryPutMVar copycowtried ok void $ liftIO $ tryPutMVar copycowtried ok
if ok if ok
then unVerified check then unVerified check
else unVerified $ dorsync <&&> check else docopy
, ifM (liftIO $ readMVar copycowtried) , ifM (liftIO $ readMVar copycowtried)
( do ( do
ok <- docopycow ok <- docopycow
if ok if ok
then unVerified check then unVerified check
else unVerified $ dorsync <&&> check else docopy
, unVerified dorsync , docopy
) )
) )
where where
copycowtried = case st of copycowtried = case st of
State _ _ (CopyCoWTried v) _ _ -> v State _ _ (CopyCoWTried v) _ _ -> v
dorsync = do docopycow = liftIO $ watchFileSize dest meterupdate $
-- dest may already exist, so make sure rsync can write to it copyCoW CopyTimeStamps src dest
void $ liftIO $ tryIO $ allowWrite (toRawFilePath dest)
oh <- mkOutputHandlerQuiet
Ssh.rsyncHelper oh (Just p) $
rsyncparams ++ [File src, File dest]
docopycow = docopywith copyCoW
#endif #endif
docopywith a = liftIO $ watchFileSize dest p $
a CopyTimeStamps src dest dest' = toRawFilePath dest
docopy = do
-- The file might have had the write bit removed,
-- so make sure we can write to it.
void $ liftIO $ tryIO $ allowWrite dest'
liftIO $ withBinaryFile dest ReadWriteMode $ \hdest ->
withBinaryFile src ReadMode $ \hsrc -> do
sofar <- compareexisting hdest hsrc zeroBytesProcessed
docopy' hdest hsrc sofar
-- Copy src mode and mtime.
mode <- liftIO $ fileMode <$> getFileStatus src
mtime <- liftIO $ utcTimeToPOSIXSeconds <$> getModificationTime src
liftIO $ setFileMode dest mode
liftIO $ touch dest' mtime False
ifM check
( return (True, UnVerified)
, return (False, UnVerified)
)
docopy' hdest hsrc sofar = do
s <- S.hGet hsrc defaultChunkSize
if s == S.empty
then return ()
else do
let sofar' = addBytesProcessed sofar (S.length s)
S.hPut hdest s
meterupdate sofar'
docopy' hdest hsrc sofar'
-- Leaves hdest and hsrc seeked to wherever the two diverge,
-- so typically hdest will be seeked to end, and hsrc to the same
-- position.
compareexisting hdest hsrc sofar = do
s <- S.hGet hdest defaultChunkSize
if s == S.empty
then return sofar
else do
s' <- getnoshort (S.length s) hsrc
if s == s'
then do
let sofar' = addBytesProcessed sofar (S.length s)
meterupdate sofar'
compareexisting hdest hsrc sofar'
else do
seekbefore hdest s
seekbefore hsrc s'
return sofar
seekbefore h s = hSeek h RelativeSeek (fromIntegral (-1*S.length s))
-- Like hGet, but never returns less than the requested number of
-- bytes, unless it reaches EOF.
getnoshort n h = do
s <- S.hGet h n
if S.length s == n || S.empty == s
then return s
else do
s' <- getnoshort (n - S.length s) h
return (s <> s')