CoW probing
Improved probing when CoW copies can be made between files on the same drive. Now supports CoW between BTRFS subvolumes. And, falls back to rsync instead of using cp when CoW won't work, eg copies between repos on the same EXT4 filesystem. Rather than trying cp --reflink=always for each file copied to a remote, it's tried once and if it fails it falls back to using rsync thereafter for the lifetime of the Remote object. That avoids overhead of calling cp which while small, will add up over a large number of files. This commit was sponsored by Boyd Stephen Smith Jr. on Patreon.
This commit is contained in:
parent
0dc26cd6f1
commit
21ff5e1e5a
6 changed files with 100 additions and 37 deletions
Build
CHANGELOGRemote
Utility
doc/bugs
Uses_rsync_instead_of___96__cp_--reflink__61__auto__96___on_volumes_of_the_same_BTRFS_partition.mdwn
Uses_rsync_instead_of___96__cp_--reflink__61__auto__96___on_volumes_of_the_same_BTRFS_partition
|
@ -23,7 +23,7 @@ tests =
|
|||
, testCp "cp_a" "-a"
|
||||
, testCp "cp_p" "-p"
|
||||
, testCp "cp_preserve_timestamps" "--preserve=timestamps"
|
||||
, testCp "cp_reflink_auto" "--reflink=auto"
|
||||
, testCp "cp_reflink_supported" "--reflink=auto"
|
||||
, TestCase "xargs -0" $ testCmd "xargs_0" "xargs -0 </dev/null"
|
||||
, TestCase "rsync" $ testCmd "rsync" "rsync --version >/dev/null"
|
||||
, TestCase "curl" $ testCmd "curl" "curl --version >/dev/null"
|
||||
|
|
|
@ -8,6 +8,10 @@ git-annex (7.20190709) UNRELEASED; urgency=medium
|
|||
optimised for 4-way CPUs.
|
||||
* Support running v7 upgrade in a repo where there is no branch checked
|
||||
out, but HEAD is set directly to some other ref.
|
||||
* Improved probing when CoW copies can be made between files on the same
|
||||
drive. Now supports CoW between BTRFS subvolumes. And, falls back to rsync
|
||||
instead of using cp when CoW won't work, eg copies between repos on the
|
||||
same EXT4 filesystem.
|
||||
|
||||
-- Joey Hess <id@joeyh.name> Mon, 08 Jul 2019 08:59:54 -0400
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{- Standard git remotes.
|
||||
-
|
||||
- Copyright 2011-2018 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2011-2019 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- Licensed under the GNU AGPL version 3 or higher.
|
||||
-}
|
||||
|
@ -335,7 +335,7 @@ inAnnex rmt st key = do
|
|||
inAnnex' repo rmt st key
|
||||
|
||||
inAnnex' :: Git.Repo -> Remote -> State -> Key -> Annex Bool
|
||||
inAnnex' repo rmt (State connpool duc _) key
|
||||
inAnnex' repo rmt (State connpool duc _ _) key
|
||||
| Git.repoIsHttp repo = checkhttp
|
||||
| Git.repoIsUrl repo = checkremote
|
||||
| otherwise = checklocal
|
||||
|
@ -382,7 +382,7 @@ dropKey r st key = do
|
|||
(\e -> warning (show e) >> return False)
|
||||
|
||||
dropKey' :: Git.Repo -> Remote -> State -> Key -> Annex Bool
|
||||
dropKey' repo r (State connpool duc _) key
|
||||
dropKey' repo r (State connpool duc _ _) key
|
||||
| not $ Git.repoIsUrl repo = ifM duc
|
||||
( guardUsable repo (return False) $
|
||||
commitOnCleanup repo r $ onLocalFast repo r $ do
|
||||
|
@ -406,7 +406,7 @@ lockKey r st key callback = do
|
|||
lockKey' repo r st key callback
|
||||
|
||||
lockKey' :: Git.Repo -> Remote -> State -> Key -> (VerifiedCopy -> Annex r) -> Annex r
|
||||
lockKey' repo r (State connpool duc _) key callback
|
||||
lockKey' repo r (State connpool duc _ _) key callback
|
||||
| not $ Git.repoIsUrl repo = ifM duc
|
||||
( guardUsable repo failedlock $ do
|
||||
inorigrepo <- Annex.makeRunner
|
||||
|
@ -474,7 +474,7 @@ copyFromRemote' forcersync r st key file dest meterupdate = do
|
|||
copyFromRemote'' repo forcersync r st key file dest meterupdate
|
||||
|
||||
copyFromRemote'' :: Git.Repo -> Bool -> Remote -> State -> Key -> AssociatedFile -> FilePath -> MeterUpdate -> Annex (Bool, Verification)
|
||||
copyFromRemote'' repo forcersync r (State connpool _ _) key file dest meterupdate
|
||||
copyFromRemote'' repo forcersync r st@(State connpool _ _ _) key file dest meterupdate
|
||||
| Git.repoIsHttp repo = unVerified $ do
|
||||
gc <- Annex.getGitConfig
|
||||
Annex.Content.downloadUrl key meterupdate (keyUrls gc repo r key) dest
|
||||
|
@ -489,7 +489,7 @@ copyFromRemote'' repo forcersync r (State connpool _ _) key file dest meterupdat
|
|||
case v of
|
||||
Nothing -> return (False, UnVerified)
|
||||
Just (object, checksuccess) -> do
|
||||
copier <- mkCopier hardlink params
|
||||
copier <- mkCopier hardlink st params
|
||||
runTransfer (Transfer Download u key)
|
||||
file stdRetry
|
||||
(\p -> copier object dest (combineMeterUpdate p meterupdate) checksuccess)
|
||||
|
@ -600,7 +600,7 @@ copyToRemote r st key file meterupdate = do
|
|||
copyToRemote' repo r st key file meterupdate
|
||||
|
||||
copyToRemote' :: Git.Repo -> Remote -> State -> Key -> AssociatedFile -> MeterUpdate -> Annex Bool
|
||||
copyToRemote' repo r (State connpool duc _) key file meterupdate
|
||||
copyToRemote' repo r st@(State connpool duc _ _) key file meterupdate
|
||||
| not $ Git.repoIsUrl repo = ifM duc
|
||||
( guardUsable repo (return False) $ commitOnCleanup repo r $
|
||||
copylocal =<< Annex.Content.prepSendAnnex key
|
||||
|
@ -627,7 +627,7 @@ copyToRemote' repo r (State connpool duc _) key file meterupdate
|
|||
( return True
|
||||
, do
|
||||
ensureInitialized
|
||||
copier <- mkCopier hardlink params
|
||||
copier <- mkCopier hardlink st params
|
||||
let verify = Annex.Content.RemoteVerify r
|
||||
let rsp = RetrievalAllKeysSecure
|
||||
runTransfer (Transfer Download u key) file stdRetry $ \p ->
|
||||
|
@ -704,27 +704,46 @@ onLocal repo r a = do
|
|||
onLocalFast :: Git.Repo -> Remote -> Annex a -> Annex a
|
||||
onLocalFast repo r a = onLocal repo r $ Annex.BranchState.disableUpdate >> a
|
||||
|
||||
{- Copys a file with rsync unless both locations are on the same
|
||||
- filesystem. Then cp could be faster. -}
|
||||
rsyncOrCopyFile :: [CommandParam] -> FilePath -> FilePath -> MeterUpdate -> Annex Bool
|
||||
rsyncOrCopyFile rsyncparams src dest p =
|
||||
-- To avoid the overhead of trying copy-on-write every time, it's tried
|
||||
-- once and if it fails, is not tried again.
|
||||
newtype CopyCoWTried = CopyCoWTried (MVar Bool)
|
||||
|
||||
newCopyCoWTried :: IO CopyCoWTried
|
||||
newCopyCoWTried = CopyCoWTried <$> newEmptyMVar
|
||||
|
||||
{- Copys a file. Uses copy-on-write if it is supported. Otherwise,
|
||||
- uses rsync, so that interrupted copies can be resumed. -}
|
||||
rsyncOrCopyFile :: State -> [CommandParam] -> FilePath -> FilePath -> MeterUpdate -> Annex Bool
|
||||
rsyncOrCopyFile st rsyncparams src dest p =
|
||||
#ifdef mingw32_HOST_OS
|
||||
-- rsync is only available on Windows in some inatallation methods,
|
||||
-- rsync is only available on Windows in some installation methods,
|
||||
-- and is not strictly needed here, so don't use it.
|
||||
docopy
|
||||
docopywith copyFileExternal
|
||||
where
|
||||
#else
|
||||
ifM (sameDeviceIds src dest) (docopy, dorsync)
|
||||
-- If multiple threads reach this at the same time, they
|
||||
-- will both try CoW, which is acceptable.
|
||||
ifM (liftIO $ isEmptyMVar copycowtried)
|
||||
( do
|
||||
ok <- docopycow
|
||||
void $ liftIO $ tryPutMVar copycowtried ok
|
||||
pure ok <||> dorsync
|
||||
, ifM (liftIO $ readMVar copycowtried)
|
||||
( docopycow <||> dorsync
|
||||
, dorsync
|
||||
)
|
||||
)
|
||||
where
|
||||
sameDeviceIds a b = (==) <$> getDeviceId a <*> getDeviceId b
|
||||
getDeviceId f = deviceID <$> liftIO (getFileStatus $ parentDir f)
|
||||
copycowtried = case st of
|
||||
State _ _ (CopyCoWTried v) _ -> v
|
||||
dorsync = do
|
||||
oh <- mkOutputHandler
|
||||
Ssh.rsyncHelper oh (Just p) $
|
||||
rsyncparams ++ [File src, File dest]
|
||||
docopycow = docopywith copyCoW
|
||||
#endif
|
||||
docopy = liftIO $ watchFileSize dest p $
|
||||
copyFileExternal CopyTimeStamps src dest
|
||||
docopywith a = liftIO $ watchFileSize dest p $
|
||||
a CopyTimeStamps src dest
|
||||
|
||||
commitOnCleanup :: Git.Repo -> Remote -> Annex a -> Annex a
|
||||
commitOnCleanup repo r a = go `after` a
|
||||
|
@ -768,10 +787,10 @@ wantHardLink = (annexHardLink <$> Annex.getGitConfig)
|
|||
-- done.
|
||||
type Copier = FilePath -> FilePath -> MeterUpdate -> Annex Bool -> Annex (Bool, Verification)
|
||||
|
||||
mkCopier :: Bool -> [CommandParam] -> Annex Copier
|
||||
mkCopier remotewanthardlink rsyncparams = do
|
||||
mkCopier :: Bool -> State -> [CommandParam] -> Annex Copier
|
||||
mkCopier remotewanthardlink st rsyncparams = do
|
||||
let copier = \src dest p check -> unVerified $
|
||||
rsyncOrCopyFile rsyncparams src dest p <&&> check
|
||||
rsyncOrCopyFile st rsyncparams src dest p <&&> check
|
||||
localwanthardlink <- wantHardLink
|
||||
let linker = \src dest -> createLink src dest >> return True
|
||||
ifM (pure (remotewanthardlink || localwanthardlink) <&&> not <$> isDirect)
|
||||
|
@ -790,20 +809,21 @@ mkCopier remotewanthardlink rsyncparams = do
|
|||
- This returns False when the repository UUID is not as expected. -}
|
||||
type DeferredUUIDCheck = Annex Bool
|
||||
|
||||
data State = State Ssh.P2PSshConnectionPool DeferredUUIDCheck (Annex (Git.Repo, GitConfig))
|
||||
data State = State Ssh.P2PSshConnectionPool DeferredUUIDCheck CopyCoWTried (Annex (Git.Repo, GitConfig))
|
||||
|
||||
getRepoFromState :: State -> Annex Git.Repo
|
||||
getRepoFromState (State _ _ a) = fst <$> a
|
||||
getRepoFromState (State _ _ _ a) = fst <$> a
|
||||
|
||||
{- The config of the remote git repository, cached for speed. -}
|
||||
getGitConfigFromState :: State -> Annex GitConfig
|
||||
getGitConfigFromState (State _ _ a) = snd <$> a
|
||||
getGitConfigFromState (State _ _ _ a) = snd <$> a
|
||||
|
||||
mkState :: Git.Repo -> UUID -> RemoteGitConfig -> Annex State
|
||||
mkState r u gc = do
|
||||
pool <- Ssh.mkP2PSshConnectionPool
|
||||
copycowtried <- liftIO newCopyCoWTried
|
||||
(duc, getrepo) <- go
|
||||
return $ State pool duc getrepo
|
||||
return $ State pool duc copycowtried getrepo
|
||||
where
|
||||
go
|
||||
| remoteAnnexCheckUUID gc = return
|
||||
|
|
|
@ -1,12 +1,13 @@
|
|||
{- file copying
|
||||
-
|
||||
- Copyright 2010-2014 Joey Hess <id@joeyh.name>
|
||||
- Copyright 2010-2019 Joey Hess <id@joeyh.name>
|
||||
-
|
||||
- License: BSD-2-clause
|
||||
-}
|
||||
|
||||
module Utility.CopyFile (
|
||||
copyFileExternal,
|
||||
copyCoW,
|
||||
createLinkOrCopy,
|
||||
CopyMetaData(..)
|
||||
) where
|
||||
|
@ -22,6 +23,17 @@ data CopyMetaData
|
|||
| CopyAllMetaData
|
||||
deriving (Eq)
|
||||
|
||||
copyMetaDataParams :: CopyMetaData -> [CommandParam]
|
||||
copyMetaDataParams meta = map snd $ filter fst
|
||||
[ (allmeta && BuildInfo.cp_a, Param "-a")
|
||||
, (allmeta && BuildInfo.cp_p && not BuildInfo.cp_a
|
||||
, Param "-p")
|
||||
, (not allmeta && BuildInfo.cp_preserve_timestamps
|
||||
, Param "--preserve=timestamps")
|
||||
]
|
||||
where
|
||||
allmeta = meta == CopyAllMetaData
|
||||
|
||||
{- The cp command is used, because I hate reinventing the wheel,
|
||||
- and because this allows easy access to features like cp --reflink. -}
|
||||
copyFileExternal :: CopyMetaData -> FilePath -> FilePath -> IO Bool
|
||||
|
@ -30,15 +42,33 @@ copyFileExternal meta src dest = do
|
|||
removeFile dest
|
||||
boolSystem "cp" $ params ++ [File src, File dest]
|
||||
where
|
||||
params = map snd $ filter fst
|
||||
[ (BuildInfo.cp_reflink_auto, Param "--reflink=auto")
|
||||
, (allmeta && BuildInfo.cp_a, Param "-a")
|
||||
, (allmeta && BuildInfo.cp_p && not BuildInfo.cp_a
|
||||
, Param "-p")
|
||||
, (not allmeta && BuildInfo.cp_preserve_timestamps
|
||||
, Param "--preserve=timestamps")
|
||||
]
|
||||
allmeta = meta == CopyAllMetaData
|
||||
params
|
||||
| BuildInfo.cp_reflink_supported =
|
||||
Param "--reflink=auto" : copyMetaDataParams meta
|
||||
| otherwise = copyMetaDataParams meta
|
||||
|
||||
{- When a filesystem supports CoW (and cp does), uses it to make
|
||||
- an efficient copy of a file. Otherwise, returns False. -}
|
||||
copyCoW :: CopyMetaData -> FilePath -> FilePath -> IO Bool
|
||||
copyCoW meta src dest
|
||||
| BuildInfo.cp_reflink_supported = do
|
||||
whenM (doesFileExist dest) $
|
||||
removeFile dest
|
||||
-- When CoW is not supported, cp will complain to stderr,
|
||||
-- so have to discard its stderr.
|
||||
ok <- catchBoolIO $ do
|
||||
withQuietOutput createProcessSuccess $
|
||||
proc "cp" $ toCommand $
|
||||
params ++ [File src, File dest]
|
||||
return True
|
||||
-- When CoW is not supported, cp creates the destination
|
||||
-- file but leaves it empty.
|
||||
unless ok $
|
||||
void $ tryIO $ removeFile dest
|
||||
return ok
|
||||
| otherwise = return False
|
||||
where
|
||||
params = Param "--reflink=always" : copyMetaDataParams meta
|
||||
|
||||
{- Create a hard link if the filesystem allows it, and fall back to copying
|
||||
- the file. -}
|
||||
|
|
|
@ -17,3 +17,5 @@ If there is some generic benefit from `rsync`, could it may be at least be a con
|
|||
|
||||
|
||||
[[!meta author="yoh"]]
|
||||
|
||||
> [[done]] --[[Joey]]
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
[[!comment format=mdwn
|
||||
username="joey"
|
||||
subject="""comment 2"""
|
||||
date="2019-07-17T18:13:17Z"
|
||||
content="""
|
||||
CoW probing implemented
|
||||
"""]]
|
Loading…
Add table
Reference in a new issue