CoW probing

Improved probing when CoW copies can be made between files on the same
drive. Now supports CoW between BTRFS subvolumes. And, falls back to rsync
instead of using cp when CoW won't work, eg copies between repos on the
same EXT4 filesystem.

Rather than trying cp --reflink=always for each file copied to a remote,
it's tried once and if it fails it falls back to using rsync thereafter
for the lifetime of the Remote object. That avoids overhead of calling cp
which while small, will add up over a large number of files.

This commit was sponsored by Boyd Stephen Smith Jr. on Patreon.
This commit is contained in:
Joey Hess 2019-07-17 14:19:00 -04:00
parent 0dc26cd6f1
commit 21ff5e1e5a
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
6 changed files with 100 additions and 37 deletions

View file

@ -23,7 +23,7 @@ tests =
, testCp "cp_a" "-a" , testCp "cp_a" "-a"
, testCp "cp_p" "-p" , testCp "cp_p" "-p"
, testCp "cp_preserve_timestamps" "--preserve=timestamps" , testCp "cp_preserve_timestamps" "--preserve=timestamps"
, testCp "cp_reflink_auto" "--reflink=auto" , testCp "cp_reflink_supported" "--reflink=auto"
, TestCase "xargs -0" $ testCmd "xargs_0" "xargs -0 </dev/null" , TestCase "xargs -0" $ testCmd "xargs_0" "xargs -0 </dev/null"
, TestCase "rsync" $ testCmd "rsync" "rsync --version >/dev/null" , TestCase "rsync" $ testCmd "rsync" "rsync --version >/dev/null"
, TestCase "curl" $ testCmd "curl" "curl --version >/dev/null" , TestCase "curl" $ testCmd "curl" "curl --version >/dev/null"

View file

@ -8,6 +8,10 @@ git-annex (7.20190709) UNRELEASED; urgency=medium
optimised for 4-way CPUs. optimised for 4-way CPUs.
* Support running v7 upgrade in a repo where there is no branch checked * Support running v7 upgrade in a repo where there is no branch checked
out, but HEAD is set directly to some other ref. out, but HEAD is set directly to some other ref.
* Improved probing when CoW copies can be made between files on the same
drive. Now supports CoW between BTRFS subvolumes. And, falls back to rsync
instead of using cp when CoW won't work, eg copies between repos on the
same EXT4 filesystem.
-- Joey Hess <id@joeyh.name> Mon, 08 Jul 2019 08:59:54 -0400 -- Joey Hess <id@joeyh.name> Mon, 08 Jul 2019 08:59:54 -0400

View file

@ -1,6 +1,6 @@
{- Standard git remotes. {- Standard git remotes.
- -
- Copyright 2011-2018 Joey Hess <id@joeyh.name> - Copyright 2011-2019 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
@ -335,7 +335,7 @@ inAnnex rmt st key = do
inAnnex' repo rmt st key inAnnex' repo rmt st key
inAnnex' :: Git.Repo -> Remote -> State -> Key -> Annex Bool inAnnex' :: Git.Repo -> Remote -> State -> Key -> Annex Bool
inAnnex' repo rmt (State connpool duc _) key inAnnex' repo rmt (State connpool duc _ _) key
| Git.repoIsHttp repo = checkhttp | Git.repoIsHttp repo = checkhttp
| Git.repoIsUrl repo = checkremote | Git.repoIsUrl repo = checkremote
| otherwise = checklocal | otherwise = checklocal
@ -382,7 +382,7 @@ dropKey r st key = do
(\e -> warning (show e) >> return False) (\e -> warning (show e) >> return False)
dropKey' :: Git.Repo -> Remote -> State -> Key -> Annex Bool dropKey' :: Git.Repo -> Remote -> State -> Key -> Annex Bool
dropKey' repo r (State connpool duc _) key dropKey' repo r (State connpool duc _ _) key
| not $ Git.repoIsUrl repo = ifM duc | not $ Git.repoIsUrl repo = ifM duc
( guardUsable repo (return False) $ ( guardUsable repo (return False) $
commitOnCleanup repo r $ onLocalFast repo r $ do commitOnCleanup repo r $ onLocalFast repo r $ do
@ -406,7 +406,7 @@ lockKey r st key callback = do
lockKey' repo r st key callback lockKey' repo r st key callback
lockKey' :: Git.Repo -> Remote -> State -> Key -> (VerifiedCopy -> Annex r) -> Annex r lockKey' :: Git.Repo -> Remote -> State -> Key -> (VerifiedCopy -> Annex r) -> Annex r
lockKey' repo r (State connpool duc _) key callback lockKey' repo r (State connpool duc _ _) key callback
| not $ Git.repoIsUrl repo = ifM duc | not $ Git.repoIsUrl repo = ifM duc
( guardUsable repo failedlock $ do ( guardUsable repo failedlock $ do
inorigrepo <- Annex.makeRunner inorigrepo <- Annex.makeRunner
@ -474,7 +474,7 @@ copyFromRemote' forcersync r st key file dest meterupdate = do
copyFromRemote'' repo forcersync r st key file dest meterupdate copyFromRemote'' repo forcersync r st key file dest meterupdate
copyFromRemote'' :: Git.Repo -> Bool -> Remote -> State -> Key -> AssociatedFile -> FilePath -> MeterUpdate -> Annex (Bool, Verification) copyFromRemote'' :: Git.Repo -> Bool -> Remote -> State -> Key -> AssociatedFile -> FilePath -> MeterUpdate -> Annex (Bool, Verification)
copyFromRemote'' repo forcersync r (State connpool _ _) key file dest meterupdate copyFromRemote'' repo forcersync r st@(State connpool _ _ _) key file dest meterupdate
| Git.repoIsHttp repo = unVerified $ do | Git.repoIsHttp repo = unVerified $ do
gc <- Annex.getGitConfig gc <- Annex.getGitConfig
Annex.Content.downloadUrl key meterupdate (keyUrls gc repo r key) dest Annex.Content.downloadUrl key meterupdate (keyUrls gc repo r key) dest
@ -489,7 +489,7 @@ copyFromRemote'' repo forcersync r (State connpool _ _) key file dest meterupdat
case v of case v of
Nothing -> return (False, UnVerified) Nothing -> return (False, UnVerified)
Just (object, checksuccess) -> do Just (object, checksuccess) -> do
copier <- mkCopier hardlink params copier <- mkCopier hardlink st params
runTransfer (Transfer Download u key) runTransfer (Transfer Download u key)
file stdRetry file stdRetry
(\p -> copier object dest (combineMeterUpdate p meterupdate) checksuccess) (\p -> copier object dest (combineMeterUpdate p meterupdate) checksuccess)
@ -600,7 +600,7 @@ copyToRemote r st key file meterupdate = do
copyToRemote' repo r st key file meterupdate copyToRemote' repo r st key file meterupdate
copyToRemote' :: Git.Repo -> Remote -> State -> Key -> AssociatedFile -> MeterUpdate -> Annex Bool copyToRemote' :: Git.Repo -> Remote -> State -> Key -> AssociatedFile -> MeterUpdate -> Annex Bool
copyToRemote' repo r (State connpool duc _) key file meterupdate copyToRemote' repo r st@(State connpool duc _ _) key file meterupdate
| not $ Git.repoIsUrl repo = ifM duc | not $ Git.repoIsUrl repo = ifM duc
( guardUsable repo (return False) $ commitOnCleanup repo r $ ( guardUsable repo (return False) $ commitOnCleanup repo r $
copylocal =<< Annex.Content.prepSendAnnex key copylocal =<< Annex.Content.prepSendAnnex key
@ -627,7 +627,7 @@ copyToRemote' repo r (State connpool duc _) key file meterupdate
( return True ( return True
, do , do
ensureInitialized ensureInitialized
copier <- mkCopier hardlink params copier <- mkCopier hardlink st params
let verify = Annex.Content.RemoteVerify r let verify = Annex.Content.RemoteVerify r
let rsp = RetrievalAllKeysSecure let rsp = RetrievalAllKeysSecure
runTransfer (Transfer Download u key) file stdRetry $ \p -> runTransfer (Transfer Download u key) file stdRetry $ \p ->
@ -704,27 +704,46 @@ onLocal repo r a = do
onLocalFast :: Git.Repo -> Remote -> Annex a -> Annex a onLocalFast :: Git.Repo -> Remote -> Annex a -> Annex a
onLocalFast repo r a = onLocal repo r $ Annex.BranchState.disableUpdate >> a onLocalFast repo r a = onLocal repo r $ Annex.BranchState.disableUpdate >> a
{- Copys a file with rsync unless both locations are on the same -- To avoid the overhead of trying copy-on-write every time, it's tried
- filesystem. Then cp could be faster. -} -- once and if it fails, is not tried again.
rsyncOrCopyFile :: [CommandParam] -> FilePath -> FilePath -> MeterUpdate -> Annex Bool newtype CopyCoWTried = CopyCoWTried (MVar Bool)
rsyncOrCopyFile rsyncparams src dest p =
newCopyCoWTried :: IO CopyCoWTried
newCopyCoWTried = CopyCoWTried <$> newEmptyMVar
{- Copys a file. Uses copy-on-write if it is supported. Otherwise,
- uses rsync, so that interrupted copies can be resumed. -}
rsyncOrCopyFile :: State -> [CommandParam] -> FilePath -> FilePath -> MeterUpdate -> Annex Bool
rsyncOrCopyFile st rsyncparams src dest p =
#ifdef mingw32_HOST_OS #ifdef mingw32_HOST_OS
-- rsync is only available on Windows in some inatallation methods, -- rsync is only available on Windows in some installation methods,
-- and is not strictly needed here, so don't use it. -- and is not strictly needed here, so don't use it.
docopy docopywith copyFileExternal
where where
#else #else
ifM (sameDeviceIds src dest) (docopy, dorsync) -- If multiple threads reach this at the same time, they
-- will both try CoW, which is acceptable.
ifM (liftIO $ isEmptyMVar copycowtried)
( do
ok <- docopycow
void $ liftIO $ tryPutMVar copycowtried ok
pure ok <||> dorsync
, ifM (liftIO $ readMVar copycowtried)
( docopycow <||> dorsync
, dorsync
)
)
where where
sameDeviceIds a b = (==) <$> getDeviceId a <*> getDeviceId b copycowtried = case st of
getDeviceId f = deviceID <$> liftIO (getFileStatus $ parentDir f) State _ _ (CopyCoWTried v) _ -> v
dorsync = do dorsync = do
oh <- mkOutputHandler oh <- mkOutputHandler
Ssh.rsyncHelper oh (Just p) $ Ssh.rsyncHelper oh (Just p) $
rsyncparams ++ [File src, File dest] rsyncparams ++ [File src, File dest]
docopycow = docopywith copyCoW
#endif #endif
docopy = liftIO $ watchFileSize dest p $ docopywith a = liftIO $ watchFileSize dest p $
copyFileExternal CopyTimeStamps src dest a CopyTimeStamps src dest
commitOnCleanup :: Git.Repo -> Remote -> Annex a -> Annex a commitOnCleanup :: Git.Repo -> Remote -> Annex a -> Annex a
commitOnCleanup repo r a = go `after` a commitOnCleanup repo r a = go `after` a
@ -768,10 +787,10 @@ wantHardLink = (annexHardLink <$> Annex.getGitConfig)
-- done. -- done.
type Copier = FilePath -> FilePath -> MeterUpdate -> Annex Bool -> Annex (Bool, Verification) type Copier = FilePath -> FilePath -> MeterUpdate -> Annex Bool -> Annex (Bool, Verification)
mkCopier :: Bool -> [CommandParam] -> Annex Copier mkCopier :: Bool -> State -> [CommandParam] -> Annex Copier
mkCopier remotewanthardlink rsyncparams = do mkCopier remotewanthardlink st rsyncparams = do
let copier = \src dest p check -> unVerified $ let copier = \src dest p check -> unVerified $
rsyncOrCopyFile rsyncparams src dest p <&&> check rsyncOrCopyFile st rsyncparams src dest p <&&> check
localwanthardlink <- wantHardLink localwanthardlink <- wantHardLink
let linker = \src dest -> createLink src dest >> return True let linker = \src dest -> createLink src dest >> return True
ifM (pure (remotewanthardlink || localwanthardlink) <&&> not <$> isDirect) ifM (pure (remotewanthardlink || localwanthardlink) <&&> not <$> isDirect)
@ -790,20 +809,21 @@ mkCopier remotewanthardlink rsyncparams = do
- This returns False when the repository UUID is not as expected. -} - This returns False when the repository UUID is not as expected. -}
type DeferredUUIDCheck = Annex Bool type DeferredUUIDCheck = Annex Bool
data State = State Ssh.P2PSshConnectionPool DeferredUUIDCheck (Annex (Git.Repo, GitConfig)) data State = State Ssh.P2PSshConnectionPool DeferredUUIDCheck CopyCoWTried (Annex (Git.Repo, GitConfig))
getRepoFromState :: State -> Annex Git.Repo getRepoFromState :: State -> Annex Git.Repo
getRepoFromState (State _ _ a) = fst <$> a getRepoFromState (State _ _ _ a) = fst <$> a
{- The config of the remote git repository, cached for speed. -} {- The config of the remote git repository, cached for speed. -}
getGitConfigFromState :: State -> Annex GitConfig getGitConfigFromState :: State -> Annex GitConfig
getGitConfigFromState (State _ _ a) = snd <$> a getGitConfigFromState (State _ _ _ a) = snd <$> a
mkState :: Git.Repo -> UUID -> RemoteGitConfig -> Annex State mkState :: Git.Repo -> UUID -> RemoteGitConfig -> Annex State
mkState r u gc = do mkState r u gc = do
pool <- Ssh.mkP2PSshConnectionPool pool <- Ssh.mkP2PSshConnectionPool
copycowtried <- liftIO newCopyCoWTried
(duc, getrepo) <- go (duc, getrepo) <- go
return $ State pool duc getrepo return $ State pool duc copycowtried getrepo
where where
go go
| remoteAnnexCheckUUID gc = return | remoteAnnexCheckUUID gc = return

View file

@ -1,12 +1,13 @@
{- file copying {- file copying
- -
- Copyright 2010-2014 Joey Hess <id@joeyh.name> - Copyright 2010-2019 Joey Hess <id@joeyh.name>
- -
- License: BSD-2-clause - License: BSD-2-clause
-} -}
module Utility.CopyFile ( module Utility.CopyFile (
copyFileExternal, copyFileExternal,
copyCoW,
createLinkOrCopy, createLinkOrCopy,
CopyMetaData(..) CopyMetaData(..)
) where ) where
@ -22,6 +23,17 @@ data CopyMetaData
| CopyAllMetaData | CopyAllMetaData
deriving (Eq) deriving (Eq)
copyMetaDataParams :: CopyMetaData -> [CommandParam]
copyMetaDataParams meta = map snd $ filter fst
[ (allmeta && BuildInfo.cp_a, Param "-a")
, (allmeta && BuildInfo.cp_p && not BuildInfo.cp_a
, Param "-p")
, (not allmeta && BuildInfo.cp_preserve_timestamps
, Param "--preserve=timestamps")
]
where
allmeta = meta == CopyAllMetaData
{- The cp command is used, because I hate reinventing the wheel, {- The cp command is used, because I hate reinventing the wheel,
- and because this allows easy access to features like cp --reflink. -} - and because this allows easy access to features like cp --reflink. -}
copyFileExternal :: CopyMetaData -> FilePath -> FilePath -> IO Bool copyFileExternal :: CopyMetaData -> FilePath -> FilePath -> IO Bool
@ -30,15 +42,33 @@ copyFileExternal meta src dest = do
removeFile dest removeFile dest
boolSystem "cp" $ params ++ [File src, File dest] boolSystem "cp" $ params ++ [File src, File dest]
where where
params = map snd $ filter fst params
[ (BuildInfo.cp_reflink_auto, Param "--reflink=auto") | BuildInfo.cp_reflink_supported =
, (allmeta && BuildInfo.cp_a, Param "-a") Param "--reflink=auto" : copyMetaDataParams meta
, (allmeta && BuildInfo.cp_p && not BuildInfo.cp_a | otherwise = copyMetaDataParams meta
, Param "-p")
, (not allmeta && BuildInfo.cp_preserve_timestamps {- When a filesystem supports CoW (and cp does), uses it to make
, Param "--preserve=timestamps") - an efficient copy of a file. Otherwise, returns False. -}
] copyCoW :: CopyMetaData -> FilePath -> FilePath -> IO Bool
allmeta = meta == CopyAllMetaData copyCoW meta src dest
| BuildInfo.cp_reflink_supported = do
whenM (doesFileExist dest) $
removeFile dest
-- When CoW is not supported, cp will complain to stderr,
-- so have to discard its stderr.
ok <- catchBoolIO $ do
withQuietOutput createProcessSuccess $
proc "cp" $ toCommand $
params ++ [File src, File dest]
return True
-- When CoW is not supported, cp creates the destination
-- file but leaves it empty.
unless ok $
void $ tryIO $ removeFile dest
return ok
| otherwise = return False
where
params = Param "--reflink=always" : copyMetaDataParams meta
{- Create a hard link if the filesystem allows it, and fall back to copying {- Create a hard link if the filesystem allows it, and fall back to copying
- the file. -} - the file. -}

View file

@ -17,3 +17,5 @@ If there is some generic benefit from `rsync`, could it may be at least be a con
[[!meta author="yoh"]] [[!meta author="yoh"]]
> [[done]] --[[Joey]]

View file

@ -0,0 +1,7 @@
[[!comment format=mdwn
username="joey"
subject="""comment 2"""
date="2019-07-17T18:13:17Z"
content="""
CoW probing implemented
"""]]