diff --git a/Backend/WORM.hs b/Backend/WORM.hs index eaf9a9bcb0..f14ff844bd 100644 --- a/Backend/WORM.hs +++ b/Backend/WORM.hs @@ -27,7 +27,7 @@ backend = Backend , getKey = Just keyValue , verifyKeyContent = Nothing , canUpgradeKey = Just needsUpgrade - , fastMigrate = Just removeSpaces + , fastMigrate = Just removeProblemChars , isStableKey = const True } @@ -48,12 +48,13 @@ keyValue source _ = do , keyMtime = Just $ modificationTime stat } -{- Old WORM keys could contain spaces, and can be upgraded to remove them. -} +{- Old WORM keys could contain spaces and carriage returns, + - and can be upgraded to remove them. -} needsUpgrade :: Key -> Bool -needsUpgrade key = ' ' `S8.elem` fromKey keyName key +needsUpgrade key = any (`S8.elem` fromKey keyName key) [' ', '\r'] -removeSpaces :: Key -> Backend -> AssociatedFile -> Annex (Maybe Key) -removeSpaces oldkey newbackend _ +removeProblemChars :: Key -> Backend -> AssociatedFile -> Annex (Maybe Key) +removeProblemChars oldkey newbackend _ | migratable = return $ Just $ alterKey oldkey $ \d -> d { keyName = encodeBS $ reSanitizeKeyName $ decodeBS $ keyName d } | otherwise = return Nothing diff --git a/CHANGELOG b/CHANGELOG index 3b7fc5f4e6..7a0bac044c 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -27,6 +27,12 @@ git-annex (8.20200618) UNRELEASED; urgency=medium git-annex sync --content --all gets 20% faster. * Fix a recently introduced bug that could cause a "fork: resource exhausted" after getting several thousand files. + * Sped up the --all option by 2x to 16x by using git cat-file --buffer. + Thanks to Lukey for finding this optimisation. + * fsck: Detect if WORM keys contain a carriage return, and recommend + upgrading to fix the problem, because the --all optimisation above + skips any such keys that might exist (git-annex could have maybe + created such keys back in 2013). -- Joey Hess Thu, 18 Jun 2020 12:21:14 -0400 diff --git a/CmdLine/Seek.hs b/CmdLine/Seek.hs index 7b3ae6c241..817763147e 100644 --- a/CmdLine/Seek.hs +++ b/CmdLine/Seek.hs @@ -22,16 +22,19 @@ import qualified Git.LsTree as LsTree import Git.FilePath import qualified Limit import CmdLine.GitAnnex.Options -import Logs.Location +import Logs import Logs.Unused import Types.Transfer import Logs.Transfer import Remote.List import qualified Remote import Annex.CatFile +import Git.CatFile (catObjectStream) import Annex.CurrentBranch import Annex.Content import Annex.InodeSentinal +import qualified Annex.Branch +import qualified Annex.BranchState import qualified Database.Keys import qualified Utility.RawFilePath as R @@ -180,26 +183,55 @@ withKeyOptions' ko auto mkkeyaction fallbackaction params = do giveup "Cannot use --auto in a bare repository" case (null params, ko) of (True, Nothing) - | bare -> noauto $ runkeyaction finishCheck loggedKeys + | bare -> noauto runallkeys | otherwise -> fallbackaction params (False, Nothing) -> fallbackaction params - (True, Just WantAllKeys) -> noauto $ runkeyaction finishCheck loggedKeys - (True, Just WantUnusedKeys) -> noauto $ runkeyaction (pure . Just) unusedKeys' + (True, Just WantAllKeys) -> noauto runallkeys + (True, Just WantUnusedKeys) -> noauto $ runkeyaction unusedKeys' (True, Just WantFailedTransfers) -> noauto runfailedtransfers - (True, Just (WantSpecificKey k)) -> noauto $ runkeyaction (pure . Just) (return [k]) - (True, Just WantIncompleteKeys) -> noauto $ runkeyaction (pure . Just) incompletekeys + (True, Just (WantSpecificKey k)) -> noauto $ runkeyaction (return [k]) + (True, Just WantIncompleteKeys) -> noauto $ runkeyaction incompletekeys (True, Just (WantBranchKeys bs)) -> noauto $ runbranchkeys bs (False, Just _) -> giveup "Can only specify one of file names, --all, --branch, --unused, --failed, --key, or --incomplete" where noauto a | auto = giveup "Cannot use --auto with --all or --branch or --unused or --key or --incomplete" | otherwise = a + incompletekeys = staleKeysPrune gitAnnexTmpObjectDir True - runkeyaction checker getks = do + + -- List all location log files on the git-annex branch, + -- and use those to get keys. Pass through cat-file + -- to get the contents of the location logs, and pre-cache + -- those. This significantly speeds up typical operations + -- that need to look at the location log for each key. + runallkeys = do + keyaction <- mkkeyaction + config <- Annex.getGitConfig + g <- Annex.gitRepo + + void Annex.Branch.update + (l, cleanup) <- inRepo $ LsTree.lsTree + LsTree.LsTreeRecursive + Annex.Branch.fullname + let getk = locationLogFileKey config . getTopFilePath + let go reader = liftIO reader >>= \case + Nothing -> return () + Just (f, content) -> do + case getk f of + Just k -> do + Annex.BranchState.setCache (getTopFilePath f) content + keyaction (k, mkActionItem k) + Nothing -> return () + go reader + catObjectStream l (isJust . getk . LsTree.file) g go + liftIO $ void cleanup + + runkeyaction getks = do keyaction <- mkkeyaction ks <- getks - forM_ ks $ checker >=> maybe noop - (\k -> keyaction (k, mkActionItem k)) + forM_ ks $ \k -> keyaction (k, mkActionItem k) + runbranchkeys bs = do keyaction <- mkkeyaction forM_ bs $ \b -> do @@ -211,6 +243,7 @@ withKeyOptions' ko auto mkkeyaction fallbackaction params = do in keyaction (k, bfp) unlessM (liftIO cleanup) $ error ("git ls-tree " ++ Git.fromRef b ++ " failed") + runfailedtransfers = do keyaction <- mkkeyaction rs <- remoteList diff --git a/Git/CatFile.hs b/Git/CatFile.hs index cd4d8576e1..8927320e89 100644 --- a/Git/CatFile.hs +++ b/Git/CatFile.hs @@ -19,6 +19,7 @@ module Git.CatFile ( catObject, catObjectDetails, catObjectMetaData, + catObjectStream, ) where import System.IO @@ -33,6 +34,7 @@ import Data.Char import Numeric import System.Posix.Types import Text.Read +import Control.Concurrent.Async import Common import Git @@ -42,8 +44,10 @@ import Git.Command import Git.Types import Git.FilePath import Git.HashObject +import qualified Git.LsTree as LsTree import qualified Utility.CoProcess as CoProcess import Utility.Tuple +import Control.Monad.IO.Class (MonadIO) data CatFileHandle = CatFileHandle { catFileProcess :: CoProcess.CoProcessHandle @@ -57,7 +61,7 @@ catFileStart = catFileStart' True catFileStart' :: Bool -> Repo -> IO CatFileHandle catFileStart' restartable repo = CatFileHandle <$> startp "--batch" - <*> startp "--batch-check=%(objectname) %(objecttype) %(objectsize)" + <*> startp ("--batch-check=" ++ batchFormat) <*> pure repo where startp p = gitCoProcessStart restartable @@ -65,6 +69,9 @@ catFileStart' restartable repo = CatFileHandle , Param p ] repo +batchFormat :: String +batchFormat = "%(objectname) %(objecttype) %(objectsize)" + catFileStop :: CatFileHandle -> IO () catFileStop h = do CoProcess.stop (catFileProcess h) @@ -88,18 +95,12 @@ catObjectDetails :: CatFileHandle -> Ref -> IO (Maybe (L.ByteString, Sha, Object catObjectDetails h object = query (catFileProcess h) object newlinefallback $ \from -> do header <- S8.hGetLine from case parseResp object header of - Just (ParsedResp sha objtype size) -> do - content <- S.hGet from (fromIntegral size) - eatchar '\n' from - return $ Just (L.fromChunks [content], sha, objtype) + Just r@(ParsedResp sha objtype _size) -> do + content <- readObjectContent from r + return $ Just (content, sha, objtype) Just DNE -> return Nothing Nothing -> error $ "unknown response from git cat-file " ++ show (header, object) where - eatchar expected from = do - c <- hGetChar from - when (c /= expected) $ - error $ "missing " ++ (show expected) ++ " from git cat-file" - -- Slow fallback path for filenames containing newlines. newlinefallback = queryObjectType object (gitRepo h) >>= \case Nothing -> return Nothing @@ -113,6 +114,18 @@ catObjectDetails h object = query (catFileProcess h) object newlinefallback $ \f (gitRepo h) return (Just (content, sha, objtype)) +readObjectContent :: Handle -> ParsedResp -> IO L.ByteString +readObjectContent h (ParsedResp _ _ size) = do + content <- S.hGet h (fromIntegral size) + eatchar '\n' + return (L.fromChunks [content]) + where + eatchar expected = do + c <- hGetChar h + when (c /= expected) $ + error $ "missing " ++ (show expected) ++ " from git cat-file" +readObjectContent _ DNE = error "internal" + {- Gets the size and type of an object, without reading its content. -} catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Sha, FileSize, ObjectType)) catObjectMetaData h object = query (checkFileProcess h) object newlinefallback $ \from -> do @@ -266,3 +279,69 @@ parseCommit b = Commit sp = fromIntegral (ord ' ') lt = fromIntegral (ord '<') gt = fromIntegral (ord '>') + +{- Uses cat-file to stream the contents of the files listed by lstree + - as efficiently as possible. This is much faster than querying it + - repeatedly per file. + - + - Any files with a newline or carriage return in their name will be + - skipped, because the interface does not support them. + -} +catObjectStream + :: (MonadMask m, MonadIO m) + => [LsTree.TreeItem] + -> (LsTree.TreeItem -> Bool) + -> Repo + -> (IO (Maybe (TopFilePath, L.ByteString)) -> m ()) + -> m () +catObjectStream l want repo a = assertLocal repo $ + bracketIO start stop $ \(_, _, hout, _) -> a (reader hout) + where + feeder h = do + forM_ l $ \ti -> + when (want ti) $ do + let f = getTopFilePath (LsTree.file ti) + -- skip files with newlines or carriage returns + unless (any (`S8.elem` f) ['\n', '\r']) $ + S8.hPutStrLn h $ + fromRef' (LsTree.sha ti) <> " " <> f + hClose h + + reader h = ifM (hIsEOF h) + ( return Nothing + , do + resp <- S8.hGetLine h + case eitherToMaybe $ A.parseOnly respparser resp of + Nothing -> error $ "unknown response from git cat-file " ++ show resp + Just (r, f) -> do + content <- readObjectContent h r + return (Just (asTopFilePath f, content)) + ) + + params = + [ Param "cat-file" + -- %(rest) is used to feed the filename through + -- cat-file; it will be at the end of the response + , Param ("--batch=" ++ batchFormat ++ " %(rest)") + , Param "--buffer" + ] + + respparser = (,) + <$> respParser + <* A8.char ' ' + <*> A.takeByteString + + start = do + let p = gitCreateProcess params repo + (Just hin, Just hout, _, pid) <- createProcess p + { std_in = CreatePipe + , std_out = CreatePipe + } + f <- async (feeder hin) + return (f, hin, hout, pid) + + stop (f, hin, hout, pid) = do + cancel f + hClose hin + hClose hout + void $ checkSuccessProcess pid diff --git a/doc/todo/speed_up_git_annex_sync_--content_--all/comment_10_d1d462a22ad849c62e8620a12f3ba064._comment b/doc/todo/speed_up_git_annex_sync_--content_--all/comment_10_d1d462a22ad849c62e8620a12f3ba064._comment new file mode 100644 index 0000000000..f92977a34f --- /dev/null +++ b/doc/todo/speed_up_git_annex_sync_--content_--all/comment_10_d1d462a22ad849c62e8620a12f3ba064._comment @@ -0,0 +1,19 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 10""" + date="2020-07-07T17:13:46Z" + content=""" +Wow, I implemented your --buffer trick, and `get --all` +is over 2x faster. `sync --content --all` somewhat less, but still +another decent improvement there. (cold cache timings) + +And some warm cache times are *much* faster than my cold cache benchmarks. +`get --all` is 17x faster in a 10k file repo, which makes it only 3x slower +than `get` without --all. + +I think I will still leave this open because it's still worth considering +sqlite caching or finding a way to speed up the second sync --all pass... +but would be interested to know how your use case is improved now. + +Please feel free to find optimisations anytime, I really appreciate it. +"""]]