use ByteStrings when reading annex symlinks and pointers

Now there's a ByteString used all the way from disk to Key.

The main complication in this conversion was the use of fromInternalGitPath
in several places to munge things on Windows. The things that used that
were changed to parse the ByteString using either path separator.

Also some code that had read from files to a String lazily was changed
to read a minimal strict ByteString.
This commit is contained in:
Joey Hess 2019-01-14 15:19:20 -04:00
parent 0a8d93cb8a
commit 5d98cba923
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
10 changed files with 128 additions and 78 deletions

View file

@ -105,7 +105,7 @@ catKey ref = go =<< catObjectMetaData ref
-- Avoid catting large files, that cannot be symlinks or -- Avoid catting large files, that cannot be symlinks or
-- pointer files, which would require buffering their -- pointer files, which would require buffering their
-- content in memory, as well as a lot of IO. -- content in memory, as well as a lot of IO.
| sz <= maxPointerSz = parseLinkOrPointer <$> catObject ref | sz <= maxPointerSz = parseLinkTargetOrPointer . L.toStrict <$> catObject ref
go _ = return Nothing go _ = return Nothing
{- Gets a symlink target. -} {- Gets a symlink target. -}

View file

@ -7,7 +7,7 @@
- -
- Pointer files are used instead of symlinks for unlocked files. - Pointer files are used instead of symlinks for unlocked files.
- -
- Copyright 2013-2018 Joey Hess <id@joeyh.name> - Copyright 2013-2019 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU GPL version 3 or higher. - Licensed under the GNU GPL version 3 or higher.
-} -}
@ -33,14 +33,17 @@ import Utility.FileMode
import Utility.InodeCache import Utility.InodeCache
import Utility.Tmp.Dir import Utility.Tmp.Dir
import Utility.CopyFile import Utility.CopyFile
import qualified Utility.RawFilePath as R
import qualified Data.ByteString as S
import qualified Data.ByteString.Char8 as S8
import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy as L
type LinkTarget = String type LinkTarget = String
{- Checks if a file is a link to a key. -} {- Checks if a file is a link to a key. -}
isAnnexLink :: FilePath -> Annex (Maybe Key) isAnnexLink :: FilePath -> Annex (Maybe Key)
isAnnexLink file = maybe Nothing (fileKey . takeFileName) <$> getAnnexLinkTarget file isAnnexLink file = maybe Nothing parseLinkTargetOrPointer <$> getAnnexLinkTarget file
{- Gets the link target of a symlink. {- Gets the link target of a symlink.
- -
@ -50,40 +53,42 @@ isAnnexLink file = maybe Nothing (fileKey . takeFileName) <$> getAnnexLinkTarget
- Returns Nothing if the file is not a symlink, or not a link to annex - Returns Nothing if the file is not a symlink, or not a link to annex
- content. - content.
-} -}
getAnnexLinkTarget :: FilePath -> Annex (Maybe LinkTarget) getAnnexLinkTarget :: FilePath -> Annex (Maybe S.ByteString)
getAnnexLinkTarget f = getAnnexLinkTarget' f getAnnexLinkTarget f = getAnnexLinkTarget' f
=<< (coreSymlinks <$> Annex.getGitConfig) =<< (coreSymlinks <$> Annex.getGitConfig)
{- Pass False to force looking inside file. -} {- Pass False to force looking inside file, for when git checks out
getAnnexLinkTarget' :: FilePath -> Bool -> Annex (Maybe LinkTarget) - symlinks as plain files. -}
getAnnexLinkTarget' :: FilePath -> Bool -> Annex (Maybe S.ByteString)
getAnnexLinkTarget' file coresymlinks = if coresymlinks getAnnexLinkTarget' file coresymlinks = if coresymlinks
then check readSymbolicLink $ then check probesymlink $
return Nothing return Nothing
else check readSymbolicLink $ else check probesymlink $
check probefilecontent $ check probefilecontent $
return Nothing return Nothing
where where
check getlinktarget fallback = check getlinktarget fallback =
liftIO (catchMaybeIO $ getlinktarget file) >>= \case liftIO (catchMaybeIO getlinktarget) >>= \case
Just l Just l
| isLinkToAnnex (fromInternalGitPath l) -> return (Just l) | isLinkToAnnex l -> return (Just l)
| otherwise -> return Nothing | otherwise -> return Nothing
Nothing -> fallback Nothing -> fallback
probefilecontent f = withFile f ReadMode $ \h -> do probesymlink = R.readSymbolicLink $ toRawFilePath file
-- The first 8k is more than enough to read; link
-- files are small. probefilecontent = withFile file ReadMode $ \h -> do
s <- take 8192 <$> hGetContents h s <- S.hGet h unpaddedMaxPointerSz
-- If we got the full 8k, the file is too large -- If we got the full amount, the file is too large
if length s == 8192 -- to be a symlink target.
then return "" return $ if S.length s == unpaddedMaxPointerSz
then mempty
else else
-- If there are any NUL or newline -- If there are any NUL or newline
-- characters, or whitespace, we -- characters, or whitespace, we
-- certianly don't have a link to a -- certianly don't have a symlink to a
-- git-annex key. -- git-annex key.
return $ if any (`elem` s) "\0\n\r \t" if any (`S8.elem` s) "\0\n\r \t"
then "" then mempty
else s else s
makeAnnexLink :: LinkTarget -> FilePath -> Annex () makeAnnexLink :: LinkTarget -> FilePath -> Annex ()
@ -122,7 +127,7 @@ stageSymlink file sha =
{- Injects a pointer file content into git, returning its Sha. -} {- Injects a pointer file content into git, returning its Sha. -}
hashPointerFile :: Key -> Annex Sha hashPointerFile :: Key -> Annex Sha
hashPointerFile key = hashBlob $ encodeBS $ formatPointer key hashPointerFile key = hashBlob $ formatPointer key
{- Stages a pointer file, using a Sha of its content -} {- Stages a pointer file, using a Sha of its content -}
stagePointerFile :: FilePath -> Maybe FileMode -> Sha -> Annex () stagePointerFile :: FilePath -> Maybe FileMode -> Sha -> Annex ()
@ -136,7 +141,7 @@ stagePointerFile file mode sha =
writePointerFile :: FilePath -> Key -> Maybe FileMode -> IO () writePointerFile :: FilePath -> Key -> Maybe FileMode -> IO ()
writePointerFile file k mode = do writePointerFile file k mode = do
writeFile file (formatPointer k) S.writeFile file (formatPointer k)
maybe noop (setFileMode file) mode maybe noop (setFileMode file) mode
newtype Restage = Restage Bool newtype Restage = Restage Bool
@ -222,49 +227,58 @@ unableToRestage mf = unwords
, "git update-index -q --refresh " ++ fromMaybe "<file>" mf , "git update-index -q --refresh " ++ fromMaybe "<file>" mf
] ]
{- Parses a symlink target or a pointer file to a Key. {- Parses a symlink target or a pointer file to a Key. -}
- Only looks at the first line, as pointer files can have subsequent parseLinkTargetOrPointer :: S.ByteString -> Maybe Key
- lines. -} parseLinkTargetOrPointer = parseLinkTarget . S8.takeWhile (not . lineend)
parseLinkOrPointer :: L.ByteString -> Maybe Key
parseLinkOrPointer = parseLinkOrPointer'
. decodeBL . L.take (fromIntegral maxPointerSz)
where where
{- Want to avoid buffering really big files in git into
- memory when reading files that may be pointers.
-
- 8192 bytes is plenty for a pointer to a key.
- Pad some more to allow for any pointer files that might have
- lines after the key explaining what the file is used for. -}
maxPointerSz :: Integer
maxPointerSz = 81920
parseLinkOrPointer' :: String -> Maybe Key
parseLinkOrPointer' = go . fromInternalGitPath . takeWhile (not . lineend)
where
go l
| isLinkToAnnex l = fileKey $ takeFileName l
| otherwise = Nothing
lineend '\n' = True lineend '\n' = True
lineend '\r' = True lineend '\r' = True
lineend _ = False lineend _ = False
formatPointer :: Key -> String {- Avoid looking at more of the lazy ByteString than necessary since it
formatPointer k = - could be reading from a large file that is not a pointer file. -}
toInternalGitPath (pathSeparator:objectDir </> keyFile k) ++ "\n" parseLinkTargetOrPointerLazy :: L.ByteString -> Maybe Key
parseLinkTargetOrPointerLazy b =
let b' = L.take (fromIntegral maxPointerSz) b
in parseLinkTargetOrPointer (L.toStrict b')
{- Parses a symlink target to a Key. -}
parseLinkTarget :: S.ByteString -> Maybe Key
parseLinkTarget l
| isLinkToAnnex l = fileKey' $ snd $ S8.breakEnd pathsep l
| otherwise = Nothing
where
pathsep '/' = True
#ifdef mingw32_HOST_OS
pathsep '\\' = True
#endif
pathsep _ = False
formatPointer :: Key -> S.ByteString
formatPointer k = prefix <> keyFile' k <> nl
where
prefix = toRawFilePath $ toInternalGitPath (pathSeparator:objectDir)
nl = S8.singleton '\n'
{- Maximum size of a file that could be a pointer to a key.
- Check to avoid buffering really big files in git into
- memory when reading files that may be pointers.
-
- 8192 bytes is plenty for a pointer to a key. This adds some additional
- padding to allow for any pointer files that might have
- lines after the key explaining what the file is used for. -}
maxPointerSz :: Integer
maxPointerSz = 81920
unpaddedMaxPointerSz :: Int
unpaddedMaxPointerSz = 8192
{- Checks if a worktree file is a pointer to a key. {- Checks if a worktree file is a pointer to a key.
- -
- Unlocked files whose content is present are not detected by this. -} - Unlocked files whose content is present are not detected by this. -}
isPointerFile :: FilePath -> IO (Maybe Key) isPointerFile :: FilePath -> IO (Maybe Key)
isPointerFile f = catchDefaultIO Nothing $ bracket open close $ \h -> do isPointerFile f = catchDefaultIO Nothing $ withFile f ReadMode $ \h ->
b <- take (fromIntegral maxPointerSz) <$> hGetContents h parseLinkTargetOrPointer <$> S.hGet h unpaddedMaxPointerSz
-- strict so it reads before the file handle is closed
let !mk = parseLinkOrPointer' b
return mk
where
open = openBinaryFile f ReadMode
close = hClose
{- Checks a symlink target or pointer file first line to see if it {- Checks a symlink target or pointer file first line to see if it
- appears to point to annexed content. - appears to point to annexed content.
@ -273,10 +287,15 @@ isPointerFile f = catchDefaultIO Nothing $ bracket open close $ \h -> do
- directory itself, because GIT_DIR may cause a directory name other - directory itself, because GIT_DIR may cause a directory name other
- than .git to be used. - than .git to be used.
-} -}
isLinkToAnnex :: FilePath -> Bool isLinkToAnnex :: S.ByteString -> Bool
isLinkToAnnex s = (pathSeparator:objectDir) `isInfixOf` s isLinkToAnnex s = p `S.isInfixOf` s
#ifdef mingw32_HOST_OS #ifdef mingw32_HOST_OS
-- '/' is still used inside pointer files on Windows, not the native -- '/' is still used inside pointer files on Windows, not the native
-- '\' -- '\'
|| ('/':objectDir) `isInfixOf` s || p' `S.isInfixOf` s
#endif
where
p = toRawFilePath (pathSeparator:objectDir)
#ifdef mingw32_HOST_OS
p' = toRawFilePath ('/':objectDir)
#endif #endif

View file

@ -302,11 +302,11 @@ onAddUnlocked' isdirect contentchanged addassociatedfile addlink samefilestatus
case linktarget of case linktarget of
Nothing -> a Nothing -> a
Just lt -> do Just lt -> do
case fileKey $ takeFileName lt of case parseLinkTarget lt of
Nothing -> noop Nothing -> noop
Just key -> liftAnnex $ Just key -> liftAnnex $
addassociatedfile key file addassociatedfile key file
onAddSymlink' linktarget mk isdirect file fs onAddSymlink' (Just $ fromRawFilePath lt) mk isdirect file fs
{- A symlink might be an arbitrary symlink, which is just added. {- A symlink might be an arbitrary symlink, which is just added.
- Or, if it is a git-annex symlink, ensure it points to the content - Or, if it is a git-annex symlink, ensure it points to the content

View file

@ -88,7 +88,7 @@ fixupReq req@(Req {}) =
check getfile getmode setfile r = case readTreeItemType (getmode r) of check getfile getmode setfile r = case readTreeItemType (getmode r) of
Just TreeSymlink -> do Just TreeSymlink -> do
v <- getAnnexLinkTarget' (getfile r) False v <- getAnnexLinkTarget' (getfile r) False
case fileKey . takeFileName =<< v of case parseLinkTargetOrPointer =<< v of
Nothing -> return r Nothing -> return r
Just k -> setfile r <$> Just k -> setfile r <$>
withObjectLoc k withObjectLoc k

View file

@ -205,8 +205,7 @@ performKey key backend numcopies = do
check :: [Annex Bool] -> Annex Bool check :: [Annex Bool] -> Annex Bool
check cs = and <$> sequence cs check cs = and <$> sequence cs
{- Checks that symlinks points correctly to the annexed content. {- Checks that symlinks points correctly to the annexed content. -}
-}
fixLink :: Key -> FilePath -> Annex Bool fixLink :: Key -> FilePath -> Annex Bool
fixLink key file = do fixLink key file = do
want <- calcRepo $ gitAnnexLink file key want <- calcRepo $ gitAnnexLink file key
@ -215,7 +214,7 @@ fixLink key file = do
return True return True
where where
go want have go want have
| want /= fromInternalGitPath have = do | want /= fromInternalGitPath (fromRawFilePath have) = do
showNote "fixing link" showNote "fixing link"
liftIO $ createDirectoryIfMissing True (parentDir file) liftIO $ createDirectoryIfMissing True (parentDir file)
liftIO $ removeFile file liftIO $ removeFile file
@ -562,7 +561,7 @@ badContentDirect file key = do
badContentRemote :: Remote -> FilePath -> Key -> Annex String badContentRemote :: Remote -> FilePath -> Key -> Annex String
badContentRemote remote localcopy key = do badContentRemote remote localcopy key = do
bad <- fromRepo gitAnnexBadDir bad <- fromRepo gitAnnexBadDir
let destbad = bad </> fileKey key let destbad = bad </> keyFile key
movedbad <- ifM (inAnnex key <||> liftIO (doesFileExist destbad)) movedbad <- ifM (inAnnex key <||> liftIO (doesFileExist destbad))
( return False ( return False
, do , do

View file

@ -22,7 +22,8 @@ import qualified Git
import qualified Git.Ref import qualified Git.Ref
import Backend import Backend
import qualified Data.ByteString.Lazy as B import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
cmd :: Command cmd :: Command
cmd = noCommit $ noMessages $ cmd = noCommit $ noMessages $
@ -62,14 +63,14 @@ seek UpdateOption = commandAction update
-- smudge filter in memory, which is a problem with large files. -- smudge filter in memory, which is a problem with large files.
smudge :: FilePath -> CommandStart smudge :: FilePath -> CommandStart
smudge file = do smudge file = do
b <- liftIO $ B.hGetContents stdin b <- liftIO $ L.hGetContents stdin
case parseLinkOrPointer b of case parseLinkTargetOrPointerLazy b of
Nothing -> noop Nothing -> noop
Just k -> do Just k -> do
topfile <- inRepo (toTopFilePath file) topfile <- inRepo (toTopFilePath file)
Database.Keys.addAssociatedFile k topfile Database.Keys.addAssociatedFile k topfile
void $ smudgeLog k topfile void $ smudgeLog k topfile
liftIO $ B.putStr b liftIO $ L.putStr b
stop stop
-- Clean filter is fed file content on stdin, decides if a file -- Clean filter is fed file content on stdin, decides if a file
@ -77,13 +78,13 @@ smudge file = do
-- injested content if so. Otherwise, the original content. -- injested content if so. Otherwise, the original content.
clean :: FilePath -> CommandStart clean :: FilePath -> CommandStart
clean file = do clean file = do
b <- liftIO $ B.hGetContents stdin b <- liftIO $ L.hGetContents stdin
ifM fileoutsiderepo ifM fileoutsiderepo
( liftIO $ B.hPut stdout b ( liftIO $ L.hPut stdout b
, case parseLinkOrPointer b of , case parseLinkTargetOrPointerLazy b of
Just k -> do Just k -> do
getMoveRaceRecovery k file getMoveRaceRecovery k file
liftIO $ B.hPut stdout b liftIO $ L.hPut stdout b
Nothing -> go b =<< catKeyFile file Nothing -> go b =<< catKeyFile file
) )
stop stop
@ -97,7 +98,7 @@ clean file = do
-- to free memory when sending the file, so the -- to free memory when sending the file, so the
-- less we let it send, the less memory it will waste.) -- less we let it send, the less memory it will waste.)
if Git.BuildVersion.older "2.5" if Git.BuildVersion.older "2.5"
then B.length b `seq` return () then L.length b `seq` return ()
else liftIO $ hClose stdin else liftIO $ hClose stdin
-- Optimization for the case when the file is already -- Optimization for the case when the file is already
@ -108,7 +109,7 @@ clean file = do
( liftIO $ emitPointer ko ( liftIO $ emitPointer ko
, doingest oldkey , doingest oldkey
) )
, liftIO $ B.hPut stdout b , liftIO $ L.hPut stdout b
) )
doingest oldkey = do doingest oldkey = do
@ -158,7 +159,7 @@ shouldAnnex file moldkey = do
Nothing -> isNothing <$> catObjectMetaData (Git.Ref.fileRef file) Nothing -> isNothing <$> catObjectMetaData (Git.Ref.fileRef file)
emitPointer :: Key -> IO () emitPointer :: Key -> IO ()
emitPointer = putStr . formatPointer emitPointer = S.putStr . formatPointer
-- Recover from a previous race between eg git mv and git-annex get. -- Recover from a previous race between eg git mv and git-annex get.
-- That could result in the file remaining a pointer file, while -- That could result in the file remaining a pointer file, while

View file

@ -197,7 +197,7 @@ lookupFile1 file = do
makekey l = case maybeLookupBackendVariety (keyVariety k) of makekey l = case maybeLookupBackendVariety (keyVariety k) of
Nothing -> do Nothing -> do
unless (null kname || null bname || unless (null kname || null bname ||
not (isLinkToAnnex l)) $ not (isLinkToAnnex (toRawFilePath l))) $
warning skip warning skip
return Nothing return Nothing
Just backend -> return $ Just (k, backend) Just backend -> return $ Just (k, backend)

View file

@ -29,6 +29,8 @@ import Git.Ref
import Utility.InodeCache import Utility.InodeCache
import Annex.AdjustedBranch import Annex.AdjustedBranch
import qualified Data.ByteString as S
upgrade :: Bool -> Annex Bool upgrade :: Bool -> Annex Bool
upgrade automatic = do upgrade automatic = do
unless automatic $ unless automatic $
@ -117,7 +119,7 @@ upgradeDirectWorkTree = do
void $ linkToAnnex k f ic void $ linkToAnnex k f ic
writepointer f k = liftIO $ do writepointer f k = liftIO $ do
nukeFile f nukeFile f
writeFile f (formatPointer k) S.writeFile f (formatPointer k)
{- Remove all direct mode bookkeeping files. -} {- Remove all direct mode bookkeeping files. -}
removeDirectCruft :: Annex () removeDirectCruft :: Annex ()

28
Utility/RawFilePath.hs Normal file
View file

@ -0,0 +1,28 @@
{- Portability shim around System.Posix.Files.ByteString
-
- Copyright 2019 Joey Hess <id@joeyh.name>
-
- License: BSD-2-clause
-}
{-# LANGUAGE CPP #-}
module Utility.RawFilePath (
RawFilePath,
readSymbolicLink,
) where
#ifndef mingw32_HOST_OS
import System.Posix.Files.ByteString
import System.Posix.ByteString.FilePath
#else
import qualified Data.ByteString as B
import System.IO.Error
type RawFilePath = B.ByteString
readSymbolicLink :: RawFilePath -> IO RawFilePath
readSymbolicLink _ = ioError $ mkIOError illegalOperationErrorType x Nothing Nothing
where
x = "Utility.RawFilePath.readSymbolicLink: not supported"
#endif

View file

@ -1057,6 +1057,7 @@ Executable git-annex
Utility.Process.Shim Utility.Process.Shim
Utility.Process.Transcript Utility.Process.Transcript
Utility.QuickCheck Utility.QuickCheck
Utility.RawFilePath
Utility.Rsync Utility.Rsync
Utility.SafeCommand Utility.SafeCommand
Utility.Scheduled Utility.Scheduled