use ByteStrings when reading annex symlinks and pointers

Now there's a ByteString used all the way from disk to Key.

The main complication in this conversion was the use of fromInternalGitPath
in several places to munge things on Windows. The things that used that
were changed to parse the ByteString using either path separator.

Also some code that had read from files to a String lazily was changed
to read a minimal strict ByteString.
This commit is contained in:
Joey Hess 2019-01-14 15:19:20 -04:00
parent 0a8d93cb8a
commit 5d98cba923
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
10 changed files with 128 additions and 78 deletions

View file

@ -105,7 +105,7 @@ catKey ref = go =<< catObjectMetaData ref
-- Avoid catting large files, that cannot be symlinks or
-- pointer files, which would require buffering their
-- content in memory, as well as a lot of IO.
| sz <= maxPointerSz = parseLinkOrPointer <$> catObject ref
| sz <= maxPointerSz = parseLinkTargetOrPointer . L.toStrict <$> catObject ref
go _ = return Nothing
{- Gets a symlink target. -}

View file

@ -7,7 +7,7 @@
-
- Pointer files are used instead of symlinks for unlocked files.
-
- Copyright 2013-2018 Joey Hess <id@joeyh.name>
- Copyright 2013-2019 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU GPL version 3 or higher.
-}
@ -33,14 +33,17 @@ import Utility.FileMode
import Utility.InodeCache
import Utility.Tmp.Dir
import Utility.CopyFile
import qualified Utility.RawFilePath as R
import qualified Data.ByteString as S
import qualified Data.ByteString.Char8 as S8
import qualified Data.ByteString.Lazy as L
type LinkTarget = String
{- Checks if a file is a link to a key. -}
isAnnexLink :: FilePath -> Annex (Maybe Key)
isAnnexLink file = maybe Nothing (fileKey . takeFileName) <$> getAnnexLinkTarget file
isAnnexLink file = maybe Nothing parseLinkTargetOrPointer <$> getAnnexLinkTarget file
{- Gets the link target of a symlink.
-
@ -50,40 +53,42 @@ isAnnexLink file = maybe Nothing (fileKey . takeFileName) <$> getAnnexLinkTarget
- Returns Nothing if the file is not a symlink, or not a link to annex
- content.
-}
getAnnexLinkTarget :: FilePath -> Annex (Maybe LinkTarget)
getAnnexLinkTarget :: FilePath -> Annex (Maybe S.ByteString)
getAnnexLinkTarget f = getAnnexLinkTarget' f
=<< (coreSymlinks <$> Annex.getGitConfig)
{- Pass False to force looking inside file. -}
getAnnexLinkTarget' :: FilePath -> Bool -> Annex (Maybe LinkTarget)
{- Pass False to force looking inside file, for when git checks out
- symlinks as plain files. -}
getAnnexLinkTarget' :: FilePath -> Bool -> Annex (Maybe S.ByteString)
getAnnexLinkTarget' file coresymlinks = if coresymlinks
then check readSymbolicLink $
then check probesymlink $
return Nothing
else check readSymbolicLink $
else check probesymlink $
check probefilecontent $
return Nothing
where
check getlinktarget fallback =
liftIO (catchMaybeIO $ getlinktarget file) >>= \case
liftIO (catchMaybeIO getlinktarget) >>= \case
Just l
| isLinkToAnnex (fromInternalGitPath l) -> return (Just l)
| isLinkToAnnex l -> return (Just l)
| otherwise -> return Nothing
Nothing -> fallback
probefilecontent f = withFile f ReadMode $ \h -> do
-- The first 8k is more than enough to read; link
-- files are small.
s <- take 8192 <$> hGetContents h
-- If we got the full 8k, the file is too large
if length s == 8192
then return ""
probesymlink = R.readSymbolicLink $ toRawFilePath file
probefilecontent = withFile file ReadMode $ \h -> do
s <- S.hGet h unpaddedMaxPointerSz
-- If we got the full amount, the file is too large
-- to be a symlink target.
return $ if S.length s == unpaddedMaxPointerSz
then mempty
else
-- If there are any NUL or newline
-- characters, or whitespace, we
-- certianly don't have a link to a
-- certianly don't have a symlink to a
-- git-annex key.
return $ if any (`elem` s) "\0\n\r \t"
then ""
if any (`S8.elem` s) "\0\n\r \t"
then mempty
else s
makeAnnexLink :: LinkTarget -> FilePath -> Annex ()
@ -122,7 +127,7 @@ stageSymlink file sha =
{- Injects a pointer file content into git, returning its Sha. -}
hashPointerFile :: Key -> Annex Sha
hashPointerFile key = hashBlob $ encodeBS $ formatPointer key
hashPointerFile key = hashBlob $ formatPointer key
{- Stages a pointer file, using a Sha of its content -}
stagePointerFile :: FilePath -> Maybe FileMode -> Sha -> Annex ()
@ -136,7 +141,7 @@ stagePointerFile file mode sha =
writePointerFile :: FilePath -> Key -> Maybe FileMode -> IO ()
writePointerFile file k mode = do
writeFile file (formatPointer k)
S.writeFile file (formatPointer k)
maybe noop (setFileMode file) mode
newtype Restage = Restage Bool
@ -222,49 +227,58 @@ unableToRestage mf = unwords
, "git update-index -q --refresh " ++ fromMaybe "<file>" mf
]
{- Parses a symlink target or a pointer file to a Key.
- Only looks at the first line, as pointer files can have subsequent
- lines. -}
parseLinkOrPointer :: L.ByteString -> Maybe Key
parseLinkOrPointer = parseLinkOrPointer'
. decodeBL . L.take (fromIntegral maxPointerSz)
{- Parses a symlink target or a pointer file to a Key. -}
parseLinkTargetOrPointer :: S.ByteString -> Maybe Key
parseLinkTargetOrPointer = parseLinkTarget . S8.takeWhile (not . lineend)
where
{- Want to avoid buffering really big files in git into
- memory when reading files that may be pointers.
-
- 8192 bytes is plenty for a pointer to a key.
- Pad some more to allow for any pointer files that might have
- lines after the key explaining what the file is used for. -}
maxPointerSz :: Integer
maxPointerSz = 81920
parseLinkOrPointer' :: String -> Maybe Key
parseLinkOrPointer' = go . fromInternalGitPath . takeWhile (not . lineend)
where
go l
| isLinkToAnnex l = fileKey $ takeFileName l
| otherwise = Nothing
lineend '\n' = True
lineend '\r' = True
lineend _ = False
formatPointer :: Key -> String
formatPointer k =
toInternalGitPath (pathSeparator:objectDir </> keyFile k) ++ "\n"
{- Avoid looking at more of the lazy ByteString than necessary since it
- could be reading from a large file that is not a pointer file. -}
parseLinkTargetOrPointerLazy :: L.ByteString -> Maybe Key
parseLinkTargetOrPointerLazy b =
let b' = L.take (fromIntegral maxPointerSz) b
in parseLinkTargetOrPointer (L.toStrict b')
{- Parses a symlink target to a Key. -}
parseLinkTarget :: S.ByteString -> Maybe Key
parseLinkTarget l
| isLinkToAnnex l = fileKey' $ snd $ S8.breakEnd pathsep l
| otherwise = Nothing
where
pathsep '/' = True
#ifdef mingw32_HOST_OS
pathsep '\\' = True
#endif
pathsep _ = False
formatPointer :: Key -> S.ByteString
formatPointer k = prefix <> keyFile' k <> nl
where
prefix = toRawFilePath $ toInternalGitPath (pathSeparator:objectDir)
nl = S8.singleton '\n'
{- Maximum size of a file that could be a pointer to a key.
- Check to avoid buffering really big files in git into
- memory when reading files that may be pointers.
-
- 8192 bytes is plenty for a pointer to a key. This adds some additional
- padding to allow for any pointer files that might have
- lines after the key explaining what the file is used for. -}
maxPointerSz :: Integer
maxPointerSz = 81920
unpaddedMaxPointerSz :: Int
unpaddedMaxPointerSz = 8192
{- Checks if a worktree file is a pointer to a key.
-
- Unlocked files whose content is present are not detected by this. -}
isPointerFile :: FilePath -> IO (Maybe Key)
isPointerFile f = catchDefaultIO Nothing $ bracket open close $ \h -> do
b <- take (fromIntegral maxPointerSz) <$> hGetContents h
-- strict so it reads before the file handle is closed
let !mk = parseLinkOrPointer' b
return mk
where
open = openBinaryFile f ReadMode
close = hClose
isPointerFile f = catchDefaultIO Nothing $ withFile f ReadMode $ \h ->
parseLinkTargetOrPointer <$> S.hGet h unpaddedMaxPointerSz
{- Checks a symlink target or pointer file first line to see if it
- appears to point to annexed content.
@ -273,10 +287,15 @@ isPointerFile f = catchDefaultIO Nothing $ bracket open close $ \h -> do
- directory itself, because GIT_DIR may cause a directory name other
- than .git to be used.
-}
isLinkToAnnex :: FilePath -> Bool
isLinkToAnnex s = (pathSeparator:objectDir) `isInfixOf` s
isLinkToAnnex :: S.ByteString -> Bool
isLinkToAnnex s = p `S.isInfixOf` s
#ifdef mingw32_HOST_OS
-- '/' is still used inside pointer files on Windows, not the native
-- '\'
|| ('/':objectDir) `isInfixOf` s
|| p' `S.isInfixOf` s
#endif
where
p = toRawFilePath (pathSeparator:objectDir)
#ifdef mingw32_HOST_OS
p' = toRawFilePath ('/':objectDir)
#endif

View file

@ -302,11 +302,11 @@ onAddUnlocked' isdirect contentchanged addassociatedfile addlink samefilestatus
case linktarget of
Nothing -> a
Just lt -> do
case fileKey $ takeFileName lt of
case parseLinkTarget lt of
Nothing -> noop
Just key -> liftAnnex $
addassociatedfile key file
onAddSymlink' linktarget mk isdirect file fs
onAddSymlink' (Just $ fromRawFilePath lt) mk isdirect file fs
{- A symlink might be an arbitrary symlink, which is just added.
- Or, if it is a git-annex symlink, ensure it points to the content

View file

@ -88,7 +88,7 @@ fixupReq req@(Req {}) =
check getfile getmode setfile r = case readTreeItemType (getmode r) of
Just TreeSymlink -> do
v <- getAnnexLinkTarget' (getfile r) False
case fileKey . takeFileName =<< v of
case parseLinkTargetOrPointer =<< v of
Nothing -> return r
Just k -> setfile r <$>
withObjectLoc k

View file

@ -205,8 +205,7 @@ performKey key backend numcopies = do
check :: [Annex Bool] -> Annex Bool
check cs = and <$> sequence cs
{- Checks that symlinks points correctly to the annexed content.
-}
{- Checks that symlinks points correctly to the annexed content. -}
fixLink :: Key -> FilePath -> Annex Bool
fixLink key file = do
want <- calcRepo $ gitAnnexLink file key
@ -215,7 +214,7 @@ fixLink key file = do
return True
where
go want have
| want /= fromInternalGitPath have = do
| want /= fromInternalGitPath (fromRawFilePath have) = do
showNote "fixing link"
liftIO $ createDirectoryIfMissing True (parentDir file)
liftIO $ removeFile file
@ -562,7 +561,7 @@ badContentDirect file key = do
badContentRemote :: Remote -> FilePath -> Key -> Annex String
badContentRemote remote localcopy key = do
bad <- fromRepo gitAnnexBadDir
let destbad = bad </> fileKey key
let destbad = bad </> keyFile key
movedbad <- ifM (inAnnex key <||> liftIO (doesFileExist destbad))
( return False
, do

View file

@ -22,7 +22,8 @@ import qualified Git
import qualified Git.Ref
import Backend
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
cmd :: Command
cmd = noCommit $ noMessages $
@ -62,14 +63,14 @@ seek UpdateOption = commandAction update
-- smudge filter in memory, which is a problem with large files.
smudge :: FilePath -> CommandStart
smudge file = do
b <- liftIO $ B.hGetContents stdin
case parseLinkOrPointer b of
b <- liftIO $ L.hGetContents stdin
case parseLinkTargetOrPointerLazy b of
Nothing -> noop
Just k -> do
topfile <- inRepo (toTopFilePath file)
Database.Keys.addAssociatedFile k topfile
void $ smudgeLog k topfile
liftIO $ B.putStr b
liftIO $ L.putStr b
stop
-- Clean filter is fed file content on stdin, decides if a file
@ -77,13 +78,13 @@ smudge file = do
-- injested content if so. Otherwise, the original content.
clean :: FilePath -> CommandStart
clean file = do
b <- liftIO $ B.hGetContents stdin
b <- liftIO $ L.hGetContents stdin
ifM fileoutsiderepo
( liftIO $ B.hPut stdout b
, case parseLinkOrPointer b of
( liftIO $ L.hPut stdout b
, case parseLinkTargetOrPointerLazy b of
Just k -> do
getMoveRaceRecovery k file
liftIO $ B.hPut stdout b
liftIO $ L.hPut stdout b
Nothing -> go b =<< catKeyFile file
)
stop
@ -97,7 +98,7 @@ clean file = do
-- to free memory when sending the file, so the
-- less we let it send, the less memory it will waste.)
if Git.BuildVersion.older "2.5"
then B.length b `seq` return ()
then L.length b `seq` return ()
else liftIO $ hClose stdin
-- Optimization for the case when the file is already
@ -108,7 +109,7 @@ clean file = do
( liftIO $ emitPointer ko
, doingest oldkey
)
, liftIO $ B.hPut stdout b
, liftIO $ L.hPut stdout b
)
doingest oldkey = do
@ -158,7 +159,7 @@ shouldAnnex file moldkey = do
Nothing -> isNothing <$> catObjectMetaData (Git.Ref.fileRef file)
emitPointer :: Key -> IO ()
emitPointer = putStr . formatPointer
emitPointer = S.putStr . formatPointer
-- Recover from a previous race between eg git mv and git-annex get.
-- That could result in the file remaining a pointer file, while

View file

@ -197,7 +197,7 @@ lookupFile1 file = do
makekey l = case maybeLookupBackendVariety (keyVariety k) of
Nothing -> do
unless (null kname || null bname ||
not (isLinkToAnnex l)) $
not (isLinkToAnnex (toRawFilePath l))) $
warning skip
return Nothing
Just backend -> return $ Just (k, backend)

View file

@ -29,6 +29,8 @@ import Git.Ref
import Utility.InodeCache
import Annex.AdjustedBranch
import qualified Data.ByteString as S
upgrade :: Bool -> Annex Bool
upgrade automatic = do
unless automatic $
@ -117,7 +119,7 @@ upgradeDirectWorkTree = do
void $ linkToAnnex k f ic
writepointer f k = liftIO $ do
nukeFile f
writeFile f (formatPointer k)
S.writeFile f (formatPointer k)
{- Remove all direct mode bookkeeping files. -}
removeDirectCruft :: Annex ()

28
Utility/RawFilePath.hs Normal file
View file

@ -0,0 +1,28 @@
{- Portability shim around System.Posix.Files.ByteString
-
- Copyright 2019 Joey Hess <id@joeyh.name>
-
- License: BSD-2-clause
-}
{-# LANGUAGE CPP #-}
module Utility.RawFilePath (
RawFilePath,
readSymbolicLink,
) where
#ifndef mingw32_HOST_OS
import System.Posix.Files.ByteString
import System.Posix.ByteString.FilePath
#else
import qualified Data.ByteString as B
import System.IO.Error
type RawFilePath = B.ByteString
readSymbolicLink :: RawFilePath -> IO RawFilePath
readSymbolicLink _ = ioError $ mkIOError illegalOperationErrorType x Nothing Nothing
where
x = "Utility.RawFilePath.readSymbolicLink: not supported"
#endif

View file

@ -1057,6 +1057,7 @@ Executable git-annex
Utility.Process.Shim
Utility.Process.Transcript
Utility.QuickCheck
Utility.RawFilePath
Utility.Rsync
Utility.SafeCommand
Utility.Scheduled