Windows: Fix CRLF handling in some log files

In particular, the mergedrefs file was written with CR added to each line,
but read without CRLF handling. This resulted in each update of the file
adding CR to each line in it, growing the number of lines, while also
preventing the optimisation from working, so it remerged unncessarily.

writeFile and readFile do NewlineMode translation on Windows. But the
ByteString conversion prevented that from happening any longer.

I've audited for other cases of this, and found three more
(.git/annex/index.lck, .git/annex/ignoredrefs, and .git/annex/import/). All
of those also only prevent optimisations from working. Some other files are
currently both read and written with ByteString, but old git-annex may have
written them with NewlineMode translation. Other files are at risk for
breakage later if the reader gets converted to ByteString.

This is a minimal fix, but should be enough, as long as I remember to use
fileLines when splitting a ByteString into lines. This leaves files written
using ByteString without CR added, but that's ok because old git-annex has
no difficulty reading such files.

When the mergedrefs file has gotten lines that end with "\r\r\r\n", this
will eventually clean it up. Each update will remove a single trailing CR.

Note that S8.lines is still used in eg Command.Unused, where it is parsing
git show-ref, and similar in Git/*. git commands don't include CR in their
output so that's ok.

Sponsored-by: Joshua Antonishen on Patreon
This commit is contained in:
Joey Hess 2023-10-30 14:23:23 -04:00
parent ea2876ae77
commit eb42935e58
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
3 changed files with 41 additions and 7 deletions

View file

@ -889,7 +889,7 @@ ignoreRefs rs = do
getIgnoredRefs :: Annex (S.Set Git.Sha) getIgnoredRefs :: Annex (S.Set Git.Sha)
getIgnoredRefs = getIgnoredRefs =
S.fromList . mapMaybe Git.Sha.extractSha . B8.lines <$> content S.fromList . mapMaybe Git.Sha.extractSha . fileLines' <$> content
where where
content = do content = do
f <- fromRawFilePath <$> fromRepo gitAnnexIgnoredRefs f <- fromRawFilePath <$> fromRepo gitAnnexIgnoredRefs
@ -912,7 +912,7 @@ getMergedRefs' :: Annex [(Git.Sha, Git.Branch)]
getMergedRefs' = do getMergedRefs' = do
f <- fromRawFilePath <$> fromRepo gitAnnexMergedRefs f <- fromRawFilePath <$> fromRepo gitAnnexMergedRefs
s <- liftIO $ catchDefaultIO mempty $ B.readFile f s <- liftIO $ catchDefaultIO mempty $ B.readFile f
return $ map parse $ B8.lines s return $ map parse $ fileLines' s
where where
parse l = parse l =
let (s, b) = separate' (== (fromIntegral (ord '\t'))) l let (s, b) = separate' (== (fromIntegral (ord '\t'))) l

View file

@ -5,6 +5,7 @@ git-annex (10.20230927) UNRELEASED; urgency=medium
* importfeed: Use caching database to avoid needing to list urls * importfeed: Use caching database to avoid needing to list urls
on every run, and avoid using too much memory. on every run, and avoid using too much memory.
* Improve memory use of --all when using annex.private. * Improve memory use of --all when using annex.private.
* Windows: Fix CRLF handling in some log files.
-- Joey Hess <id@joeyh.name> Tue, 10 Oct 2023 13:17:31 -0400 -- Joey Hess <id@joeyh.name> Tue, 10 Oct 2023 13:17:31 -0400

View file

@ -1,11 +1,11 @@
{- git-annex log files {- git-annex log files
- -
- Copyright 2018-2022 Joey Hess <id@joeyh.name> - Copyright 2018-2023 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
{-# LANGUAGE BangPatterns #-} {-# LANGUAGE CPP, BangPatterns #-}
module Logs.File ( module Logs.File (
writeLogFile, writeLogFile,
@ -17,6 +17,8 @@ module Logs.File (
checkLogFile, checkLogFile,
calcLogFile, calcLogFile,
calcLogFileUnsafe, calcLogFileUnsafe,
fileLines,
fileLines',
) where ) where
import Annex.Common import Annex.Common
@ -25,6 +27,8 @@ import Annex.LockFile
import Annex.ReplaceFile import Annex.ReplaceFile
import Utility.Tmp import Utility.Tmp
import qualified Data.ByteString as S
import qualified Data.ByteString.Char8 as S8
import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.Char8 as L8 import qualified Data.ByteString.Lazy.Char8 as L8
@ -74,7 +78,7 @@ appendLogFile f lck c =
modifyLogFile :: RawFilePath -> RawFilePath -> ([L.ByteString] -> [L.ByteString]) -> Annex () modifyLogFile :: RawFilePath -> RawFilePath -> ([L.ByteString] -> [L.ByteString]) -> Annex ()
modifyLogFile f lck modf = withExclusiveLock lck $ do modifyLogFile f lck modf = withExclusiveLock lck $ do
ls <- liftIO $ fromMaybe [] ls <- liftIO $ fromMaybe []
<$> tryWhenExists (L8.lines <$> L.readFile f') <$> tryWhenExists (fileLines <$> L.readFile f')
let ls' = modf ls let ls' = modf ls
when (ls' /= ls) $ when (ls' /= ls) $
createDirWhenNeeded f $ createDirWhenNeeded f $
@ -94,7 +98,7 @@ checkLogFile f lck matchf = withSharedLock lck $ bracket setup cleanup go
cleanup (Just h) = liftIO $ hClose h cleanup (Just h) = liftIO $ hClose h
go Nothing = return False go Nothing = return False
go (Just h) = do go (Just h) = do
!r <- liftIO (any matchf . L8.lines <$> L.hGetContents h) !r <- liftIO (any matchf . fileLines <$> L.hGetContents h)
return r return r
f' = fromRawFilePath f f' = fromRawFilePath f
@ -111,7 +115,7 @@ calcLogFileUnsafe f start update = bracket setup cleanup go
cleanup Nothing = noop cleanup Nothing = noop
cleanup (Just h) = liftIO $ hClose h cleanup (Just h) = liftIO $ hClose h
go Nothing = return start go Nothing = return start
go (Just h) = go' start =<< liftIO (L8.lines <$> L.hGetContents h) go (Just h) = go' start =<< liftIO (fileLines <$> L.hGetContents h)
go' v [] = return v go' v [] = return v
go' v (l:ls) = do go' v (l:ls) = do
let !v' = update l v let !v' = update l v
@ -157,3 +161,32 @@ createDirWhenNeeded f a = a `catchNonAsync` \_e -> do
-- done if writing the file fails. -- done if writing the file fails.
createAnnexDirectory (parentDir f) createAnnexDirectory (parentDir f)
a a
-- On windows, readFile does NewlineMode translation,
-- stripping CR before LF. When converting to ByteString,
-- use this to emulate that.
fileLines :: L.ByteString -> [L.ByteString]
#ifdef mingw32_HOST_OS
fileLines = map stripCR . L8.lines
where
stripCR b = case L8.unsnoc b of
Nothing -> b
Just (b', e)
| e == '\r' -> b'
| otherwise -> b
#else
fileLines = L8.lines
#endif
fileLines' :: S.ByteString -> [S.ByteString]
#ifdef mingw32_HOST_OS
fileLines' = map stripCR . S8.lines
where
stripCR b = case S8.unsnoc b of
Nothing -> b
Just (b', e)
| e == '\r' -> b'
| otherwise -> b
#else
fileLines' = S8.lines
#endif