add tailVerify

Not yet used, but this will let all remotes verify incrementally if it's
acceptable to pay the performance price. See comment for details of when
it will perform badly. I anticipate using this for all special remotes
that use fileRetriever. Except perhaps for a few like GitLFS that could
feed the incremental verifier themselves despite using that.

Sponsored-by: Dartmouth College's DANDI project
This commit is contained in:
Joey Hess 2021-08-12 14:36:56 -04:00
parent 2e54564061
commit b6efba8139
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38

View file

@ -5,6 +5,8 @@
- Licensed under the GNU AGPL version 3 or higher. - Licensed under the GNU AGPL version 3 or higher.
-} -}
{-# LANGUAGE CPP #-}
module Annex.Verify ( module Annex.Verify (
VerifyConfig(..), VerifyConfig(..),
shouldVerify, shouldVerify,
@ -16,6 +18,7 @@ module Annex.Verify (
isVerifiable, isVerifiable,
startVerifyKeyContentIncrementally, startVerifyKeyContentIncrementally,
IncrementalVerifier(..), IncrementalVerifier(..),
tailVerify,
) where ) where
import Annex.Common import Annex.Common
@ -29,6 +32,12 @@ import Annex.WorkerPool
import Types.WorkerPool import Types.WorkerPool
import Types.Key import Types.Key
#if WITH_INOTIFY
import qualified System.INotify as INotify
import Control.Concurrent.STM
import qualified Data.ByteString as S
#endif
data VerifyConfig = AlwaysVerify | NoVerify | RemoteVerify Remote | DefaultVerify data VerifyConfig = AlwaysVerify | NoVerify | RemoteVerify Remote | DefaultVerify
shouldVerify :: VerifyConfig -> Annex Bool shouldVerify :: VerifyConfig -> Annex Bool
@ -112,3 +121,128 @@ startVerifyKeyContentIncrementally verifyconfig k =
Nothing -> return Nothing Nothing -> return Nothing
, return Nothing , return Nothing
) )
-- | Reads the file as it grows, and feeds it to the incremental verifier.
--
-- The TMVar must start out empty, and be filled once whatever is
-- writing to the file finishes.
--
-- The file does not need to exist yet when this is called. It will wait
-- for the file to appear before opening it and starting verification.
--
-- Note that there are situations where the file may fail to verify despite
-- having the correct content. For example, when the file is written out
-- of order, or gets replaced part way through. To deal with such cases,
-- when False is returned, it should not be treated as if the file's
-- content is known to be incorrect, but instead as an indication that the
-- file should be verified again, once it's done being written to.
--
-- Also, this is not supported for all OSs, and will return False on OS's
-- where it is not supported.
--
-- This should be fairly efficient, reading from the disk cache,
-- as long as the writer does not get very far ahead of it. However,
-- there are situations where it would be much less expensive to verify
-- chunks as they are being written. For example, when resuming with
-- a lot of content in the file, all that content needs to be read,
-- and if the disk is slow, the reader may never catch up to the writer,
-- and the disk cache may never speed up reads. So this should only be
-- used when there's not a better way to incrementally verify.
tailVerify :: IncrementalVerifier -> FilePath -> TMVar () -> IO Bool
#if WITH_INOTIFY
tailVerify iv f finished =
catchDefaultIO False $ INotify.withINotify $ \i ->
bracket (waitforfiletoexist i) (maybe noop hClose) (go i)
where
rf = toRawFilePath f
d = toRawFilePath (takeDirectory f)
waitforfiletoexist i = tryIO (openBinaryFile f ReadMode) >>= \case
Right h -> return (Just h)
Left _ -> do
hv <- newEmptyTMVarIO
wd <- inotifycreate i $
tryIO (openBinaryFile f ReadMode) >>= \case
Right h ->
unlessM (atomically $ tryPutTMVar hv h) $
hClose h
Left _ -> return ()
-- Wait for the file to appear, or for a signal
-- that the file is finished being written.
--
-- The TMVar is left full to prevent the file
-- being opened again if the inotify event
-- fires more than once.
v <- atomically $
(Just <$> readTMVar hv)
`orElse`
((const Nothing) <$> readTMVar finished)
INotify.removeWatch wd
return v
inotifycreate i cont = INotify.addWatch i evs d $ \case
-- Ignore changes to other files in the directory.
INotify.Created { INotify.filePath = fn }
| fn /= rf -> noop
INotify.MovedIn { INotify.filePath = fn }
| fn /= rf -> noop
INotify.Opened { INotify.maybeFilePath = fn }
| fn /= Just rf -> noop
INotify.Modified { INotify.maybeFilePath = fn }
| fn /= Just rf -> noop
_ -> cont
where
evs =
[ INotify.Create
, INotify.MoveIn
, INotify.Move
, INotify.Open
, INotify.Modify
]
go i (Just h) = do
modified <- newEmptyTMVarIO
wd <- INotify.addWatch i [INotify.Modify] rf $ \_event ->
atomically $ void $ tryPutTMVar modified ()
r <- follow h modified
INotify.removeWatch wd
return r
-- File never showed up, but we've been told it's done being
-- written to.
go _ Nothing = return False
follow h modified = do
b <- S.hGetNonBlocking h chunk
if S.null b
then do
-- We've caught up to the writer.
-- Wait for the file to get modified again,
-- or until we're told it is done being
-- written.
cont <- atomically $
((const (follow h modified))
<$> takeTMVar modified)
`orElse`
((const (finish h))
<$> readTMVar finished)
cont
else do
updateIncremental iv b
follow h modified
-- We've been told the file is done being written to, but we
-- may not have reached the end of it yet. Read until EOF.
finish h = do
b <- S.hGet h chunk
if S.null b
then finalizeIncremental iv
else do
updateIncremental iv b
finish h
chunk = 65536
#else
tailVerify _ _ _ = return False -- not supported
#endif