started git-annex recompute

The perform action of this still needs work to do the right thing.
In particular, it currently behaves as if --others was always set.
And, it duplicates a lot of code from addcomputed.
This commit is contained in:
Joey Hess 2025-02-26 11:25:32 -04:00
parent d49f371acc
commit 3bec89a3c3
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
7 changed files with 304 additions and 65 deletions

View file

@ -134,6 +134,7 @@ import qualified Command.UpdateProxy
import qualified Command.MaxSize import qualified Command.MaxSize
import qualified Command.Sim import qualified Command.Sim
import qualified Command.AddComputed import qualified Command.AddComputed
import qualified Command.Recompute
import qualified Command.Version import qualified Command.Version
import qualified Command.RemoteDaemon import qualified Command.RemoteDaemon
#ifdef WITH_ASSISTANT #ifdef WITH_ASSISTANT
@ -267,6 +268,7 @@ cmds testoptparser testrunner mkbenchmarkgenerator = map addGitAnnexCommonOption
, Command.MaxSize.cmd , Command.MaxSize.cmd
, Command.Sim.cmd , Command.Sim.cmd
, Command.AddComputed.cmd , Command.AddComputed.cmd
, Command.Recompute.cmd
, Command.Version.cmd , Command.Version.cmd
, Command.RemoteDaemon.cmd , Command.RemoteDaemon.cmd
#ifdef WITH_ASSISTANT #ifdef WITH_ASSISTANT

View file

@ -17,7 +17,6 @@ import qualified Types.Remote as Remote
import Annex.CatFile import Annex.CatFile
import Annex.Content.Presence import Annex.Content.Presence
import Annex.Ingest import Annex.Ingest
import Types.RemoteConfig
import Types.KeySource import Types.KeySource
import Messages.Progress import Messages.Progress
import Logs.Location import Logs.Location
@ -68,23 +67,20 @@ seek o = startConcurrency commandStages (seek' o)
seek' :: AddComputedOptions -> CommandSeek seek' :: AddComputedOptions -> CommandSeek
seek' o = do seek' o = do
r <- getParsed (computeRemote o) r <- getParsed (computeRemote o)
unless (Remote.typename (Remote.remotetype r) == Remote.typename Remote.Compute.remote) $ unless (Remote.Compute.isComputeRemote r) $
giveup "That is not a compute remote." giveup "That is not a compute remote."
let rc = unparsedRemoteConfig (Remote.config r) commandAction $ start o r
case Remote.Compute.getComputeProgram rc of
Left err -> giveup $
"Problem with the configuration of the compute remote: " ++ err
Right program -> commandAction $ start o r program
start :: AddComputedOptions -> Remote -> Remote.Compute.ComputeProgram -> CommandStart start :: AddComputedOptions -> Remote -> CommandStart
start o r program = starting "addcomputed" ai si $ perform o r program start o r = starting "addcomputed" ai si $ perform o r
where where
ai = ActionItemUUID (Remote.uuid r) (UnquotedString (Remote.name r)) ai = ActionItemUUID (Remote.uuid r) (UnquotedString (Remote.name r))
si = SeekInput (computeParams o) si = SeekInput (computeParams o)
perform :: AddComputedOptions -> Remote -> Remote.Compute.ComputeProgram -> CommandPerform perform :: AddComputedOptions -> Remote -> CommandPerform
perform o r program = do perform o r = do
program <- Remote.Compute.getComputeProgram r
repopath <- fromRepo Git.repoPath repopath <- fromRepo Git.repoPath
subdir <- liftIO $ relPathDirToFile repopath (literalOsPath ".") subdir <- liftIO $ relPathDirToFile repopath (literalOsPath ".")
let state = Remote.Compute.ComputeState let state = Remote.Compute.ComputeState
@ -100,24 +96,10 @@ perform o r program = do
showOutput showOutput
Remote.Compute.runComputeProgram program state Remote.Compute.runComputeProgram program state
(Remote.Compute.ImmutableState False) (Remote.Compute.ImmutableState False)
(getinputcontent fast) (getInputContent fast)
(go starttime fast) (go starttime fast)
next $ return True next $ return True
where where
getinputcontent fast p = catKeyFile p >>= \case
Just inputkey -> do
obj <- calcRepo (gitAnnexLocation inputkey)
if fast
then return (inputkey, Nothing)
else ifM (inAnnex inputkey)
( return (inputkey, Just obj)
, giveup $ "The computation needs the content of a file which is not present: " ++ fromOsPath p
)
Nothing -> ifM (liftIO $ doesFileExist p)
( giveup $ "The computation needs an input file that is not an annexed file: " ++ fromOsPath p
, giveup $ "The computation needs an input file which does not exist: " ++ fromOsPath p
)
go starttime fast state tmpdir = do go starttime fast state tmpdir = do
endtime <- liftIO currentMonotonicTimestamp endtime <- liftIO currentMonotonicTimestamp
let ts = calcduration starttime endtime let ts = calcduration starttime endtime
@ -175,3 +157,18 @@ perform o r program = do
isreproducible state = case reproducible o of isreproducible state = case reproducible o of
Just v -> isReproducible v Just v -> isReproducible v
Nothing -> Remote.Compute.computeReproducible state Nothing -> Remote.Compute.computeReproducible state
getInputContent :: Bool -> OsPath -> Annex (Key, Maybe OsPath)
getInputContent fast p = catKeyFile p >>= \case
Just inputkey -> do
obj <- calcRepo (gitAnnexLocation inputkey)
if fast
then return (inputkey, Nothing)
else ifM (inAnnex inputkey)
( return (inputkey, Just obj)
, giveup $ "The computation needs the content of a file which is not present: " ++ fromOsPath p
)
Nothing -> ifM (liftIO $ doesFileExist p)
( giveup $ "The computation needs an input file that is not an annexed file: " ++ fromOsPath p
, giveup $ "The computation needs an input file which does not exist: " ++ fromOsPath p
)

202
Command/Recompute.hs Normal file
View file

@ -0,0 +1,202 @@
{- git-annex command
-
- Copyright 2025 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE OverloadedStrings #-}
module Command.Recompute where
import Command
import qualified Git
import qualified Annex
import qualified Remote.Compute
import qualified Remote
import qualified Types.Remote as Remote
import Annex.CatFile
import Annex.Content.Presence
import Annex.Ingest
import Git.FilePath
import Types.RemoteConfig
import Types.KeySource
import Messages.Progress
import Logs.Location
import Utility.Metered
import Utility.MonotonicClock
import Backend.URL (fromUrl)
import Command.AddComputed (Reproducible(..), parseReproducible, getInputContent)
import qualified Data.Map as M
import Data.Time.Clock
cmd :: Command
cmd = notBareRepo $
command "recompute" SectionCommon "recompute computed files"
paramPaths (seek <$$> optParser)
data RecomputeOptions = RecomputeOptions
{ recomputeThese :: CmdParams
, originalOption :: Bool
, othersOption :: Bool
, reproducible :: Maybe Reproducible
, computeRemote :: Maybe (DeferredParse Remote)
}
optParser :: CmdParamsDesc -> Parser RecomputeOptions
optParser desc = RecomputeOptions
<$> cmdParams desc
<*> switch
( long "original"
<> help "recompute using original content of input files"
)
<*> switch
( long "others"
<> help "stage other files that are recomputed in passing"
)
<*> parseReproducible
<*> optional (mkParseRemoteOption <$> parseRemoteOption)
seek :: RecomputeOptions -> CommandSeek
seek o = startConcurrency commandStages (seek' o)
seek' :: RecomputeOptions -> CommandSeek
seek' o = do
computeremote <- maybe (pure Nothing) (Just <$$> getParsed)
(computeRemote o)
let seeker = AnnexedFileSeeker
{ startAction = const $ start o computeremote
, checkContentPresent = Nothing
, usesLocationLog = True
}
withFilesInGitAnnex ww seeker
=<< workTreeItems ww (recomputeThese o)
where
ww = WarnUnmatchLsFiles "recompute"
start :: RecomputeOptions -> Maybe Remote -> SeekInput -> OsPath -> Key -> CommandStart
start o (Just computeremote) si file key =
stopUnless (notElem (Remote.uuid computeremote) <$> loggedLocations key) $
start' o computeremote si file key
start o Nothing si file key = do
rs <- catMaybes <$> (mapM Remote.byUUID =<< loggedLocations key)
case sortOn Remote.cost $ filter Remote.Compute.isComputeRemote rs of
[] -> stop
(r:_) -> start' o r si file key
start' :: RecomputeOptions -> Remote -> SeekInput -> OsPath -> Key -> CommandStart
start' o r si file key =
Remote.Compute.getComputeState
(Remote.remoteStateHandle r) key >>= \case
Nothing -> stop
Just state ->
stopUnless (shouldrecompute state) $
starting "recompute" ai si $
perform o r file key state
where
ai = mkActionItem (key, file)
shouldrecompute state
| originalOption o = return True
| otherwise =
anyM (inputchanged state) $
M.toList (Remote.Compute.computeInputs state)
inputchanged state (inputfile, inputkey) = do
-- Note that the paths from the remote state are not to be
-- trusted to point to a file in the repository, but using
-- the path with catKeyFile will only succeed if it
-- is checked into the repository.
p <- fromRepo $ fromTopFilePath $ asTopFilePath $
Remote.Compute.computeSubdir state </> inputfile
catKeyFile p >>= return . \case
Just k -> k /= inputkey
-- When an input file is missing, go ahead and
-- recompute. This way, the user will see the
-- computation fail, with an error message that
-- explains the problem.
-- XXX check that this works well
Nothing -> True
perform :: RecomputeOptions -> Remote -> OsPath -> Key -> Remote.Compute.ComputeState -> CommandPerform
perform o r file key oldstate = do
program <- Remote.Compute.getComputeProgram r
let recomputestate = oldstate
{ Remote.Compute.computeInputs = mempty
, Remote.Compute.computeOutputs = mempty
}
fast <- Annex.getRead Annex.fast
starttime <- liftIO currentMonotonicTimestamp
showOutput
Remote.Compute.runComputeProgram program recomputestate
(Remote.Compute.ImmutableState False)
(getinputcontent program fast)
(go starttime fast)
next $ return True
where
getinputcontent program fast p
| originalOption o =
case M.lookup p (Remote.Compute.computeInputs oldstate) of
Just inputkey -> return (inputkey, Nothing)
Nothing -> Remote.Compute.computationBehaviorChangeError program
"requesting a new input file" p
| otherwise = getInputContent fast p
go starttime fast state tmpdir = do
endtime <- liftIO currentMonotonicTimestamp
let ts = calcduration starttime endtime
let outputs = Remote.Compute.computeOutputs state
when (M.null outputs) $
giveup "The computation succeeded, but it did not generate any files."
oks <- forM (M.keys outputs) $ \outputfile -> do
showAction $ "adding " <> QuotedPath outputfile
k <- catchNonAsync (addfile fast state tmpdir outputfile)
(\err -> giveup $ "Failed to ingest output file " ++ fromOsPath outputfile ++ ": " ++ show err)
return (outputfile, Just k)
let state' = state
{ Remote.Compute.computeOutputs = M.fromList oks
}
forM_ (mapMaybe snd oks) $ \k -> do
Remote.Compute.setComputeState
(Remote.remoteStateHandle r)
k ts state'
logChange NoLiveUpdate k (Remote.uuid r) InfoPresent
addfile fast state tmpdir outputfile
| fast = do
addSymlink outputfile stateurlk Nothing
return stateurlk
| isreproducible state = do
sz <- liftIO $ getFileSize outputfile'
metered Nothing sz Nothing $ \_ p ->
ingestwith $ ingestAdd p (Just ld)
| otherwise = ingestwith $
ingestAdd' nullMeterUpdate (Just ld) (Just stateurlk)
where
stateurl = Remote.Compute.computeStateUrl r state outputfile
stateurlk = fromUrl stateurl Nothing True
outputfile' = tmpdir </> outputfile
ld = LockedDown ldc $ KeySource
{ keyFilename = outputfile
, contentLocation = outputfile'
, inodeCache = Nothing
}
ingestwith a = a >>= \case
Nothing -> giveup "key generation failed"
Just k -> do
logStatus NoLiveUpdate k InfoPresent
return k
ldc = LockDownConfig
{ lockingFile = True
, hardlinkFileTmpDir = Nothing
, checkWritePerms = True
}
calcduration (MonotonicTimestamp starttime) (MonotonicTimestamp endtime) =
fromIntegral (endtime - starttime) :: NominalDiffTime
isreproducible state = case reproducible o of
Just v -> isReproducible v
Nothing -> Remote.Compute.computeReproducible state

View file

@ -9,14 +9,16 @@
module Remote.Compute ( module Remote.Compute (
remote, remote,
isComputeRemote,
ComputeState(..), ComputeState(..),
setComputeState, setComputeState,
getComputeStates, getComputeState,
computeStateUrl, computeStateUrl,
ComputeProgram, ComputeProgram,
getComputeProgram, getComputeProgram,
runComputeProgram, runComputeProgram,
ImmutableState(..), ImmutableState(..),
computationBehaviorChangeError,
defaultComputeParams, defaultComputeParams,
) where ) where
@ -63,8 +65,11 @@ remote = RemoteType
, thirdPartyPopulated = False , thirdPartyPopulated = False
} }
isComputeRemote :: Remote -> Bool
isComputeRemote r = typename (remotetype r) == typename remote
gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> RemoteStateHandle -> Annex (Maybe Remote) gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> RemoteStateHandle -> Annex (Maybe Remote)
gen r u rc gc rs = case getComputeProgram rc of gen r u rc gc rs = case getComputeProgram' rc of
Left _err -> return Nothing Left _err -> return Nothing
Right program -> do Right program -> do
c <- parsedRemoteConfig remote rc c <- parsedRemoteConfig remote rc
@ -107,7 +112,7 @@ gen r u rc gc rs = case getComputeProgram rc of
setupInstance :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> Annex (RemoteConfig, UUID) setupInstance :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> Annex (RemoteConfig, UUID)
setupInstance _ mu _ c _ = do setupInstance _ mu _ c _ = do
ComputeProgram program <- either giveup return (getComputeProgram c) ComputeProgram program <- either giveup return $ getComputeProgram' c
unlessM (liftIO $ inSearchPath program) $ unlessM (liftIO $ inSearchPath program) $
giveup $ "Cannot find " ++ program ++ " in PATH" giveup $ "Cannot find " ++ program ++ " in PATH"
u <- maybe (liftIO genUUID) return mu u <- maybe (liftIO genUUID) return mu
@ -136,8 +141,15 @@ defaultComputeParams = map mk . M.toList . getRemoteConfigPassedThrough . config
newtype ComputeProgram = ComputeProgram String newtype ComputeProgram = ComputeProgram String
deriving (Show) deriving (Show)
getComputeProgram :: RemoteConfig -> Either String ComputeProgram getComputeProgram :: Remote -> Annex ComputeProgram
getComputeProgram c = case fromProposedAccepted <$> M.lookup programField c of getComputeProgram r =
case getComputeProgram' (unparsedRemoteConfig (config r)) of
Right program -> return program
Left err -> giveup $
"Problem with the configuration of compute remote " ++ name r ++ ": " ++ err
getComputeProgram' :: RemoteConfig -> Either String ComputeProgram
getComputeProgram' c = case fromProposedAccepted <$> M.lookup programField c of
Just program Just program
| safetyPrefix `isPrefixOf` program -> | safetyPrefix `isPrefixOf` program ->
Right (ComputeProgram program) Right (ComputeProgram program)
@ -285,8 +297,15 @@ setComputeState rs k ts st = addRemoteMetaData k rs $ MetaData $ M.singleton
(mkMetaFieldUnchecked $ T.pack ('t':show (truncateResolution 1 ts))) (mkMetaFieldUnchecked $ T.pack ('t':show (truncateResolution 1 ts)))
(S.singleton (MetaValue (CurrentlySet True) (formatComputeState k st))) (S.singleton (MetaValue (CurrentlySet True) (formatComputeState k st)))
getComputeStates :: RemoteStateHandle -> Key -> Annex [(NominalDiffTime, ComputeState)] {- When multiple ComputeStates have been recorded for the same key,
getComputeStates rs k = do - this returns one that is probably less expensive to compute,
- based on the original time it took to compute it. -}
getComputeState:: RemoteStateHandle -> Key -> Annex (Maybe ComputeState)
getComputeState rs k = headMaybe . map snd . sortOn fst
<$> getComputeStatesUnsorted rs k
getComputeStatesUnsorted :: RemoteStateHandle -> Key -> Annex [(NominalDiffTime, ComputeState)]
getComputeStatesUnsorted rs k = do
RemoteMetaData _ (MetaData m) <- getCurrentRemoteMetaData rs k RemoteMetaData _ (MetaData m) <- getCurrentRemoteMetaData rs k
return $ go [] (M.toList m) return $ go [] (M.toList m)
where where
@ -369,7 +388,7 @@ runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate)
let f' = toOsPath f let f' = toOsPath f
let knowninput = M.member f' (computeInputs state') let knowninput = M.member f' (computeInputs state')
checksafefile tmpdir subdir f' "input" checksafefile tmpdir subdir f' "input"
checkimmutable knowninput l $ do checkimmutable knowninput "inputting" f' $ do
(k, mp) <- getinputcontent f' (k, mp) <- getinputcontent f'
mp' <- liftIO $ maybe (pure Nothing) mp' <- liftIO $ maybe (pure Nothing)
(Just <$$> relPathDirToFile subdir) (Just <$$> relPathDirToFile subdir)
@ -388,7 +407,7 @@ runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate)
let f' = toOsPath f let f' = toOsPath f
checksafefile tmpdir subdir f' "output" checksafefile tmpdir subdir f' "output"
let knownoutput = M.member f' (computeOutputs state') let knownoutput = M.member f' (computeOutputs state')
checkimmutable knownoutput l $ checkimmutable knownoutput "outputting" f' $
return $ if knownoutput return $ if knownoutput
then state' then state'
else state' else state'
@ -412,25 +431,31 @@ runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate)
when (any (\p -> dropTrailingPathSeparator p == literalOsPath ".git") (splitPath f)) $ when (any (\p -> dropTrailingPathSeparator p == literalOsPath ".git") (splitPath f)) $
err "inside the .git directory" err "inside the .git directory"
checkimmutable True _ a = a checkimmutable True _ _ a = a
checkimmutable False l a checkimmutable False requestdesc p a
| not immutablestate = a | not immutablestate = a
| otherwise = giveup $ | otherwise = computationBehaviorChangeError (ComputeProgram program) requestdesc p
program ++ " is not behaving the same way it used to, now outputting: \"" ++ l ++ "\""
computationBehaviorChangeError :: ComputeProgram -> String -> OsPath -> Annex a
computationBehaviorChangeError (ComputeProgram program) requestdesc p =
giveup $ program ++ " is not behaving the same way it used to, now " ++ requestdesc ++ ": " ++ fromOsPath p
computeKey :: RemoteStateHandle -> ComputeProgram -> Key -> AssociatedFile -> OsPath -> MeterUpdate -> VerifyConfig -> Annex Verification computeKey :: RemoteStateHandle -> ComputeProgram -> Key -> AssociatedFile -> OsPath -> MeterUpdate -> VerifyConfig -> Annex Verification
computeKey rs (ComputeProgram program) k af dest p vc = do computeKey rs (ComputeProgram program) k af dest p vc =
states <- map snd . sortOn fst -- least expensive probably getComputeState rs k >>= \case
<$> getComputeStates rs k Just state ->
case mapMaybe computeskey states of case computeskey state of
((keyfile, state):_) -> runComputeProgram Just keyfile -> runComputeProgram
(ComputeProgram program) (ComputeProgram program)
state state
(ImmutableState True) (ImmutableState True)
(getinputcontent state) (getinputcontent state)
(go keyfile) (go keyfile)
[] -> giveup "Missing compute state" Nothing -> missingstate
Nothing -> missingstate
where where
missingstate = giveup "Missing compute state"
getinputcontent state f = getinputcontent state f =
case M.lookup (fromOsPath f) (computeInputs state) of case M.lookup (fromOsPath f) (computeInputs state) of
Just inputkey -> do Just inputkey -> do
@ -441,7 +466,7 @@ computeKey rs (ComputeProgram program) k af dest p vc = do
computeskey state = computeskey state =
case M.keys $ M.filter (== Just k) (computeOutputs state) of case M.keys $ M.filter (== Just k) (computeOutputs state) of
(keyfile : _) -> Just (keyfile, state) (keyfile : _) -> Just keyfile
[] -> Nothing [] -> Nothing
go keyfile state tmpdir = do go keyfile state tmpdir = do
@ -470,7 +495,7 @@ computeKey rs (ComputeProgram program) k af dest p vc = do
-- Make sure that the compute state exists. -- Make sure that the compute state exists.
checkKey :: RemoteStateHandle -> Key -> Annex Bool checkKey :: RemoteStateHandle -> Key -> Annex Bool
checkKey rs k = do checkKey rs k = do
states <- getComputeStates rs k states <- getComputeStatesUnsorted rs k
if null states if null states
then giveup "Missing compute state" then giveup "Missing compute state"
else return True else return True

View file

@ -78,9 +78,9 @@ the parameters provided to `git-annex addcomputed`.
reproducible output (except when using `--fast`). reproducible output (except when using `--fast`).
If a computation turns out not to be fully reproducible, then getting If a computation turns out not to be fully reproducible, then getting
the file from the compute remote will later fail with a checksum a computed file from the compute remote will later fail with a
verification error. One thing that can be done then is to use checksum verification error. One thing that can be done then is to use
`git-annex recompute --unreproducible`. `git-annex recompute --original --unreproducible`.
* Also the [[git-annex-common-options]](1) can be used. * Also the [[git-annex-common-options]](1) can be used.

View file

@ -1,6 +1,6 @@
# NAME # NAME
git-annex recompute - update computed files git-annex recompute - recompute computed files
# SYNOPSIS # SYNOPSIS
@ -9,18 +9,24 @@ git-annex recompute [path ...]`
# DESCRIPTION # DESCRIPTION
This updates computed files that were added with This updates computed files that were added with
[[git-annex-addcomputed]](1). [[git-annex-addcomputed]](1).
When the output of the computation is different, the updated computed
file is staged in the repository.
By default, this only recomputes files whose input files have changed. By default, this only recomputes files whose input files have changed.
The new contents of the input files are used to re-run the computation, The new contents of the input files are used to re-run the computation.
and when the output is different, the updated computed file is staged
in the repository.
# OPTIONS # OPTIONS
* `--unchanged` * `--original`
Recompute files even when their input files have not changed. Use the original content of input files.
* `--others`
When recomputing one file also generates new versions of other files,
stage those other files in the repository too.
* `--unreproducible`, `-u` * `--unreproducible`, `-u`
@ -32,14 +38,20 @@ in the repository.
Convert files that were added with `git-annex addcomputed --unreproducible` Convert files that were added with `git-annex addcomputed --unreproducible`
to be as if they were added with `--reproducible`. to be as if they were added with `--reproducible`.
* `--remote=name`
Only recompute files that were computed by this compute remote.
When this option is not used, all computed files are recomputed using
whatever compute remote was originally used to add them. In cases where
a file can be computed by multiple remotes, the one with the lowest
configured cost will be used.
* matching options * matching options
The [[git-annex-matching-options]](1) can be used to control what The [[git-annex-matching-options]](1) can be used to control what
files to recompute. files to recompute.
For example, to only recompute files that are computed by the "photoconv"
compute remote, use `--in=photoconv`
* Also the [[git-annex-common-options]](1) can be used. * Also the [[git-annex-common-options]](1) can be used.
# SEE ALSO # SEE ALSO

View file

@ -728,6 +728,7 @@ Executable git-annex
Command.Proxy Command.Proxy
Command.Pull Command.Pull
Command.Push Command.Push
Command.Recompute
Command.ReKey Command.ReKey
Command.ReadPresentKey Command.ReadPresentKey
Command.RecvKey Command.RecvKey