started git-annex recompute

The perform action of this still needs work to do the right thing.
In particular, it currently behaves as if --others was always set.
And, it duplicates a lot of code from addcomputed.
This commit is contained in:
Joey Hess 2025-02-26 11:25:32 -04:00
parent d49f371acc
commit 3bec89a3c3
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
7 changed files with 304 additions and 65 deletions

View file

@ -134,6 +134,7 @@ import qualified Command.UpdateProxy
import qualified Command.MaxSize
import qualified Command.Sim
import qualified Command.AddComputed
import qualified Command.Recompute
import qualified Command.Version
import qualified Command.RemoteDaemon
#ifdef WITH_ASSISTANT
@ -267,6 +268,7 @@ cmds testoptparser testrunner mkbenchmarkgenerator = map addGitAnnexCommonOption
, Command.MaxSize.cmd
, Command.Sim.cmd
, Command.AddComputed.cmd
, Command.Recompute.cmd
, Command.Version.cmd
, Command.RemoteDaemon.cmd
#ifdef WITH_ASSISTANT

View file

@ -17,7 +17,6 @@ import qualified Types.Remote as Remote
import Annex.CatFile
import Annex.Content.Presence
import Annex.Ingest
import Types.RemoteConfig
import Types.KeySource
import Messages.Progress
import Logs.Location
@ -68,23 +67,20 @@ seek o = startConcurrency commandStages (seek' o)
seek' :: AddComputedOptions -> CommandSeek
seek' o = do
r <- getParsed (computeRemote o)
unless (Remote.typename (Remote.remotetype r) == Remote.typename Remote.Compute.remote) $
unless (Remote.Compute.isComputeRemote r) $
giveup "That is not a compute remote."
let rc = unparsedRemoteConfig (Remote.config r)
case Remote.Compute.getComputeProgram rc of
Left err -> giveup $
"Problem with the configuration of the compute remote: " ++ err
Right program -> commandAction $ start o r program
commandAction $ start o r
start :: AddComputedOptions -> Remote -> Remote.Compute.ComputeProgram -> CommandStart
start o r program = starting "addcomputed" ai si $ perform o r program
start :: AddComputedOptions -> Remote -> CommandStart
start o r = starting "addcomputed" ai si $ perform o r
where
ai = ActionItemUUID (Remote.uuid r) (UnquotedString (Remote.name r))
si = SeekInput (computeParams o)
perform :: AddComputedOptions -> Remote -> Remote.Compute.ComputeProgram -> CommandPerform
perform o r program = do
perform :: AddComputedOptions -> Remote -> CommandPerform
perform o r = do
program <- Remote.Compute.getComputeProgram r
repopath <- fromRepo Git.repoPath
subdir <- liftIO $ relPathDirToFile repopath (literalOsPath ".")
let state = Remote.Compute.ComputeState
@ -100,24 +96,10 @@ perform o r program = do
showOutput
Remote.Compute.runComputeProgram program state
(Remote.Compute.ImmutableState False)
(getinputcontent fast)
(getInputContent fast)
(go starttime fast)
next $ return True
where
getinputcontent fast p = catKeyFile p >>= \case
Just inputkey -> do
obj <- calcRepo (gitAnnexLocation inputkey)
if fast
then return (inputkey, Nothing)
else ifM (inAnnex inputkey)
( return (inputkey, Just obj)
, giveup $ "The computation needs the content of a file which is not present: " ++ fromOsPath p
)
Nothing -> ifM (liftIO $ doesFileExist p)
( giveup $ "The computation needs an input file that is not an annexed file: " ++ fromOsPath p
, giveup $ "The computation needs an input file which does not exist: " ++ fromOsPath p
)
go starttime fast state tmpdir = do
endtime <- liftIO currentMonotonicTimestamp
let ts = calcduration starttime endtime
@ -175,3 +157,18 @@ perform o r program = do
isreproducible state = case reproducible o of
Just v -> isReproducible v
Nothing -> Remote.Compute.computeReproducible state
getInputContent :: Bool -> OsPath -> Annex (Key, Maybe OsPath)
getInputContent fast p = catKeyFile p >>= \case
Just inputkey -> do
obj <- calcRepo (gitAnnexLocation inputkey)
if fast
then return (inputkey, Nothing)
else ifM (inAnnex inputkey)
( return (inputkey, Just obj)
, giveup $ "The computation needs the content of a file which is not present: " ++ fromOsPath p
)
Nothing -> ifM (liftIO $ doesFileExist p)
( giveup $ "The computation needs an input file that is not an annexed file: " ++ fromOsPath p
, giveup $ "The computation needs an input file which does not exist: " ++ fromOsPath p
)

202
Command/Recompute.hs Normal file
View file

@ -0,0 +1,202 @@
{- git-annex command
-
- Copyright 2025 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE OverloadedStrings #-}
module Command.Recompute where
import Command
import qualified Git
import qualified Annex
import qualified Remote.Compute
import qualified Remote
import qualified Types.Remote as Remote
import Annex.CatFile
import Annex.Content.Presence
import Annex.Ingest
import Git.FilePath
import Types.RemoteConfig
import Types.KeySource
import Messages.Progress
import Logs.Location
import Utility.Metered
import Utility.MonotonicClock
import Backend.URL (fromUrl)
import Command.AddComputed (Reproducible(..), parseReproducible, getInputContent)
import qualified Data.Map as M
import Data.Time.Clock
cmd :: Command
cmd = notBareRepo $
command "recompute" SectionCommon "recompute computed files"
paramPaths (seek <$$> optParser)
data RecomputeOptions = RecomputeOptions
{ recomputeThese :: CmdParams
, originalOption :: Bool
, othersOption :: Bool
, reproducible :: Maybe Reproducible
, computeRemote :: Maybe (DeferredParse Remote)
}
optParser :: CmdParamsDesc -> Parser RecomputeOptions
optParser desc = RecomputeOptions
<$> cmdParams desc
<*> switch
( long "original"
<> help "recompute using original content of input files"
)
<*> switch
( long "others"
<> help "stage other files that are recomputed in passing"
)
<*> parseReproducible
<*> optional (mkParseRemoteOption <$> parseRemoteOption)
seek :: RecomputeOptions -> CommandSeek
seek o = startConcurrency commandStages (seek' o)
seek' :: RecomputeOptions -> CommandSeek
seek' o = do
computeremote <- maybe (pure Nothing) (Just <$$> getParsed)
(computeRemote o)
let seeker = AnnexedFileSeeker
{ startAction = const $ start o computeremote
, checkContentPresent = Nothing
, usesLocationLog = True
}
withFilesInGitAnnex ww seeker
=<< workTreeItems ww (recomputeThese o)
where
ww = WarnUnmatchLsFiles "recompute"
start :: RecomputeOptions -> Maybe Remote -> SeekInput -> OsPath -> Key -> CommandStart
start o (Just computeremote) si file key =
stopUnless (notElem (Remote.uuid computeremote) <$> loggedLocations key) $
start' o computeremote si file key
start o Nothing si file key = do
rs <- catMaybes <$> (mapM Remote.byUUID =<< loggedLocations key)
case sortOn Remote.cost $ filter Remote.Compute.isComputeRemote rs of
[] -> stop
(r:_) -> start' o r si file key
start' :: RecomputeOptions -> Remote -> SeekInput -> OsPath -> Key -> CommandStart
start' o r si file key =
Remote.Compute.getComputeState
(Remote.remoteStateHandle r) key >>= \case
Nothing -> stop
Just state ->
stopUnless (shouldrecompute state) $
starting "recompute" ai si $
perform o r file key state
where
ai = mkActionItem (key, file)
shouldrecompute state
| originalOption o = return True
| otherwise =
anyM (inputchanged state) $
M.toList (Remote.Compute.computeInputs state)
inputchanged state (inputfile, inputkey) = do
-- Note that the paths from the remote state are not to be
-- trusted to point to a file in the repository, but using
-- the path with catKeyFile will only succeed if it
-- is checked into the repository.
p <- fromRepo $ fromTopFilePath $ asTopFilePath $
Remote.Compute.computeSubdir state </> inputfile
catKeyFile p >>= return . \case
Just k -> k /= inputkey
-- When an input file is missing, go ahead and
-- recompute. This way, the user will see the
-- computation fail, with an error message that
-- explains the problem.
-- XXX check that this works well
Nothing -> True
perform :: RecomputeOptions -> Remote -> OsPath -> Key -> Remote.Compute.ComputeState -> CommandPerform
perform o r file key oldstate = do
program <- Remote.Compute.getComputeProgram r
let recomputestate = oldstate
{ Remote.Compute.computeInputs = mempty
, Remote.Compute.computeOutputs = mempty
}
fast <- Annex.getRead Annex.fast
starttime <- liftIO currentMonotonicTimestamp
showOutput
Remote.Compute.runComputeProgram program recomputestate
(Remote.Compute.ImmutableState False)
(getinputcontent program fast)
(go starttime fast)
next $ return True
where
getinputcontent program fast p
| originalOption o =
case M.lookup p (Remote.Compute.computeInputs oldstate) of
Just inputkey -> return (inputkey, Nothing)
Nothing -> Remote.Compute.computationBehaviorChangeError program
"requesting a new input file" p
| otherwise = getInputContent fast p
go starttime fast state tmpdir = do
endtime <- liftIO currentMonotonicTimestamp
let ts = calcduration starttime endtime
let outputs = Remote.Compute.computeOutputs state
when (M.null outputs) $
giveup "The computation succeeded, but it did not generate any files."
oks <- forM (M.keys outputs) $ \outputfile -> do
showAction $ "adding " <> QuotedPath outputfile
k <- catchNonAsync (addfile fast state tmpdir outputfile)
(\err -> giveup $ "Failed to ingest output file " ++ fromOsPath outputfile ++ ": " ++ show err)
return (outputfile, Just k)
let state' = state
{ Remote.Compute.computeOutputs = M.fromList oks
}
forM_ (mapMaybe snd oks) $ \k -> do
Remote.Compute.setComputeState
(Remote.remoteStateHandle r)
k ts state'
logChange NoLiveUpdate k (Remote.uuid r) InfoPresent
addfile fast state tmpdir outputfile
| fast = do
addSymlink outputfile stateurlk Nothing
return stateurlk
| isreproducible state = do
sz <- liftIO $ getFileSize outputfile'
metered Nothing sz Nothing $ \_ p ->
ingestwith $ ingestAdd p (Just ld)
| otherwise = ingestwith $
ingestAdd' nullMeterUpdate (Just ld) (Just stateurlk)
where
stateurl = Remote.Compute.computeStateUrl r state outputfile
stateurlk = fromUrl stateurl Nothing True
outputfile' = tmpdir </> outputfile
ld = LockedDown ldc $ KeySource
{ keyFilename = outputfile
, contentLocation = outputfile'
, inodeCache = Nothing
}
ingestwith a = a >>= \case
Nothing -> giveup "key generation failed"
Just k -> do
logStatus NoLiveUpdate k InfoPresent
return k
ldc = LockDownConfig
{ lockingFile = True
, hardlinkFileTmpDir = Nothing
, checkWritePerms = True
}
calcduration (MonotonicTimestamp starttime) (MonotonicTimestamp endtime) =
fromIntegral (endtime - starttime) :: NominalDiffTime
isreproducible state = case reproducible o of
Just v -> isReproducible v
Nothing -> Remote.Compute.computeReproducible state

View file

@ -9,14 +9,16 @@
module Remote.Compute (
remote,
isComputeRemote,
ComputeState(..),
setComputeState,
getComputeStates,
getComputeState,
computeStateUrl,
ComputeProgram,
getComputeProgram,
runComputeProgram,
ImmutableState(..),
computationBehaviorChangeError,
defaultComputeParams,
) where
@ -63,8 +65,11 @@ remote = RemoteType
, thirdPartyPopulated = False
}
isComputeRemote :: Remote -> Bool
isComputeRemote r = typename (remotetype r) == typename remote
gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> RemoteStateHandle -> Annex (Maybe Remote)
gen r u rc gc rs = case getComputeProgram rc of
gen r u rc gc rs = case getComputeProgram' rc of
Left _err -> return Nothing
Right program -> do
c <- parsedRemoteConfig remote rc
@ -107,7 +112,7 @@ gen r u rc gc rs = case getComputeProgram rc of
setupInstance :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> Annex (RemoteConfig, UUID)
setupInstance _ mu _ c _ = do
ComputeProgram program <- either giveup return (getComputeProgram c)
ComputeProgram program <- either giveup return $ getComputeProgram' c
unlessM (liftIO $ inSearchPath program) $
giveup $ "Cannot find " ++ program ++ " in PATH"
u <- maybe (liftIO genUUID) return mu
@ -136,8 +141,15 @@ defaultComputeParams = map mk . M.toList . getRemoteConfigPassedThrough . config
newtype ComputeProgram = ComputeProgram String
deriving (Show)
getComputeProgram :: RemoteConfig -> Either String ComputeProgram
getComputeProgram c = case fromProposedAccepted <$> M.lookup programField c of
getComputeProgram :: Remote -> Annex ComputeProgram
getComputeProgram r =
case getComputeProgram' (unparsedRemoteConfig (config r)) of
Right program -> return program
Left err -> giveup $
"Problem with the configuration of compute remote " ++ name r ++ ": " ++ err
getComputeProgram' :: RemoteConfig -> Either String ComputeProgram
getComputeProgram' c = case fromProposedAccepted <$> M.lookup programField c of
Just program
| safetyPrefix `isPrefixOf` program ->
Right (ComputeProgram program)
@ -285,8 +297,15 @@ setComputeState rs k ts st = addRemoteMetaData k rs $ MetaData $ M.singleton
(mkMetaFieldUnchecked $ T.pack ('t':show (truncateResolution 1 ts)))
(S.singleton (MetaValue (CurrentlySet True) (formatComputeState k st)))
getComputeStates :: RemoteStateHandle -> Key -> Annex [(NominalDiffTime, ComputeState)]
getComputeStates rs k = do
{- When multiple ComputeStates have been recorded for the same key,
- this returns one that is probably less expensive to compute,
- based on the original time it took to compute it. -}
getComputeState:: RemoteStateHandle -> Key -> Annex (Maybe ComputeState)
getComputeState rs k = headMaybe . map snd . sortOn fst
<$> getComputeStatesUnsorted rs k
getComputeStatesUnsorted :: RemoteStateHandle -> Key -> Annex [(NominalDiffTime, ComputeState)]
getComputeStatesUnsorted rs k = do
RemoteMetaData _ (MetaData m) <- getCurrentRemoteMetaData rs k
return $ go [] (M.toList m)
where
@ -369,7 +388,7 @@ runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate)
let f' = toOsPath f
let knowninput = M.member f' (computeInputs state')
checksafefile tmpdir subdir f' "input"
checkimmutable knowninput l $ do
checkimmutable knowninput "inputting" f' $ do
(k, mp) <- getinputcontent f'
mp' <- liftIO $ maybe (pure Nothing)
(Just <$$> relPathDirToFile subdir)
@ -388,7 +407,7 @@ runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate)
let f' = toOsPath f
checksafefile tmpdir subdir f' "output"
let knownoutput = M.member f' (computeOutputs state')
checkimmutable knownoutput l $
checkimmutable knownoutput "outputting" f' $
return $ if knownoutput
then state'
else state'
@ -412,25 +431,31 @@ runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate)
when (any (\p -> dropTrailingPathSeparator p == literalOsPath ".git") (splitPath f)) $
err "inside the .git directory"
checkimmutable True _ a = a
checkimmutable False l a
checkimmutable True _ _ a = a
checkimmutable False requestdesc p a
| not immutablestate = a
| otherwise = giveup $
program ++ " is not behaving the same way it used to, now outputting: \"" ++ l ++ "\""
| otherwise = computationBehaviorChangeError (ComputeProgram program) requestdesc p
computationBehaviorChangeError :: ComputeProgram -> String -> OsPath -> Annex a
computationBehaviorChangeError (ComputeProgram program) requestdesc p =
giveup $ program ++ " is not behaving the same way it used to, now " ++ requestdesc ++ ": " ++ fromOsPath p
computeKey :: RemoteStateHandle -> ComputeProgram -> Key -> AssociatedFile -> OsPath -> MeterUpdate -> VerifyConfig -> Annex Verification
computeKey rs (ComputeProgram program) k af dest p vc = do
states <- map snd . sortOn fst -- least expensive probably
<$> getComputeStates rs k
case mapMaybe computeskey states of
((keyfile, state):_) -> runComputeProgram
(ComputeProgram program)
state
(ImmutableState True)
(getinputcontent state)
(go keyfile)
[] -> giveup "Missing compute state"
computeKey rs (ComputeProgram program) k af dest p vc =
getComputeState rs k >>= \case
Just state ->
case computeskey state of
Just keyfile -> runComputeProgram
(ComputeProgram program)
state
(ImmutableState True)
(getinputcontent state)
(go keyfile)
Nothing -> missingstate
Nothing -> missingstate
where
missingstate = giveup "Missing compute state"
getinputcontent state f =
case M.lookup (fromOsPath f) (computeInputs state) of
Just inputkey -> do
@ -441,7 +466,7 @@ computeKey rs (ComputeProgram program) k af dest p vc = do
computeskey state =
case M.keys $ M.filter (== Just k) (computeOutputs state) of
(keyfile : _) -> Just (keyfile, state)
(keyfile : _) -> Just keyfile
[] -> Nothing
go keyfile state tmpdir = do
@ -470,7 +495,7 @@ computeKey rs (ComputeProgram program) k af dest p vc = do
-- Make sure that the compute state exists.
checkKey :: RemoteStateHandle -> Key -> Annex Bool
checkKey rs k = do
states <- getComputeStates rs k
states <- getComputeStatesUnsorted rs k
if null states
then giveup "Missing compute state"
else return True

View file

@ -78,9 +78,9 @@ the parameters provided to `git-annex addcomputed`.
reproducible output (except when using `--fast`).
If a computation turns out not to be fully reproducible, then getting
the file from the compute remote will later fail with a checksum
verification error. One thing that can be done then is to use
`git-annex recompute --unreproducible`.
a computed file from the compute remote will later fail with a
checksum verification error. One thing that can be done then is to use
`git-annex recompute --original --unreproducible`.
* Also the [[git-annex-common-options]](1) can be used.

View file

@ -1,6 +1,6 @@
# NAME
git-annex recompute - update computed files
git-annex recompute - recompute computed files
# SYNOPSIS
@ -9,18 +9,24 @@ git-annex recompute [path ...]`
# DESCRIPTION
This updates computed files that were added with
[[git-annex-addcomputed]](1).
[[git-annex-addcomputed]](1).
When the output of the computation is different, the updated computed
file is staged in the repository.
By default, this only recomputes files whose input files have changed.
The new contents of the input files are used to re-run the computation,
and when the output is different, the updated computed file is staged
in the repository.
The new contents of the input files are used to re-run the computation.
# OPTIONS
* `--unchanged`
* `--original`
Recompute files even when their input files have not changed.
Use the original content of input files.
* `--others`
When recomputing one file also generates new versions of other files,
stage those other files in the repository too.
* `--unreproducible`, `-u`
@ -32,14 +38,20 @@ in the repository.
Convert files that were added with `git-annex addcomputed --unreproducible`
to be as if they were added with `--reproducible`.
* `--remote=name`
Only recompute files that were computed by this compute remote.
When this option is not used, all computed files are recomputed using
whatever compute remote was originally used to add them. In cases where
a file can be computed by multiple remotes, the one with the lowest
configured cost will be used.
* matching options
The [[git-annex-matching-options]](1) can be used to control what
files to recompute.
For example, to only recompute files that are computed by the "photoconv"
compute remote, use `--in=photoconv`
* Also the [[git-annex-common-options]](1) can be used.
# SEE ALSO

View file

@ -728,6 +728,7 @@ Executable git-annex
Command.Proxy
Command.Pull
Command.Push
Command.Recompute
Command.ReKey
Command.ReadPresentKey
Command.RecvKey