many recompute improvements

I've lost track of them all, but it includes:

* Using the same key backend as was used in the original computation.
* Fixing bug that prevented updating the source file key in the compute
  state
* Handling --reproducible and --unreproducible.
* recompute --original of a file using VURL, when the result is
  different, but the key remains the same, makes the object file
  be updated with the new content
* Detecting some other ways the program behavior can change, just for
  completeness.
* Also adds --backend to addcomputed.
This commit is contained in:
Joey Hess 2025-02-27 15:12:29 -04:00
parent 1704b5e327
commit e6ae5e8d56
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
6 changed files with 127 additions and 54 deletions

View file

@ -28,7 +28,7 @@ import qualified Data.Map as M
import Data.Time.Clock import Data.Time.Clock
cmd :: Command cmd :: Command
cmd = notBareRepo $ cmd = notBareRepo $ withAnnexOptions [backendOption] $
command "addcomputed" SectionCommon "add computed files to annex" command "addcomputed" SectionCommon "add computed files to annex"
(paramRepeating paramExpression) (paramRepeating paramExpression)
(seek <$$> optParser) (seek <$$> optParser)
@ -96,11 +96,22 @@ perform o r = do
Remote.Compute.runComputeProgram program state Remote.Compute.runComputeProgram program state
(Remote.Compute.ImmutableState False) (Remote.Compute.ImmutableState False)
(getInputContent fast) (getInputContent fast)
(addComputed "adding" True r (reproducible o) Just fast) (addComputed "adding" True r (reproducible o) chooseBackend Just fast)
next $ return True next $ return True
addComputed :: StringContainingQuotedPath -> Bool -> Remote -> Maybe Reproducible -> (OsPath -> Maybe OsPath) -> Bool -> Remote.Compute.ComputeState -> OsPath -> NominalDiffTime -> Annex () addComputed
addComputed addaction stagefiles r reproducibleconfig destfile fast state tmpdir ts = do :: StringContainingQuotedPath
-> Bool
-> Remote
-> Maybe Reproducible
-> (OsPath -> Annex Backend)
-> (OsPath -> Maybe OsPath)
-> Bool
-> Remote.Compute.ComputeState
-> OsPath
-> NominalDiffTime
-> Annex ()
addComputed addaction stagefiles r reproducibleconfig choosebackend destfile fast state tmpdir ts = do
let outputs = Remote.Compute.computeOutputs state let outputs = Remote.Compute.computeOutputs state
when (M.null outputs) $ when (M.null outputs) $
giveup "The computation succeeded, but it did not generate any files." giveup "The computation succeeded, but it did not generate any files."
@ -146,22 +157,24 @@ addComputed addaction stagefiles r reproducibleconfig destfile fast state tmpdir
, contentLocation = outputfile' , contentLocation = outputfile'
, inodeCache = Nothing , inodeCache = Nothing
} }
genkey f p = do
backend <- choosebackend outputfile
fst <$> genKey (ks f) p backend
makelink f k = void $ makeLink f k Nothing
ingesthelper f p mk
| stagefiles = ingestwith $ do
k <- maybe (genkey f p) return mk
ingestAdd' p (Just (ld f)) (Just k)
| otherwise = ingestwith $ do
k <- maybe (genkey f p) return mk
mk' <- fst <$> ingest p (Just (ld f)) (Just k)
maybe noop (makelink f) mk'
return mk'
ingestwith a = a >>= \case ingestwith a = a >>= \case
Nothing -> giveup "ingestion failed" Nothing -> giveup "ingestion failed"
Just k -> do Just k -> do
logStatus NoLiveUpdate k InfoPresent logStatus NoLiveUpdate k InfoPresent
return k return k
genkey f p = do
backend <- chooseBackend outputfile
fst <$> genKey (ks f) p backend
makelink f k = void $ makeLink f k Nothing
ingesthelper f p mk
| stagefiles = ingestwith $
ingestAdd' p (Just (ld f)) mk
| otherwise = ingestwith $ do
mk' <- fst <$> ingest p (Just (ld f)) mk
maybe noop (makelink f) mk'
return mk'
ldc = LockDownConfig ldc = LockDownConfig
{ lockingFile = True { lockingFile = True

View file

@ -14,10 +14,13 @@ import qualified Annex
import qualified Remote.Compute import qualified Remote.Compute
import qualified Remote import qualified Remote
import qualified Types.Remote as Remote import qualified Types.Remote as Remote
import Annex.Content
import Annex.CatFile import Annex.CatFile
import Git.FilePath import Git.FilePath
import Logs.Location import Logs.Location
import Command.AddComputed (Reproducible(..), parseReproducible, getInputContent, getInputContent', addComputed) import Command.AddComputed (Reproducible(..), parseReproducible, getInputContent, getInputContent', addComputed)
import Backend (maybeLookupBackendVariety, unknownBackendVarietyMessage)
import Types.Key
import qualified Data.Map as M import qualified Data.Map as M
@ -62,7 +65,7 @@ seek' o = do
start :: RecomputeOptions -> Maybe Remote -> SeekInput -> OsPath -> Key -> CommandStart start :: RecomputeOptions -> Maybe Remote -> SeekInput -> OsPath -> Key -> CommandStart
start o (Just computeremote) si file key = start o (Just computeremote) si file key =
stopUnless (notElem (Remote.uuid computeremote) <$> loggedLocations key) $ stopUnless (elem (Remote.uuid computeremote) <$> loggedLocations key) $
start' o computeremote si file key start' o computeremote si file key
start o Nothing si file key = do start o Nothing si file key = do
rs <- catMaybes <$> (mapM Remote.byUUID =<< loggedLocations key) rs <- catMaybes <$> (mapM Remote.byUUID =<< loggedLocations key)
@ -103,31 +106,73 @@ start' o r si file key =
-- explains the problem. -- explains the problem.
Nothing -> True Nothing -> True
-- TODO When reproducible is not set, preserve the
-- reproducible/unreproducible of the input key.
perform :: RecomputeOptions -> Remote -> OsPath -> Key -> Remote.Compute.ComputeState -> CommandPerform perform :: RecomputeOptions -> Remote -> OsPath -> Key -> Remote.Compute.ComputeState -> CommandPerform
perform o r file key origstate = do perform o r file origkey origstate = do
program <- Remote.Compute.getComputeProgram r program <- Remote.Compute.getComputeProgram r
fast <- Annex.getRead Annex.fast reproducibleconfig <- getreproducibleconfig
showOutput showOutput
Remote.Compute.runComputeProgram program origstate Remote.Compute.runComputeProgram program origstate
(Remote.Compute.ImmutableState True) (Remote.Compute.ImmutableState False)
(getinputcontent program fast) (getinputcontent program)
(addComputed "processing" False r (reproducible o) destfile fast) (go program reproducibleconfig)
next $ return True next $ return True
where where
getinputcontent program fast p go program reproducibleconfig state tmpdir ts = do
checkbehaviorchange program state
addComputed "processing" False r reproducibleconfig
choosebackend destfile state tmpdir ts
checkbehaviorchange program state = do
let check s w a b = forM_ (M.keys (w a)) $ \f ->
unless (M.member f (w b)) $
Remote.Compute.computationBehaviorChangeError program s f
check "not using input file"
Remote.Compute.computeInputs origstate state
check "outputting"
Remote.Compute.computeOutputs state origstate
check "not outputting"
Remote.Compute.computeOutputs origstate state
getinputcontent program p
| originalOption o = | originalOption o =
case M.lookup p (Remote.Compute.computeInputs origstate) of case M.lookup p (Remote.Compute.computeInputs origstate) of
Just inputkey -> getInputContent' fast inputkey Just inputkey -> getInputContent' False inputkey
(fromOsPath p ++ "(key " ++ serializeKey inputkey ++ ")") (fromOsPath p ++ "(key " ++ serializeKey inputkey ++ ")")
Nothing -> Remote.Compute.computationBehaviorChangeError program Nothing -> Remote.Compute.computationBehaviorChangeError program
"requesting a new input file" p "requesting a new input file" p
| otherwise = getInputContent fast p | otherwise = getInputContent False p
destfile outputfile destfile outputfile
| Just outputfile == origfile = Just file | Just outputfile == origfile = Just file
| otherwise = Nothing | otherwise = Nothing
origfile = headMaybe $ M.keys $ M.filter (== Just key) origfile = headMaybe $ M.keys $ M.filter (== Just origkey)
(Remote.Compute.computeOutputs origstate) (Remote.Compute.computeOutputs origstate)
origbackendvariety = fromKey keyVariety origkey
getreproducibleconfig = case reproducible o of
Just (Reproducible True) -> return (Just (Reproducible True))
-- A VURL key is used when the computation was
-- unreproducible. So recomputing should too, but that
-- will result in the same VURL key. Since moveAnnex
-- will prefer the current annex object to a new one,
-- delete the annex object first, so that if recomputing
-- generates a new version of the file, it replaces
-- the old version.
v -> case origbackendvariety of
VURLKey -> do
lockContentForRemoval origkey noop removeAnnex
-- in case computation fails or is interupted
logStatus NoLiveUpdate origkey InfoMissing
return (Just (Reproducible False))
_ -> return v
choosebackend _outputfile
-- Use the same backend as was used to compute it before,
-- so if the computed file is the same, there will be
-- no change.
| otherwise = maybeLookupBackendVariety origbackendvariety >>= \case
Just b -> return b
Nothing -> giveup $ unknownBackendVarietyMessage origbackendvariety

View file

@ -399,8 +399,8 @@ runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate)
liftIO $ hPutStrLn (stdinHandle p) $ liftIO $ hPutStrLn (stdinHandle p) $
maybe "" fromOsPath mp' maybe "" fromOsPath mp'
liftIO $ hFlush (stdinHandle p) liftIO $ hFlush (stdinHandle p)
return $ if knowninput return $ if immutablestate
then state' then state
else state' else state'
{ computeInputs = { computeInputs =
M.insert f' k M.insert f' k
@ -411,8 +411,8 @@ runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate)
checksafefile tmpdir subdir f' "output" checksafefile tmpdir subdir f' "output"
let knownoutput = M.member f' (computeOutputs state') let knownoutput = M.member f' (computeOutputs state')
checkimmutable knownoutput "outputting" f' $ checkimmutable knownoutput "outputting" f' $
return $ if knownoutput return $ if immutablestate
then state' then state
else state' else state'
{ computeOutputs = { computeOutputs =
M.insert f' Nothing M.insert f' Nothing

View file

@ -1,3 +1,20 @@
* VURL keys don't currently have the hash key recorded in the equivilant
key log by addcompute or when getting from a compute remote.
* need progress bars for computations and implement PROGRESS message
* get input files for a computation (so `git-annex get .` gets every file,
even when input files in a directory are processed after computed files)
* autoinit security
* Support non-annexed files as inputs to computations.
* addcomputed should honor annex.addunlocked.
* Perhaps recompute should write a new version of a file as an unlocked
file when the file is currently unlocked?
* recompute could ingest keys for other files than the one being * recompute could ingest keys for other files than the one being
recomputed, and remember them. Then recomputing those files could just recomputed, and remember them. Then recomputing those files could just
use those keys, without re-running a computation. (Better than --others use those keys, without re-running a computation. (Better than --others
@ -18,18 +35,6 @@
Or it could build a DAG and traverse it, but building a DAG of a large Or it could build a DAG and traverse it, but building a DAG of a large
directory tree has its own problems. directory tree has its own problems.
* recompute should use the same key backend for a file that it used before
(except when --reproducible/--unreproducible is passed).
* Check recompute's handling of --reproducible and --unreproducible.
* addcomputed should honor annex.addunlocked.
* Perhaps recompute should write a new version of a file as an unlocked
file when the file is currently unlocked?
* Support non-annexed files as inputs to computations.
* Should addcomputed honor annex.smallfiles? That would seem to imply * Should addcomputed honor annex.smallfiles? That would seem to imply
that recompute should also support recomputing non-annexed files. that recompute should also support recomputing non-annexed files.
Otherwise, adding a file and then recomputing it would vary in Otherwise, adding a file and then recomputing it would vary in

View file

@ -82,6 +82,10 @@ the parameters provided to `git-annex addcomputed`.
checksum verification error. One thing that can be done then is to use checksum verification error. One thing that can be done then is to use
`git-annex recompute --original --unreproducible`. `git-annex recompute --original --unreproducible`.
* `--backend`
Specifies which key-value backend to use.
* Also the [[git-annex-common-options]](1) can be used. * Also the [[git-annex-common-options]](1) can be used.
# SEE ALSO # SEE ALSO

View file

@ -21,17 +21,7 @@ updated with the new content.
* `--original` * `--original`
Use the original content of input files. Re-run the computation with the original input files.
* `--unreproducible`, `-u`
Convert files that were added with `git-annex addcomputed --reproducible`
to be as if they were added without that option.
* `--reproducible`, `-r`
Convert files that were added with `git-annex addcomputed --unreproducible`
to be as if they were added with `--reproducible`.
* `--remote=name` * `--remote=name`
@ -42,6 +32,22 @@ updated with the new content.
a file can be computed by multiple remotes, the one with the lowest a file can be computed by multiple remotes, the one with the lowest
configured cost will be used. configured cost will be used.
* `--unreproducible`, `-u`
Indicate that the computation is not expected to be fully reproducible.
It can vary, in ways that produce files that equivilant enough to
be interchangeable, but are not necessarily identical.
This is the default unless the compute remote indicates that it produces
reproducible output.
* `--reproducible`, `-r`
Indicate that the computation is expected to be fully reproducible.
This is the default when the compute remote indicates that it produces
reproducible output.
* matching options * matching options
The [[git-annex-matching-options]](1) can be used to control what The [[git-annex-matching-options]](1) can be used to control what