many recompute improvements

I've lost track of them all, but it includes:

* Using the same key backend as was used in the original computation.
* Fixing bug that prevented updating the source file key in the compute
  state
* Handling --reproducible and --unreproducible.
* recompute --original of a file using VURL, when the result is
  different, but the key remains the same, makes the object file
  be updated with the new content
* Detecting some other ways the program behavior can change, just for
  completeness.
* Also adds --backend to addcomputed.
This commit is contained in:
Joey Hess 2025-02-27 15:12:29 -04:00
parent 1704b5e327
commit e6ae5e8d56
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
6 changed files with 127 additions and 54 deletions

View file

@ -28,7 +28,7 @@ import qualified Data.Map as M
import Data.Time.Clock
cmd :: Command
cmd = notBareRepo $
cmd = notBareRepo $ withAnnexOptions [backendOption] $
command "addcomputed" SectionCommon "add computed files to annex"
(paramRepeating paramExpression)
(seek <$$> optParser)
@ -96,11 +96,22 @@ perform o r = do
Remote.Compute.runComputeProgram program state
(Remote.Compute.ImmutableState False)
(getInputContent fast)
(addComputed "adding" True r (reproducible o) Just fast)
(addComputed "adding" True r (reproducible o) chooseBackend Just fast)
next $ return True
addComputed :: StringContainingQuotedPath -> Bool -> Remote -> Maybe Reproducible -> (OsPath -> Maybe OsPath) -> Bool -> Remote.Compute.ComputeState -> OsPath -> NominalDiffTime -> Annex ()
addComputed addaction stagefiles r reproducibleconfig destfile fast state tmpdir ts = do
addComputed
:: StringContainingQuotedPath
-> Bool
-> Remote
-> Maybe Reproducible
-> (OsPath -> Annex Backend)
-> (OsPath -> Maybe OsPath)
-> Bool
-> Remote.Compute.ComputeState
-> OsPath
-> NominalDiffTime
-> Annex ()
addComputed addaction stagefiles r reproducibleconfig choosebackend destfile fast state tmpdir ts = do
let outputs = Remote.Compute.computeOutputs state
when (M.null outputs) $
giveup "The computation succeeded, but it did not generate any files."
@ -146,22 +157,24 @@ addComputed addaction stagefiles r reproducibleconfig destfile fast state tmpdir
, contentLocation = outputfile'
, inodeCache = Nothing
}
genkey f p = do
backend <- choosebackend outputfile
fst <$> genKey (ks f) p backend
makelink f k = void $ makeLink f k Nothing
ingesthelper f p mk
| stagefiles = ingestwith $ do
k <- maybe (genkey f p) return mk
ingestAdd' p (Just (ld f)) (Just k)
| otherwise = ingestwith $ do
k <- maybe (genkey f p) return mk
mk' <- fst <$> ingest p (Just (ld f)) (Just k)
maybe noop (makelink f) mk'
return mk'
ingestwith a = a >>= \case
Nothing -> giveup "ingestion failed"
Just k -> do
logStatus NoLiveUpdate k InfoPresent
return k
genkey f p = do
backend <- chooseBackend outputfile
fst <$> genKey (ks f) p backend
makelink f k = void $ makeLink f k Nothing
ingesthelper f p mk
| stagefiles = ingestwith $
ingestAdd' p (Just (ld f)) mk
| otherwise = ingestwith $ do
mk' <- fst <$> ingest p (Just (ld f)) mk
maybe noop (makelink f) mk'
return mk'
ldc = LockDownConfig
{ lockingFile = True

View file

@ -14,10 +14,13 @@ import qualified Annex
import qualified Remote.Compute
import qualified Remote
import qualified Types.Remote as Remote
import Annex.Content
import Annex.CatFile
import Git.FilePath
import Logs.Location
import Command.AddComputed (Reproducible(..), parseReproducible, getInputContent, getInputContent', addComputed)
import Backend (maybeLookupBackendVariety, unknownBackendVarietyMessage)
import Types.Key
import qualified Data.Map as M
@ -62,7 +65,7 @@ seek' o = do
start :: RecomputeOptions -> Maybe Remote -> SeekInput -> OsPath -> Key -> CommandStart
start o (Just computeremote) si file key =
stopUnless (notElem (Remote.uuid computeremote) <$> loggedLocations key) $
stopUnless (elem (Remote.uuid computeremote) <$> loggedLocations key) $
start' o computeremote si file key
start o Nothing si file key = do
rs <- catMaybes <$> (mapM Remote.byUUID =<< loggedLocations key)
@ -103,31 +106,73 @@ start' o r si file key =
-- explains the problem.
Nothing -> True
-- TODO When reproducible is not set, preserve the
-- reproducible/unreproducible of the input key.
perform :: RecomputeOptions -> Remote -> OsPath -> Key -> Remote.Compute.ComputeState -> CommandPerform
perform o r file key origstate = do
perform o r file origkey origstate = do
program <- Remote.Compute.getComputeProgram r
fast <- Annex.getRead Annex.fast
reproducibleconfig <- getreproducibleconfig
showOutput
Remote.Compute.runComputeProgram program origstate
(Remote.Compute.ImmutableState True)
(getinputcontent program fast)
(addComputed "processing" False r (reproducible o) destfile fast)
(Remote.Compute.ImmutableState False)
(getinputcontent program)
(go program reproducibleconfig)
next $ return True
where
getinputcontent program fast p
go program reproducibleconfig state tmpdir ts = do
checkbehaviorchange program state
addComputed "processing" False r reproducibleconfig
choosebackend destfile state tmpdir ts
checkbehaviorchange program state = do
let check s w a b = forM_ (M.keys (w a)) $ \f ->
unless (M.member f (w b)) $
Remote.Compute.computationBehaviorChangeError program s f
check "not using input file"
Remote.Compute.computeInputs origstate state
check "outputting"
Remote.Compute.computeOutputs state origstate
check "not outputting"
Remote.Compute.computeOutputs origstate state
getinputcontent program p
| originalOption o =
case M.lookup p (Remote.Compute.computeInputs origstate) of
Just inputkey -> getInputContent' fast inputkey
Just inputkey -> getInputContent' False inputkey
(fromOsPath p ++ "(key " ++ serializeKey inputkey ++ ")")
Nothing -> Remote.Compute.computationBehaviorChangeError program
"requesting a new input file" p
| otherwise = getInputContent fast p
| otherwise = getInputContent False p
destfile outputfile
| Just outputfile == origfile = Just file
| otherwise = Nothing
origfile = headMaybe $ M.keys $ M.filter (== Just key)
origfile = headMaybe $ M.keys $ M.filter (== Just origkey)
(Remote.Compute.computeOutputs origstate)
origbackendvariety = fromKey keyVariety origkey
getreproducibleconfig = case reproducible o of
Just (Reproducible True) -> return (Just (Reproducible True))
-- A VURL key is used when the computation was
-- unreproducible. So recomputing should too, but that
-- will result in the same VURL key. Since moveAnnex
-- will prefer the current annex object to a new one,
-- delete the annex object first, so that if recomputing
-- generates a new version of the file, it replaces
-- the old version.
v -> case origbackendvariety of
VURLKey -> do
lockContentForRemoval origkey noop removeAnnex
-- in case computation fails or is interupted
logStatus NoLiveUpdate origkey InfoMissing
return (Just (Reproducible False))
_ -> return v
choosebackend _outputfile
-- Use the same backend as was used to compute it before,
-- so if the computed file is the same, there will be
-- no change.
| otherwise = maybeLookupBackendVariety origbackendvariety >>= \case
Just b -> return b
Nothing -> giveup $ unknownBackendVarietyMessage origbackendvariety

View file

@ -399,8 +399,8 @@ runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate)
liftIO $ hPutStrLn (stdinHandle p) $
maybe "" fromOsPath mp'
liftIO $ hFlush (stdinHandle p)
return $ if knowninput
then state'
return $ if immutablestate
then state
else state'
{ computeInputs =
M.insert f' k
@ -411,8 +411,8 @@ runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate)
checksafefile tmpdir subdir f' "output"
let knownoutput = M.member f' (computeOutputs state')
checkimmutable knownoutput "outputting" f' $
return $ if knownoutput
then state'
return $ if immutablestate
then state
else state'
{ computeOutputs =
M.insert f' Nothing

View file

@ -1,3 +1,20 @@
* VURL keys don't currently have the hash key recorded in the equivilant
key log by addcompute or when getting from a compute remote.
* need progress bars for computations and implement PROGRESS message
* get input files for a computation (so `git-annex get .` gets every file,
even when input files in a directory are processed after computed files)
* autoinit security
* Support non-annexed files as inputs to computations.
* addcomputed should honor annex.addunlocked.
* Perhaps recompute should write a new version of a file as an unlocked
file when the file is currently unlocked?
* recompute could ingest keys for other files than the one being
recomputed, and remember them. Then recomputing those files could just
use those keys, without re-running a computation. (Better than --others
@ -18,18 +35,6 @@
Or it could build a DAG and traverse it, but building a DAG of a large
directory tree has its own problems.
* recompute should use the same key backend for a file that it used before
(except when --reproducible/--unreproducible is passed).
* Check recompute's handling of --reproducible and --unreproducible.
* addcomputed should honor annex.addunlocked.
* Perhaps recompute should write a new version of a file as an unlocked
file when the file is currently unlocked?
* Support non-annexed files as inputs to computations.
* Should addcomputed honor annex.smallfiles? That would seem to imply
that recompute should also support recomputing non-annexed files.
Otherwise, adding a file and then recomputing it would vary in

View file

@ -82,6 +82,10 @@ the parameters provided to `git-annex addcomputed`.
checksum verification error. One thing that can be done then is to use
`git-annex recompute --original --unreproducible`.
* `--backend`
Specifies which key-value backend to use.
* Also the [[git-annex-common-options]](1) can be used.
# SEE ALSO

View file

@ -21,17 +21,7 @@ updated with the new content.
* `--original`
Use the original content of input files.
* `--unreproducible`, `-u`
Convert files that were added with `git-annex addcomputed --reproducible`
to be as if they were added without that option.
* `--reproducible`, `-r`
Convert files that were added with `git-annex addcomputed --unreproducible`
to be as if they were added with `--reproducible`.
Re-run the computation with the original input files.
* `--remote=name`
@ -42,6 +32,22 @@ updated with the new content.
a file can be computed by multiple remotes, the one with the lowest
configured cost will be used.
* `--unreproducible`, `-u`
Indicate that the computation is not expected to be fully reproducible.
It can vary, in ways that produce files that equivilant enough to
be interchangeable, but are not necessarily identical.
This is the default unless the compute remote indicates that it produces
reproducible output.
* `--reproducible`, `-r`
Indicate that the computation is expected to be fully reproducible.
This is the default when the compute remote indicates that it produces
reproducible output.
* matching options
The [[git-annex-matching-options]](1) can be used to control what