git-annex/Command/Recompute.hs
Joey Hess a673fc7cfd
recompute: stage new version of file in git
When writing doc/tips/computing_annexed_files.mdwn, I noticed
that a recompute --reproducible followed by a drop and a re-get did not
actually test if the file could be reproducible computed again.

Turns out that get and drop both operate on staged files. If there is an
unstaged modification in the work tree, that's ignored. Somewhat
surprisingly, other commands like info do operate on staged files. So
behavior is inconsistent, and fairly surprising really, when there are
unstaged modifications to files.

Probably this is rarely noticed because `git-annex add` is used to add a
new version of a file, and then it's staged. Or `git mv` is used to move
a file, rather than `mv` of a file over top of an existing file. So it's
uncommon to have an unstaged annexed file in a worktree.

It might be worth making things more consistent, but that's out of scope
for what I'm working on currently.

Also, I anticipate that supporting unlocked files with recompute will
require it to stage changes anyway.

So, make recompute stage the new version of the file.

I considered having recompute refuse to overwrite an existing staged
file. After all, whatever version was staged before will get lost when
the new version is staged over top of it. But, that's no different than
`git-annex addcomputed` being run with the name of an existing staged
file. Or `git-annex add` being run with a new file content when there is
an existing staged file. Or, for that matter, `git add` being ran with a
new content when there is an existing staged file.
2025-03-12 13:42:00 -04:00

213 lines
7.1 KiB
Haskell

{- git-annex command
-
- Copyright 2025 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE OverloadedStrings #-}
module Command.Recompute where
import Command
import qualified Remote.Compute
import qualified Remote
import qualified Types.Remote as Remote
import qualified Git.Ref as Git
import Annex.Content
import Annex.UUID
import Annex.CatFile
import Annex.GitShaKey
import Git.FilePath
import Logs.Location
import Command.AddComputed (Reproducible(..), parseReproducible, getInputContent, getInputContent', addComputed)
import Backend (maybeLookupBackendVariety, unknownBackendVarietyMessage, chooseBackend)
import Types.Key
import qualified Data.Map as M
cmd :: Command
cmd = notBareRepo $
command "recompute" SectionCommon "recompute computed files"
paramPaths (seek <$$> optParser)
data RecomputeOptions = RecomputeOptions
{ recomputeThese :: CmdParams
, originalOption :: Bool
, reproducible :: Maybe Reproducible
, computeRemote :: Maybe (DeferredParse Remote)
}
optParser :: CmdParamsDesc -> Parser RecomputeOptions
optParser desc = RecomputeOptions
<$> cmdParams desc
<*> switch
( long "original"
<> help "recompute using original content of input files"
)
<*> parseReproducible
<*> optional (mkParseRemoteOption <$> parseRemoteOption)
seek :: RecomputeOptions -> CommandSeek
seek o = startConcurrency commandStages (seek' o)
seek' :: RecomputeOptions -> CommandSeek
seek' o = do
computeremote <- maybe (pure Nothing) (Just <$$> getParsed)
(computeRemote o)
let seeker = AnnexedFileSeeker
{ startAction = const $ start o computeremote
, checkContentPresent = Nothing
, usesLocationLog = True
}
withFilesInGitAnnex ww seeker
=<< workTreeItems ww (recomputeThese o)
where
ww = WarnUnmatchLsFiles "recompute"
start :: RecomputeOptions -> Maybe Remote -> SeekInput -> OsPath -> Key -> CommandStart
start o (Just computeremote) si file key =
stopUnless (elem (Remote.uuid computeremote) <$> loggedLocations key) $
start' o computeremote si file key
start o Nothing si file key = do
rs <- catMaybes <$> (mapM Remote.byUUID =<< loggedLocations key)
case sortOn Remote.cost $ filter Remote.Compute.isComputeRemote rs of
[] -> stop
(r:_) -> start' o r si file key
start' :: RecomputeOptions -> Remote -> SeekInput -> OsPath -> Key -> CommandStart
start' o r si file key =
Remote.Compute.getComputeState
(Remote.remoteStateHandle r) key >>= \case
Nothing -> stop
Just state -> shouldrecompute state >>= \case
Nothing -> stop
Just mreason -> starting "recompute" ai si $ do
maybe noop showNote mreason
perform o r file key state
where
ai = mkActionItem (key, file)
shouldrecompute state
| originalOption o = return (Just Nothing)
| otherwise = firstM (inputchanged state)
(M.toList (Remote.Compute.computeInputs state))
>>= return . \case
Nothing -> Nothing
Just (inputfile, _) -> Just $ Just $
QuotedPath inputfile <> " changed"
inputchanged state (inputfile, inputkey) = do
-- Note that the paths from the remote state are not to be
-- trusted to point to a file in the repository, but using
-- the path with git cat-file will only succeed if it
-- is checked into the repository.
p <- fromRepo $ fromTopFilePath $ asTopFilePath $
Remote.Compute.computeSubdir state </> inputfile
case keyGitSha inputkey of
Nothing ->
catKeyFile p >>= return . \case
Just k -> k /= inputkey
Nothing -> inputfilemissing
Just inputgitsha -> inRepo (Git.fileRef p) >>= \case
Just fileref -> catObjectMetaData fileref >>= return . \case
Just (sha, _, _) -> sha /= inputgitsha
Nothing -> inputfilemissing
Nothing -> return inputfilemissing
where
-- When an input file is missing, go ahead and recompute.
-- This way, the user will see the computation fail,
-- with an error message that explains the problem.
-- Or, if the input file is only optionally used by the
-- computation, it might succeed.
inputfilemissing = True
perform :: RecomputeOptions -> Remote -> OsPath -> Key -> Remote.Compute.ComputeState -> CommandPerform
perform o r file origkey origstate = do
program <- Remote.Compute.getComputeProgram r
reproducibleconfig <- getreproducibleconfig
showOutput
Remote.Compute.runComputeProgram program origstate
(Remote.Compute.ImmutableState False)
(getinputcontent program)
Nothing
(go program reproducibleconfig)
next cleanup
where
go program reproducibleconfig result tmpdir ts = do
checkbehaviorchange program
(Remote.Compute.computeState result)
addComputed Nothing r reproducibleconfig
choosebackend destfile False result tmpdir ts
checkbehaviorchange program state = do
let check s w a b = forM_ (M.keys (w a)) $ \f ->
unless (M.member f (w b)) $
Remote.Compute.computationBehaviorChangeError program s f
check "not using input file"
Remote.Compute.computeInputs origstate state
check "outputting"
Remote.Compute.computeOutputs state origstate
check "not outputting"
Remote.Compute.computeOutputs origstate state
getinputcontent program p required
| originalOption o =
case M.lookup p (Remote.Compute.computeInputs origstate) of
Just inputkey -> getInputContent' False inputkey required
(fromOsPath p ++ "(key " ++ serializeKey inputkey ++ ")")
Nothing -> Remote.Compute.computationBehaviorChangeError program
"requesting a new input file" p
| otherwise = getInputContent False p required
destfile outputfile
| Just outputfile == origfile = Just file
| otherwise = Nothing
origfile = headMaybe $ M.keys $ M.filter (== Just origkey)
(Remote.Compute.computeOutputs origstate)
origbackendvariety = fromKey keyVariety origkey
recomputingvurl = case origbackendvariety of
VURLKey -> True
_ -> False
getreproducibleconfig = case reproducible o of
Just (Reproducible True) -> return (Just (Reproducible True))
-- A VURL key is used when the computation was
-- unreproducible. So recomputing should too, but that
-- will result in the same VURL key. Since moveAnnex
-- will prefer the current annex object to a new one,
-- delete the annex object first, so that if recomputing
-- generates a new version of the file, it replaces
-- the old version.
v -> if recomputingvurl
then do
lockContentForRemoval origkey noop removeAnnex
return (Just (Reproducible False))
else return v
cleanup = do
case reproducible o of
Just (Reproducible True) -> noop
-- in case computation failed, update
-- location log for removal done earlier
_ -> when recomputingvurl $ do
u <- getUUID
unlessM (elem u <$> loggedLocations origkey) $
logStatus NoLiveUpdate origkey InfoMissing
return True
choosebackend outputfile
-- When converting a VURL to reproducible, can't use
-- the VURL backend.
| recomputingvurl && reproducible o == Just (Reproducible True) =
chooseBackend outputfile
-- Use the same backend as was used to compute it before,
-- so if the computed file is the same, there will be
-- no change.
| otherwise = maybeLookupBackendVariety origbackendvariety >>= \case
Just b -> return b
Nothing -> giveup $ unknownBackendVarietyMessage origbackendvariety