diff --git a/CmdLine/GitAnnex.hs b/CmdLine/GitAnnex.hs index 71d9f2e51f..8dc64f8b7b 100644 --- a/CmdLine/GitAnnex.hs +++ b/CmdLine/GitAnnex.hs @@ -134,6 +134,7 @@ import qualified Command.UpdateProxy import qualified Command.MaxSize import qualified Command.Sim import qualified Command.AddComputed +import qualified Command.Recompute import qualified Command.Version import qualified Command.RemoteDaemon #ifdef WITH_ASSISTANT @@ -267,6 +268,7 @@ cmds testoptparser testrunner mkbenchmarkgenerator = map addGitAnnexCommonOption , Command.MaxSize.cmd , Command.Sim.cmd , Command.AddComputed.cmd + , Command.Recompute.cmd , Command.Version.cmd , Command.RemoteDaemon.cmd #ifdef WITH_ASSISTANT diff --git a/Command/AddComputed.hs b/Command/AddComputed.hs index fad9c1dc30..9ff13f1f70 100644 --- a/Command/AddComputed.hs +++ b/Command/AddComputed.hs @@ -17,7 +17,6 @@ import qualified Types.Remote as Remote import Annex.CatFile import Annex.Content.Presence import Annex.Ingest -import Types.RemoteConfig import Types.KeySource import Messages.Progress import Logs.Location @@ -68,23 +67,20 @@ seek o = startConcurrency commandStages (seek' o) seek' :: AddComputedOptions -> CommandSeek seek' o = do r <- getParsed (computeRemote o) - unless (Remote.typename (Remote.remotetype r) == Remote.typename Remote.Compute.remote) $ + unless (Remote.Compute.isComputeRemote r) $ giveup "That is not a compute remote." - let rc = unparsedRemoteConfig (Remote.config r) - case Remote.Compute.getComputeProgram rc of - Left err -> giveup $ - "Problem with the configuration of the compute remote: " ++ err - Right program -> commandAction $ start o r program + commandAction $ start o r -start :: AddComputedOptions -> Remote -> Remote.Compute.ComputeProgram -> CommandStart -start o r program = starting "addcomputed" ai si $ perform o r program +start :: AddComputedOptions -> Remote -> CommandStart +start o r = starting "addcomputed" ai si $ perform o r where ai = ActionItemUUID (Remote.uuid r) (UnquotedString (Remote.name r)) si = SeekInput (computeParams o) -perform :: AddComputedOptions -> Remote -> Remote.Compute.ComputeProgram -> CommandPerform -perform o r program = do +perform :: AddComputedOptions -> Remote -> CommandPerform +perform o r = do + program <- Remote.Compute.getComputeProgram r repopath <- fromRepo Git.repoPath subdir <- liftIO $ relPathDirToFile repopath (literalOsPath ".") let state = Remote.Compute.ComputeState @@ -100,24 +96,10 @@ perform o r program = do showOutput Remote.Compute.runComputeProgram program state (Remote.Compute.ImmutableState False) - (getinputcontent fast) + (getInputContent fast) (go starttime fast) next $ return True where - getinputcontent fast p = catKeyFile p >>= \case - Just inputkey -> do - obj <- calcRepo (gitAnnexLocation inputkey) - if fast - then return (inputkey, Nothing) - else ifM (inAnnex inputkey) - ( return (inputkey, Just obj) - , giveup $ "The computation needs the content of a file which is not present: " ++ fromOsPath p - ) - Nothing -> ifM (liftIO $ doesFileExist p) - ( giveup $ "The computation needs an input file that is not an annexed file: " ++ fromOsPath p - , giveup $ "The computation needs an input file which does not exist: " ++ fromOsPath p - ) - go starttime fast state tmpdir = do endtime <- liftIO currentMonotonicTimestamp let ts = calcduration starttime endtime @@ -175,3 +157,18 @@ perform o r program = do isreproducible state = case reproducible o of Just v -> isReproducible v Nothing -> Remote.Compute.computeReproducible state + +getInputContent :: Bool -> OsPath -> Annex (Key, Maybe OsPath) +getInputContent fast p = catKeyFile p >>= \case + Just inputkey -> do + obj <- calcRepo (gitAnnexLocation inputkey) + if fast + then return (inputkey, Nothing) + else ifM (inAnnex inputkey) + ( return (inputkey, Just obj) + , giveup $ "The computation needs the content of a file which is not present: " ++ fromOsPath p + ) + Nothing -> ifM (liftIO $ doesFileExist p) + ( giveup $ "The computation needs an input file that is not an annexed file: " ++ fromOsPath p + , giveup $ "The computation needs an input file which does not exist: " ++ fromOsPath p + ) diff --git a/Command/Recompute.hs b/Command/Recompute.hs new file mode 100644 index 0000000000..95f8f3e16f --- /dev/null +++ b/Command/Recompute.hs @@ -0,0 +1,202 @@ +{- git-annex command + - + - Copyright 2025 Joey Hess + - + - Licensed under the GNU AGPL version 3 or higher. + -} + +{-# LANGUAGE OverloadedStrings #-} + +module Command.Recompute where + +import Command +import qualified Git +import qualified Annex +import qualified Remote.Compute +import qualified Remote +import qualified Types.Remote as Remote +import Annex.CatFile +import Annex.Content.Presence +import Annex.Ingest +import Git.FilePath +import Types.RemoteConfig +import Types.KeySource +import Messages.Progress +import Logs.Location +import Utility.Metered +import Utility.MonotonicClock +import Backend.URL (fromUrl) +import Command.AddComputed (Reproducible(..), parseReproducible, getInputContent) + +import qualified Data.Map as M +import Data.Time.Clock + +cmd :: Command +cmd = notBareRepo $ + command "recompute" SectionCommon "recompute computed files" + paramPaths (seek <$$> optParser) + +data RecomputeOptions = RecomputeOptions + { recomputeThese :: CmdParams + , originalOption :: Bool + , othersOption :: Bool + , reproducible :: Maybe Reproducible + , computeRemote :: Maybe (DeferredParse Remote) + } + +optParser :: CmdParamsDesc -> Parser RecomputeOptions +optParser desc = RecomputeOptions + <$> cmdParams desc + <*> switch + ( long "original" + <> help "recompute using original content of input files" + ) + <*> switch + ( long "others" + <> help "stage other files that are recomputed in passing" + ) + <*> parseReproducible + <*> optional (mkParseRemoteOption <$> parseRemoteOption) + +seek :: RecomputeOptions -> CommandSeek +seek o = startConcurrency commandStages (seek' o) + +seek' :: RecomputeOptions -> CommandSeek +seek' o = do + computeremote <- maybe (pure Nothing) (Just <$$> getParsed) + (computeRemote o) + let seeker = AnnexedFileSeeker + { startAction = const $ start o computeremote + , checkContentPresent = Nothing + , usesLocationLog = True + } + withFilesInGitAnnex ww seeker + =<< workTreeItems ww (recomputeThese o) + where + ww = WarnUnmatchLsFiles "recompute" + +start :: RecomputeOptions -> Maybe Remote -> SeekInput -> OsPath -> Key -> CommandStart +start o (Just computeremote) si file key = + stopUnless (notElem (Remote.uuid computeremote) <$> loggedLocations key) $ + start' o computeremote si file key +start o Nothing si file key = do + rs <- catMaybes <$> (mapM Remote.byUUID =<< loggedLocations key) + case sortOn Remote.cost $ filter Remote.Compute.isComputeRemote rs of + [] -> stop + (r:_) -> start' o r si file key + +start' :: RecomputeOptions -> Remote -> SeekInput -> OsPath -> Key -> CommandStart +start' o r si file key = + Remote.Compute.getComputeState + (Remote.remoteStateHandle r) key >>= \case + Nothing -> stop + Just state -> + stopUnless (shouldrecompute state) $ + starting "recompute" ai si $ + perform o r file key state + where + ai = mkActionItem (key, file) + + shouldrecompute state + | originalOption o = return True + | otherwise = + anyM (inputchanged state) $ + M.toList (Remote.Compute.computeInputs state) + + inputchanged state (inputfile, inputkey) = do + -- Note that the paths from the remote state are not to be + -- trusted to point to a file in the repository, but using + -- the path with catKeyFile will only succeed if it + -- is checked into the repository. + p <- fromRepo $ fromTopFilePath $ asTopFilePath $ + Remote.Compute.computeSubdir state inputfile + catKeyFile p >>= return . \case + Just k -> k /= inputkey + -- When an input file is missing, go ahead and + -- recompute. This way, the user will see the + -- computation fail, with an error message that + -- explains the problem. + -- XXX check that this works well + Nothing -> True + +perform :: RecomputeOptions -> Remote -> OsPath -> Key -> Remote.Compute.ComputeState -> CommandPerform +perform o r file key oldstate = do + program <- Remote.Compute.getComputeProgram r + let recomputestate = oldstate + { Remote.Compute.computeInputs = mempty + , Remote.Compute.computeOutputs = mempty + } + fast <- Annex.getRead Annex.fast + starttime <- liftIO currentMonotonicTimestamp + showOutput + Remote.Compute.runComputeProgram program recomputestate + (Remote.Compute.ImmutableState False) + (getinputcontent program fast) + (go starttime fast) + next $ return True + where + getinputcontent program fast p + | originalOption o = + case M.lookup p (Remote.Compute.computeInputs oldstate) of + Just inputkey -> return (inputkey, Nothing) + Nothing -> Remote.Compute.computationBehaviorChangeError program + "requesting a new input file" p + | otherwise = getInputContent fast p + + go starttime fast state tmpdir = do + endtime <- liftIO currentMonotonicTimestamp + let ts = calcduration starttime endtime + let outputs = Remote.Compute.computeOutputs state + when (M.null outputs) $ + giveup "The computation succeeded, but it did not generate any files." + oks <- forM (M.keys outputs) $ \outputfile -> do + showAction $ "adding " <> QuotedPath outputfile + k <- catchNonAsync (addfile fast state tmpdir outputfile) + (\err -> giveup $ "Failed to ingest output file " ++ fromOsPath outputfile ++ ": " ++ show err) + return (outputfile, Just k) + let state' = state + { Remote.Compute.computeOutputs = M.fromList oks + } + forM_ (mapMaybe snd oks) $ \k -> do + Remote.Compute.setComputeState + (Remote.remoteStateHandle r) + k ts state' + logChange NoLiveUpdate k (Remote.uuid r) InfoPresent + + addfile fast state tmpdir outputfile + | fast = do + addSymlink outputfile stateurlk Nothing + return stateurlk + | isreproducible state = do + sz <- liftIO $ getFileSize outputfile' + metered Nothing sz Nothing $ \_ p -> + ingestwith $ ingestAdd p (Just ld) + | otherwise = ingestwith $ + ingestAdd' nullMeterUpdate (Just ld) (Just stateurlk) + where + stateurl = Remote.Compute.computeStateUrl r state outputfile + stateurlk = fromUrl stateurl Nothing True + outputfile' = tmpdir outputfile + ld = LockedDown ldc $ KeySource + { keyFilename = outputfile + , contentLocation = outputfile' + , inodeCache = Nothing + } + ingestwith a = a >>= \case + Nothing -> giveup "key generation failed" + Just k -> do + logStatus NoLiveUpdate k InfoPresent + return k + + ldc = LockDownConfig + { lockingFile = True + , hardlinkFileTmpDir = Nothing + , checkWritePerms = True + } + + calcduration (MonotonicTimestamp starttime) (MonotonicTimestamp endtime) = + fromIntegral (endtime - starttime) :: NominalDiffTime + + isreproducible state = case reproducible o of + Just v -> isReproducible v + Nothing -> Remote.Compute.computeReproducible state diff --git a/Remote/Compute.hs b/Remote/Compute.hs index 09ab45687a..b412fc4df6 100644 --- a/Remote/Compute.hs +++ b/Remote/Compute.hs @@ -9,14 +9,16 @@ module Remote.Compute ( remote, + isComputeRemote, ComputeState(..), setComputeState, - getComputeStates, + getComputeState, computeStateUrl, ComputeProgram, getComputeProgram, runComputeProgram, ImmutableState(..), + computationBehaviorChangeError, defaultComputeParams, ) where @@ -63,8 +65,11 @@ remote = RemoteType , thirdPartyPopulated = False } +isComputeRemote :: Remote -> Bool +isComputeRemote r = typename (remotetype r) == typename remote + gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> RemoteStateHandle -> Annex (Maybe Remote) -gen r u rc gc rs = case getComputeProgram rc of +gen r u rc gc rs = case getComputeProgram' rc of Left _err -> return Nothing Right program -> do c <- parsedRemoteConfig remote rc @@ -107,7 +112,7 @@ gen r u rc gc rs = case getComputeProgram rc of setupInstance :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> Annex (RemoteConfig, UUID) setupInstance _ mu _ c _ = do - ComputeProgram program <- either giveup return (getComputeProgram c) + ComputeProgram program <- either giveup return $ getComputeProgram' c unlessM (liftIO $ inSearchPath program) $ giveup $ "Cannot find " ++ program ++ " in PATH" u <- maybe (liftIO genUUID) return mu @@ -136,8 +141,15 @@ defaultComputeParams = map mk . M.toList . getRemoteConfigPassedThrough . config newtype ComputeProgram = ComputeProgram String deriving (Show) -getComputeProgram :: RemoteConfig -> Either String ComputeProgram -getComputeProgram c = case fromProposedAccepted <$> M.lookup programField c of +getComputeProgram :: Remote -> Annex ComputeProgram +getComputeProgram r = + case getComputeProgram' (unparsedRemoteConfig (config r)) of + Right program -> return program + Left err -> giveup $ + "Problem with the configuration of compute remote " ++ name r ++ ": " ++ err + +getComputeProgram' :: RemoteConfig -> Either String ComputeProgram +getComputeProgram' c = case fromProposedAccepted <$> M.lookup programField c of Just program | safetyPrefix `isPrefixOf` program -> Right (ComputeProgram program) @@ -285,8 +297,15 @@ setComputeState rs k ts st = addRemoteMetaData k rs $ MetaData $ M.singleton (mkMetaFieldUnchecked $ T.pack ('t':show (truncateResolution 1 ts))) (S.singleton (MetaValue (CurrentlySet True) (formatComputeState k st))) -getComputeStates :: RemoteStateHandle -> Key -> Annex [(NominalDiffTime, ComputeState)] -getComputeStates rs k = do +{- When multiple ComputeStates have been recorded for the same key, + - this returns one that is probably less expensive to compute, + - based on the original time it took to compute it. -} +getComputeState:: RemoteStateHandle -> Key -> Annex (Maybe ComputeState) +getComputeState rs k = headMaybe . map snd . sortOn fst + <$> getComputeStatesUnsorted rs k + +getComputeStatesUnsorted :: RemoteStateHandle -> Key -> Annex [(NominalDiffTime, ComputeState)] +getComputeStatesUnsorted rs k = do RemoteMetaData _ (MetaData m) <- getCurrentRemoteMetaData rs k return $ go [] (M.toList m) where @@ -369,7 +388,7 @@ runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate) let f' = toOsPath f let knowninput = M.member f' (computeInputs state') checksafefile tmpdir subdir f' "input" - checkimmutable knowninput l $ do + checkimmutable knowninput "inputting" f' $ do (k, mp) <- getinputcontent f' mp' <- liftIO $ maybe (pure Nothing) (Just <$$> relPathDirToFile subdir) @@ -388,7 +407,7 @@ runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate) let f' = toOsPath f checksafefile tmpdir subdir f' "output" let knownoutput = M.member f' (computeOutputs state') - checkimmutable knownoutput l $ + checkimmutable knownoutput "outputting" f' $ return $ if knownoutput then state' else state' @@ -412,25 +431,31 @@ runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate) when (any (\p -> dropTrailingPathSeparator p == literalOsPath ".git") (splitPath f)) $ err "inside the .git directory" - checkimmutable True _ a = a - checkimmutable False l a + checkimmutable True _ _ a = a + checkimmutable False requestdesc p a | not immutablestate = a - | otherwise = giveup $ - program ++ " is not behaving the same way it used to, now outputting: \"" ++ l ++ "\"" + | otherwise = computationBehaviorChangeError (ComputeProgram program) requestdesc p + +computationBehaviorChangeError :: ComputeProgram -> String -> OsPath -> Annex a +computationBehaviorChangeError (ComputeProgram program) requestdesc p = + giveup $ program ++ " is not behaving the same way it used to, now " ++ requestdesc ++ ": " ++ fromOsPath p computeKey :: RemoteStateHandle -> ComputeProgram -> Key -> AssociatedFile -> OsPath -> MeterUpdate -> VerifyConfig -> Annex Verification -computeKey rs (ComputeProgram program) k af dest p vc = do - states <- map snd . sortOn fst -- least expensive probably - <$> getComputeStates rs k - case mapMaybe computeskey states of - ((keyfile, state):_) -> runComputeProgram - (ComputeProgram program) - state - (ImmutableState True) - (getinputcontent state) - (go keyfile) - [] -> giveup "Missing compute state" +computeKey rs (ComputeProgram program) k af dest p vc = + getComputeState rs k >>= \case + Just state -> + case computeskey state of + Just keyfile -> runComputeProgram + (ComputeProgram program) + state + (ImmutableState True) + (getinputcontent state) + (go keyfile) + Nothing -> missingstate + Nothing -> missingstate where + missingstate = giveup "Missing compute state" + getinputcontent state f = case M.lookup (fromOsPath f) (computeInputs state) of Just inputkey -> do @@ -441,7 +466,7 @@ computeKey rs (ComputeProgram program) k af dest p vc = do computeskey state = case M.keys $ M.filter (== Just k) (computeOutputs state) of - (keyfile : _) -> Just (keyfile, state) + (keyfile : _) -> Just keyfile [] -> Nothing go keyfile state tmpdir = do @@ -470,7 +495,7 @@ computeKey rs (ComputeProgram program) k af dest p vc = do -- Make sure that the compute state exists. checkKey :: RemoteStateHandle -> Key -> Annex Bool checkKey rs k = do - states <- getComputeStates rs k + states <- getComputeStatesUnsorted rs k if null states then giveup "Missing compute state" else return True diff --git a/doc/git-annex-addcomputed.mdwn b/doc/git-annex-addcomputed.mdwn index 245d4a04b0..58261da181 100644 --- a/doc/git-annex-addcomputed.mdwn +++ b/doc/git-annex-addcomputed.mdwn @@ -78,9 +78,9 @@ the parameters provided to `git-annex addcomputed`. reproducible output (except when using `--fast`). If a computation turns out not to be fully reproducible, then getting - the file from the compute remote will later fail with a checksum - verification error. One thing that can be done then is to use - `git-annex recompute --unreproducible`. + a computed file from the compute remote will later fail with a + checksum verification error. One thing that can be done then is to use + `git-annex recompute --original --unreproducible`. * Also the [[git-annex-common-options]](1) can be used. diff --git a/doc/git-annex-recompute.mdwn b/doc/git-annex-recompute.mdwn index 2800a74106..6e1a32f0d9 100644 --- a/doc/git-annex-recompute.mdwn +++ b/doc/git-annex-recompute.mdwn @@ -1,6 +1,6 @@ # NAME -git-annex recompute - update computed files +git-annex recompute - recompute computed files # SYNOPSIS @@ -9,18 +9,24 @@ git-annex recompute [path ...]` # DESCRIPTION This updates computed files that were added with -[[git-annex-addcomputed]](1). +[[git-annex-addcomputed]](1). + +When the output of the computation is different, the updated computed +file is staged in the repository. By default, this only recomputes files whose input files have changed. -The new contents of the input files are used to re-run the computation, -and when the output is different, the updated computed file is staged -in the repository. +The new contents of the input files are used to re-run the computation. # OPTIONS -* `--unchanged` +* `--original` - Recompute files even when their input files have not changed. + Use the original content of input files. + +* `--others` + + When recomputing one file also generates new versions of other files, + stage those other files in the repository too. * `--unreproducible`, `-u` @@ -32,14 +38,20 @@ in the repository. Convert files that were added with `git-annex addcomputed --unreproducible` to be as if they were added with `--reproducible`. +* `--remote=name` + + Only recompute files that were computed by this compute remote. + + When this option is not used, all computed files are recomputed using + whatever compute remote was originally used to add them. In cases where + a file can be computed by multiple remotes, the one with the lowest + configured cost will be used. + * matching options The [[git-annex-matching-options]](1) can be used to control what files to recompute. - For example, to only recompute files that are computed by the "photoconv" - compute remote, use `--in=photoconv` - * Also the [[git-annex-common-options]](1) can be used. # SEE ALSO diff --git a/git-annex.cabal b/git-annex.cabal index 5ed414a8dd..88203be956 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -728,6 +728,7 @@ Executable git-annex Command.Proxy Command.Pull Command.Push + Command.Recompute Command.ReKey Command.ReadPresentKey Command.RecvKey