From a0d6a6ea2a486081f2f3c561b0eb4757a17e9d96 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 3 Mar 2025 11:59:04 -0400 Subject: [PATCH] support git files as input to computations Using GIT keys, like are used when exporting git files to special remotes. Except here the GIT key refers to a file checked into the git repo. Note that, since the compute remote uses catObject to get the content, a symlink that is checked into git does not get followed. This is important for security, because following a symlink and adding the content to the repo as an annex object would allow exfiltrating content from outside the repository. Instead, the behavior with a symlink is to run the computation on the symlink target. This may turn out to be confusing, and it might be worth addcomputed checking if the file in git is a symlink and erroring out. Or it could follow symlinks as long as the destination is a file in the repisitory. --- Command/AddComputed.hs | 42 ++++++++++++++++++++++------------ Git/Types.hs | 2 +- Remote/Compute.hs | 35 ++++++++++++++++++---------- TODO-compute | 4 ++-- doc/git-annex-addcomputed.mdwn | 4 ++-- 5 files changed, 56 insertions(+), 31 deletions(-) diff --git a/Command/AddComputed.hs b/Command/AddComputed.hs index 857e495ad0..b0127b10ba 100644 --- a/Command/AddComputed.hs +++ b/Command/AddComputed.hs @@ -11,6 +11,8 @@ module Command.AddComputed where import Command import qualified Git +import qualified Git.Types as Git +import qualified Git.Ref as Git import qualified Annex import qualified Remote.Compute import qualified Types.Remote as Remote @@ -18,6 +20,7 @@ import Backend import Annex.CatFile import Annex.Content.Presence import Annex.Ingest +import Annex.GitShaKey import Types.KeySource import Types.Key import Messages.Progress @@ -192,20 +195,31 @@ addComputed addaction stagefiles r reproducibleconfig choosebackend destfile fas Just v -> isReproducible v Nothing -> Remote.Compute.computeReproducible state -getInputContent :: Bool -> OsPath -> Annex (Key, Maybe OsPath) +getInputContent :: Bool -> OsPath -> Annex (Key, Maybe (Either Git.Sha OsPath)) getInputContent fast p = catKeyFile p >>= \case - Just inputkey -> getInputContent' fast inputkey (fromOsPath p) - Nothing -> ifM (liftIO $ doesFileExist p) - ( giveup $ "The computation needs an input file that is not an annexed file: " ++ fromOsPath p - , giveup $ "The computation needs an input file which does not exist: " ++ fromOsPath p - ) + Just inputkey -> getInputContent' fast inputkey filedesc + Nothing -> inRepo (Git.fileRef p) >>= \case + Just fileref -> catObjectMetaData fileref >>= \case + Just (sha, _, t) + | t == Git.BlobObject -> + getInputContent' fast (gitShaKey sha) filedesc + | otherwise -> + badinput $ ", not a git " ++ decodeBS (Git.fmtObjectType t) + Nothing -> notcheckedin + Nothing -> notcheckedin + where + filedesc = fromOsPath p + badinput s = giveup $ "The computation needs an input file " ++ s ++ ": " ++ fromOsPath p + notcheckedin = badinput "that is not checked into the git repository" -getInputContent' :: Bool -> Key -> String -> Annex (Key, Maybe OsPath) -getInputContent' fast inputkey filedesc = do - obj <- calcRepo (gitAnnexLocation inputkey) - if fast - then return (inputkey, Nothing) - else ifM (inAnnex inputkey) - ( return (inputkey, Just obj) - , giveup $ "The computation needs the content of a file which is not present: " ++ filedesc +getInputContent' :: Bool -> Key -> String -> Annex (Key, Maybe (Either Git.Sha OsPath)) +getInputContent' fast inputkey filedesc + | fast = return (inputkey, Nothing) + | otherwise = case keyGitSha inputkey of + Nothing -> ifM (inAnnex inputkey) + ( do + obj <- calcRepo (gitAnnexLocation inputkey) + return (inputkey, Just (Right obj)) + , giveup $ "The computation needs the content of an annexed file which is not present: " ++ filedesc ) + Just sha -> return (inputkey, Just (Left sha)) diff --git a/Git/Types.hs b/Git/Types.hs index a32d07d4f7..1ad145452b 100644 --- a/Git/Types.hs +++ b/Git/Types.hs @@ -145,7 +145,7 @@ newtype RefDate = RefDate String {- Types of objects that can be stored in git. -} data ObjectType = BlobObject | CommitObject | TreeObject - deriving (Show) + deriving (Show, Eq) readObjectType :: S.ByteString -> Maybe ObjectType readObjectType "blob" = Just BlobObject diff --git a/Remote/Compute.hs b/Remote/Compute.hs index eaef6d44fb..564ecbda70 100644 --- a/Remote/Compute.hs +++ b/Remote/Compute.hs @@ -35,6 +35,8 @@ import Annex.SpecialRemote.Config import Annex.UUID import Annex.Content import Annex.Tmp +import Annex.GitShaKey +import Annex.CatFile import Logs.MetaData import Logs.EquivilantKeys import Utility.Metered @@ -43,10 +45,11 @@ import Utility.Env import Utility.Tmp.Dir import Utility.Url import Utility.MonotonicClock -import qualified Git -import qualified Utility.SimpleProtocol as Proto import Types.Key import Backend +import qualified Git +import qualified Utility.FileIO as F +import qualified Utility.SimpleProtocol as Proto import Network.HTTP.Types.URI import Data.Time.Clock @@ -341,7 +344,7 @@ runComputeProgram :: ComputeProgram -> ComputeState -> ImmutableState - -> (OsPath -> Annex (Key, Maybe OsPath)) + -> (OsPath -> Annex (Key, Maybe (Either Git.Sha OsPath))) -> (ComputeState -> OsPath -> NominalDiffTime -> Annex v) -> Annex v runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate) getinputcontent cont = @@ -395,12 +398,17 @@ runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate) let knowninput = M.member f' (computeInputs state') checksafefile tmpdir subdir f' "input" checkimmutable knowninput "inputting" f' $ do - (k, mp) <- getinputcontent f' - mp' <- liftIO $ maybe (pure Nothing) - (Just <$$> relPathDirToFile subdir) - mp + (k, inputcontent) <- getinputcontent f' + mp <- case inputcontent of + Nothing -> pure Nothing + Just (Right f'') -> liftIO $ + Just <$> relPathDirToFile subdir f'' + Just (Left gitsha) -> do + liftIO . F.writeFile (subdir f') + =<< catObject gitsha + return (Just f') liftIO $ hPutStrLn (stdinHandle p) $ - maybe "" fromOsPath mp' + maybe "" fromOsPath mp liftIO $ hFlush (stdinHandle p) return $ if immutablestate then state @@ -467,10 +475,13 @@ computeKey rs (ComputeProgram program) k _af dest p vc = getinputcontent state f = case M.lookup (fromOsPath f) (computeInputs state) of - Just inputkey -> do - obj <- calcRepo (gitAnnexLocation inputkey) - -- XXX get input object when not present - return (inputkey, Just obj) + Just inputkey -> case keyGitSha inputkey of + Nothing -> do + obj <- calcRepo (gitAnnexLocation inputkey) + -- XXX get input object when not present + return (inputkey, Just (Right obj)) + Just gitsha -> + return (inputkey, Just (Left gitsha)) Nothing -> error "internal" computeskey state = diff --git a/TODO-compute b/TODO-compute index dfa629ab8b..b3f67016a7 100644 --- a/TODO-compute +++ b/TODO-compute @@ -5,13 +5,13 @@ * autoinit security -* Support non-annexed files as inputs to computations. - * addcomputed should honor annex.addunlocked. * Perhaps recompute should write a new version of a file as an unlocked file when the file is currently unlocked? +* compute on files in submodules + * recompute could ingest keys for other files than the one being recomputed, and remember them. Then recomputing those files could just use those keys, without re-running a computation. (Better than --others diff --git a/doc/git-annex-addcomputed.mdwn b/doc/git-annex-addcomputed.mdwn index 3301381c66..faff1d96b6 100644 --- a/doc/git-annex-addcomputed.mdwn +++ b/doc/git-annex-addcomputed.mdwn @@ -8,8 +8,8 @@ git annex addcomputed `--to=remote -- ...` # DESCRIPTION -Adds files to the annex that are computed from input files, -using a compute special remote. +Adds files to the annex that are computed from input files in the +repository, using a compute special remote. Once a file has been added to a compute remote, commands like `git-annex get` will use it to compute the content of the file.