support git files as input to computations

Using GIT keys, like are used when exporting git files to special
remotes. Except here the GIT key refers to a file checked into the git
repo.

Note that, since the compute remote uses catObject to get the content,
a symlink that is checked into git does not get followed. This is important
for security, because following a symlink and adding the content to the
repo as an annex object would allow exfiltrating content from outside
the repository.

Instead, the behavior with a symlink is to run the computation on the
symlink target. This may turn out to be confusing, and it might be worth
addcomputed checking if the file in git is a symlink and erroring out.
Or it could follow symlinks as long as the destination is a file in the
repisitory.
This commit is contained in:
Joey Hess 2025-03-03 11:59:04 -04:00
parent 6ebab7fb00
commit a0d6a6ea2a
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
5 changed files with 56 additions and 31 deletions

View file

@ -11,6 +11,8 @@ module Command.AddComputed where
import Command import Command
import qualified Git import qualified Git
import qualified Git.Types as Git
import qualified Git.Ref as Git
import qualified Annex import qualified Annex
import qualified Remote.Compute import qualified Remote.Compute
import qualified Types.Remote as Remote import qualified Types.Remote as Remote
@ -18,6 +20,7 @@ import Backend
import Annex.CatFile import Annex.CatFile
import Annex.Content.Presence import Annex.Content.Presence
import Annex.Ingest import Annex.Ingest
import Annex.GitShaKey
import Types.KeySource import Types.KeySource
import Types.Key import Types.Key
import Messages.Progress import Messages.Progress
@ -192,20 +195,31 @@ addComputed addaction stagefiles r reproducibleconfig choosebackend destfile fas
Just v -> isReproducible v Just v -> isReproducible v
Nothing -> Remote.Compute.computeReproducible state Nothing -> Remote.Compute.computeReproducible state
getInputContent :: Bool -> OsPath -> Annex (Key, Maybe OsPath) getInputContent :: Bool -> OsPath -> Annex (Key, Maybe (Either Git.Sha OsPath))
getInputContent fast p = catKeyFile p >>= \case getInputContent fast p = catKeyFile p >>= \case
Just inputkey -> getInputContent' fast inputkey (fromOsPath p) Just inputkey -> getInputContent' fast inputkey filedesc
Nothing -> ifM (liftIO $ doesFileExist p) Nothing -> inRepo (Git.fileRef p) >>= \case
( giveup $ "The computation needs an input file that is not an annexed file: " ++ fromOsPath p Just fileref -> catObjectMetaData fileref >>= \case
, giveup $ "The computation needs an input file which does not exist: " ++ fromOsPath p Just (sha, _, t)
) | t == Git.BlobObject ->
getInputContent' fast (gitShaKey sha) filedesc
| otherwise ->
badinput $ ", not a git " ++ decodeBS (Git.fmtObjectType t)
Nothing -> notcheckedin
Nothing -> notcheckedin
where
filedesc = fromOsPath p
badinput s = giveup $ "The computation needs an input file " ++ s ++ ": " ++ fromOsPath p
notcheckedin = badinput "that is not checked into the git repository"
getInputContent' :: Bool -> Key -> String -> Annex (Key, Maybe OsPath) getInputContent' :: Bool -> Key -> String -> Annex (Key, Maybe (Either Git.Sha OsPath))
getInputContent' fast inputkey filedesc = do getInputContent' fast inputkey filedesc
obj <- calcRepo (gitAnnexLocation inputkey) | fast = return (inputkey, Nothing)
if fast | otherwise = case keyGitSha inputkey of
then return (inputkey, Nothing) Nothing -> ifM (inAnnex inputkey)
else ifM (inAnnex inputkey) ( do
( return (inputkey, Just obj) obj <- calcRepo (gitAnnexLocation inputkey)
, giveup $ "The computation needs the content of a file which is not present: " ++ filedesc return (inputkey, Just (Right obj))
, giveup $ "The computation needs the content of an annexed file which is not present: " ++ filedesc
) )
Just sha -> return (inputkey, Just (Left sha))

View file

@ -145,7 +145,7 @@ newtype RefDate = RefDate String
{- Types of objects that can be stored in git. -} {- Types of objects that can be stored in git. -}
data ObjectType = BlobObject | CommitObject | TreeObject data ObjectType = BlobObject | CommitObject | TreeObject
deriving (Show) deriving (Show, Eq)
readObjectType :: S.ByteString -> Maybe ObjectType readObjectType :: S.ByteString -> Maybe ObjectType
readObjectType "blob" = Just BlobObject readObjectType "blob" = Just BlobObject

View file

@ -35,6 +35,8 @@ import Annex.SpecialRemote.Config
import Annex.UUID import Annex.UUID
import Annex.Content import Annex.Content
import Annex.Tmp import Annex.Tmp
import Annex.GitShaKey
import Annex.CatFile
import Logs.MetaData import Logs.MetaData
import Logs.EquivilantKeys import Logs.EquivilantKeys
import Utility.Metered import Utility.Metered
@ -43,10 +45,11 @@ import Utility.Env
import Utility.Tmp.Dir import Utility.Tmp.Dir
import Utility.Url import Utility.Url
import Utility.MonotonicClock import Utility.MonotonicClock
import qualified Git
import qualified Utility.SimpleProtocol as Proto
import Types.Key import Types.Key
import Backend import Backend
import qualified Git
import qualified Utility.FileIO as F
import qualified Utility.SimpleProtocol as Proto
import Network.HTTP.Types.URI import Network.HTTP.Types.URI
import Data.Time.Clock import Data.Time.Clock
@ -341,7 +344,7 @@ runComputeProgram
:: ComputeProgram :: ComputeProgram
-> ComputeState -> ComputeState
-> ImmutableState -> ImmutableState
-> (OsPath -> Annex (Key, Maybe OsPath)) -> (OsPath -> Annex (Key, Maybe (Either Git.Sha OsPath)))
-> (ComputeState -> OsPath -> NominalDiffTime -> Annex v) -> (ComputeState -> OsPath -> NominalDiffTime -> Annex v)
-> Annex v -> Annex v
runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate) getinputcontent cont = runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate) getinputcontent cont =
@ -395,12 +398,17 @@ runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate)
let knowninput = M.member f' (computeInputs state') let knowninput = M.member f' (computeInputs state')
checksafefile tmpdir subdir f' "input" checksafefile tmpdir subdir f' "input"
checkimmutable knowninput "inputting" f' $ do checkimmutable knowninput "inputting" f' $ do
(k, mp) <- getinputcontent f' (k, inputcontent) <- getinputcontent f'
mp' <- liftIO $ maybe (pure Nothing) mp <- case inputcontent of
(Just <$$> relPathDirToFile subdir) Nothing -> pure Nothing
mp Just (Right f'') -> liftIO $
Just <$> relPathDirToFile subdir f''
Just (Left gitsha) -> do
liftIO . F.writeFile (subdir </> f')
=<< catObject gitsha
return (Just f')
liftIO $ hPutStrLn (stdinHandle p) $ liftIO $ hPutStrLn (stdinHandle p) $
maybe "" fromOsPath mp' maybe "" fromOsPath mp
liftIO $ hFlush (stdinHandle p) liftIO $ hFlush (stdinHandle p)
return $ if immutablestate return $ if immutablestate
then state then state
@ -467,10 +475,13 @@ computeKey rs (ComputeProgram program) k _af dest p vc =
getinputcontent state f = getinputcontent state f =
case M.lookup (fromOsPath f) (computeInputs state) of case M.lookup (fromOsPath f) (computeInputs state) of
Just inputkey -> do Just inputkey -> case keyGitSha inputkey of
obj <- calcRepo (gitAnnexLocation inputkey) Nothing -> do
-- XXX get input object when not present obj <- calcRepo (gitAnnexLocation inputkey)
return (inputkey, Just obj) -- XXX get input object when not present
return (inputkey, Just (Right obj))
Just gitsha ->
return (inputkey, Just (Left gitsha))
Nothing -> error "internal" Nothing -> error "internal"
computeskey state = computeskey state =

View file

@ -5,13 +5,13 @@
* autoinit security * autoinit security
* Support non-annexed files as inputs to computations.
* addcomputed should honor annex.addunlocked. * addcomputed should honor annex.addunlocked.
* Perhaps recompute should write a new version of a file as an unlocked * Perhaps recompute should write a new version of a file as an unlocked
file when the file is currently unlocked? file when the file is currently unlocked?
* compute on files in submodules
* recompute could ingest keys for other files than the one being * recompute could ingest keys for other files than the one being
recomputed, and remember them. Then recomputing those files could just recomputed, and remember them. Then recomputing those files could just
use those keys, without re-running a computation. (Better than --others use those keys, without re-running a computation. (Better than --others

View file

@ -8,8 +8,8 @@ git annex addcomputed `--to=remote -- ...`
# DESCRIPTION # DESCRIPTION
Adds files to the annex that are computed from input files, Adds files to the annex that are computed from input files in the
using a compute special remote. repository, using a compute special remote.
Once a file has been added to a compute remote, commands Once a file has been added to a compute remote, commands
like `git-annex get` will use it to compute the content of the file. like `git-annex get` will use it to compute the content of the file.