Merge branch 'compute'

This commit is contained in:
Joey Hess 2025-03-06 14:23:58 -04:00
commit 6f78341fbf
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
47 changed files with 1771 additions and 161 deletions

View file

@ -308,16 +308,12 @@ updateAdjustedBranch adj (AdjBranch currbranch) origbranch
-- adjustment is stable.
return True
{- Passed an action that, if it succeeds may get or drop the Key associated
- with the file. When the adjusted branch needs to be refreshed to reflect
{- Passed an action that, if it succeeds may get or drop a key.
- When the adjusted branch needs to be refreshed to reflect
- those changes, it's handled here.
-
- Note that the AssociatedFile must be verified by this to point to the
- Key. In some cases, the value was provided by the user and might not
- really be an associated file.
-}
adjustedBranchRefresh :: AssociatedFile -> Annex a -> Annex a
adjustedBranchRefresh _af a = do
adjustedBranchRefresh :: Annex a -> Annex a
adjustedBranchRefresh a = do
r <- a
go
return r

View file

@ -376,16 +376,16 @@ lockContentUsing contentlocker key fallback a = withContentLockFile key $ \mlock
{- Runs an action, passing it the temp file to get,
- and if the action succeeds, verifies the file matches
- the key and moves the file into the annex as a key's content. -}
getViaTmp :: RetrievalSecurityPolicy -> VerifyConfig -> Key -> AssociatedFile -> Maybe FileSize -> (OsPath -> Annex (Bool, Verification)) -> Annex Bool
getViaTmp rsp v key af sz action =
getViaTmp :: RetrievalSecurityPolicy -> VerifyConfig -> Key -> Maybe FileSize -> (OsPath -> Annex (Bool, Verification)) -> Annex Bool
getViaTmp rsp v key sz action =
checkDiskSpaceToGet key sz False $
getViaTmpFromDisk rsp v key af action
getViaTmpFromDisk rsp v key action
{- Like getViaTmp, but does not check that there is enough disk space
- for the incoming key. For use when the key content is already on disk
- and not being copied into place. -}
getViaTmpFromDisk :: RetrievalSecurityPolicy -> VerifyConfig -> Key -> AssociatedFile -> (OsPath -> Annex (Bool, Verification)) -> Annex Bool
getViaTmpFromDisk rsp v key af action = checkallowed $ do
getViaTmpFromDisk :: RetrievalSecurityPolicy -> VerifyConfig -> Key -> (OsPath -> Annex (Bool, Verification)) -> Annex Bool
getViaTmpFromDisk rsp v key action = checkallowed $ do
tmpfile <- prepTmp key
resuming <- liftIO $ doesPathExist tmpfile
(ok, verification) <- action tmpfile
@ -400,7 +400,7 @@ getViaTmpFromDisk rsp v key af action = checkallowed $ do
else verification
if ok
then ifM (verifyKeyContentPostRetrieval rsp v verification' key tmpfile)
( pruneTmpWorkDirBefore tmpfile (moveAnnex key af)
( pruneTmpWorkDirBefore tmpfile (moveAnnex key)
, do
verificationOfContentFailed tmpfile
return False
@ -507,8 +507,8 @@ withTmp key action = do
- accepted into the repository. Will display a warning message in this
- case. May also throw exceptions in some cases.
-}
moveAnnex :: Key -> AssociatedFile -> OsPath -> Annex Bool
moveAnnex key af src = ifM (checkSecureHashes' key)
moveAnnex :: Key -> OsPath -> Annex Bool
moveAnnex key src = ifM (checkSecureHashes' key)
( do
#ifdef mingw32_HOST_OS
{- Windows prevents deletion of files that are not
@ -523,7 +523,7 @@ moveAnnex key af src = ifM (checkSecureHashes' key)
where
storeobject dest = ifM (liftIO $ doesPathExist dest)
( alreadyhave
, adjustedBranchRefresh af $ modifyContentDir dest $ do
, adjustedBranchRefresh $ modifyContentDir dest $ do
liftIO $ moveFile src dest
-- Freeze the object file now that it is in place.
-- Waiting until now to freeze it allows for freeze
@ -776,7 +776,7 @@ removeAnnex (ContentRemovalLock key) = withObjectLoc key $ \file ->
-- it's unmodified.
resetpointer file = unlessM (liftIO $ isSymbolicLink <$> R.getSymbolicLinkStatus (fromOsPath file)) $
ifM (isUnmodified key file)
( adjustedBranchRefresh (AssociatedFile (Just file)) $
( adjustedBranchRefresh $
depopulatePointerFile key file
-- Modified file, so leave it alone.
-- If it was a hard link to the annex object,

View file

@ -11,16 +11,13 @@ module Annex.Export where
import Annex
import Annex.CatFile
import Annex.GitShaKey
import Types
import Types.Key
import qualified Git
import qualified Types.Remote as Remote
import Git.Quote
import Messages
import Data.Maybe
import qualified Data.ByteString.Short as S (fromShort, toShort)
-- From a sha pointing to the content of a file to the key
-- to use to export it. When the file is annexed, it's the annexed key.
-- When the file is stored in git, it's a special type of key to indicate
@ -31,31 +28,6 @@ exportKey sha = mk <$> catKey sha
mk (Just k) = k
mk Nothing = gitShaKey sha
-- Encodes a git sha as a key. This is used to represent a non-annexed
-- file that is stored on a special remote, which necessarily needs a
-- key.
--
-- This is not the same as a SHA1 key, because the mapping needs to be
-- bijective, also because git may not always use SHA1, and because git
-- takes a SHA1 of the file size + content, while git-annex SHA1 keys
-- only checksum the content.
gitShaKey :: Git.Sha -> Key
gitShaKey (Git.Ref s) = mkKey $ \kd -> kd
{ keyName = S.toShort s
, keyVariety = OtherKey "GIT"
}
-- Reverse of gitShaKey
keyGitSha :: Key -> Maybe Git.Sha
keyGitSha k
| fromKey keyVariety k == OtherKey "GIT" =
Just (Git.Ref (S.fromShort (fromKey keyName k)))
| otherwise = Nothing
-- Is a key storing a git sha, and not used for an annexed file?
isGitShaKey :: Key -> Bool
isGitShaKey = isJust . keyGitSha
warnExportImportConflict :: Remote -> Annex ()
warnExportImportConflict r = do
isimport <- Remote.isImportSupported r

41
Annex/GitShaKey.hs Normal file
View file

@ -0,0 +1,41 @@
{- Encoding a git sha as a Key
-
- Copyright 2017-2025 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE OverloadedStrings #-}
module Annex.GitShaKey where
import Types
import Types.Key
import qualified Git
import Data.Maybe
import qualified Data.ByteString.Short as S (fromShort, toShort)
-- Encodes a git sha as a Key. This is used to represent a non-annexed
-- file. For example, when storing a git sha on a special remote.
--
-- This is not the same as a SHA1 key, because the mapping needs to be
-- bijective, also because git may not always use SHA1, and because git
-- takes a SHA1 of the file size + content, while git-annex SHA1 keys
-- only checksum the content.
gitShaKey :: Git.Sha -> Key
gitShaKey (Git.Ref s) = mkKey $ \kd -> kd
{ keyName = S.toShort s
, keyVariety = OtherKey "GIT"
}
-- Reverse of gitShaKey
keyGitSha :: Key -> Maybe Git.Sha
keyGitSha k
| fromKey keyVariety k == OtherKey "GIT" =
Just (Git.Ref (S.fromShort (fromKey keyName k)))
| otherwise = Nothing
-- Is a key storing a git sha, and not used for an annexed file?
isGitShaKey :: Key -> Bool
isGitShaKey = isJust . keyGitSha

View file

@ -38,12 +38,12 @@ import qualified Annex
import Annex.Link
import Annex.LockFile
import Annex.Content
import Annex.Export
import Annex.RemoteTrackingBranch
import Annex.HashObject
import Annex.Transfer
import Annex.CheckIgnore
import Annex.CatFile
import Annex.GitShaKey
import Annex.VectorClock
import Annex.SpecialRemote.Config
import Command
@ -863,7 +863,7 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
ia loc [cid] tmpfile
(Left k)
(combineMeterUpdate p' p)
ok <- moveAnnex k af tmpfile
ok <- moveAnnex k tmpfile
when ok $
logStatus NoLiveUpdate k InfoPresent
return (Just (k, ok))
@ -906,7 +906,7 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
p
case keyGitSha k of
Nothing -> do
ok <- moveAnnex k af tmpfile
ok <- moveAnnex k tmpfile
when ok $ do
recordcidkey cidmap cid k
logStatus NoLiveUpdate k InfoPresent

View file

@ -198,17 +198,11 @@ ingest' preferredbackend meterupdate (Just (LockedDown cfg source)) mk restage =
| otherwise = gounlocked key mcache
golocked key mcache =
tryNonAsync (moveAnnex key naf (contentLocation source)) >>= \case
tryNonAsync (moveAnnex key (contentLocation source)) >>= \case
Right True -> success key mcache
Right False -> giveup "failed to add content to annex"
Left e -> restoreFile (keyFilename source) key e
-- moveAnnex uses the AssociatedFile provided to it to unlock
-- locked files when getting a file in an adjusted branch.
-- That case does not apply here, where we're adding an unlocked
-- file, so provide it nothing.
naf = AssociatedFile Nothing
gounlocked key (Just cache) = do
-- Remove temp directory hard link first because
-- linkToAnnex falls back to copying if a file
@ -377,7 +371,7 @@ addAnnexedFile matcher file key mtmp = ifM (addUnlocked matcher mi (isJust mtmp)
stagePointerFile file mode =<< hashPointerFile key
Database.Keys.addAssociatedFile key =<< inRepo (toTopFilePath file)
case mtmp of
Just tmp -> ifM (moveAnnex key af tmp)
Just tmp -> ifM (moveAnnex key tmp)
( linkunlocked mode >> return True
, writepointer mode >> return False
)
@ -388,11 +382,10 @@ addAnnexedFile matcher file key mtmp = ifM (addUnlocked matcher mi (isJust mtmp)
, do
addSymlink file key Nothing
case mtmp of
Just tmp -> moveAnnex key af tmp
Just tmp -> moveAnnex key tmp
Nothing -> return True
)
where
af = AssociatedFile (Just file)
mi = case mtmp of
Just tmp -> MatchingFile $ FileInfo
{ contentFile = tmp

View file

@ -78,7 +78,7 @@ download r key f d witness =
Just StallDetectionDisabled -> go Nothing
Just sd -> runTransferrer sd r key f d Download witness
where
go sd = getViaTmp (Remote.retrievalSecurityPolicy r) vc key f Nothing $ \dest ->
go sd = getViaTmp (Remote.retrievalSecurityPolicy r) vc key Nothing $ \dest ->
download' (Remote.uuid r) key f sd d (go' dest) witness
go' dest p = verifiedAction $
Remote.retrieveKeyFile r key f dest p vc

View file

@ -10,13 +10,14 @@
module Backend (
builtinList,
defaultBackend,
defaultHashBackend,
hashBackend,
genKey,
getBackend,
chooseBackend,
lookupBackendVariety,
lookupBuiltinBackendVariety,
maybeLookupBackendVariety,
unknownBackendVarietyMessage,
isStableKey,
isCryptographicallySecureKey,
isCryptographicallySecure,
@ -54,6 +55,15 @@ defaultBackend = maybe cache return =<< Annex.getState Annex.backend
valid name = not (null name)
lookupname = lookupBackendVariety . parseKeyVariety . encodeBS
{- A hashing backend. Takes git config into account, but
- guarantees the backend is cryptographically secure. -}
hashBackend :: Annex Backend
hashBackend = do
db <- defaultBackend
return $ if isCryptographicallySecure db
then db
else defaultHashBackend
{- Generates a key for a file. -}
genKey :: KeySource -> MeterUpdate -> Backend -> Annex (Key, Backend)
genKey source meterupdate b = case B.genKey b of

View file

@ -10,10 +10,8 @@ module Backend.VURL.Utilities where
import Annex.Common
import Types.Key
import Types.Backend
import Types.KeySource
import Logs.EquivilantKeys
import qualified Backend.Hash
import Utility.Metered
migrateFromURLToVURL :: Key -> Backend -> AssociatedFile -> Bool -> Annex (Maybe Key)
migrateFromURLToVURL oldkey newbackend _af inannex
@ -41,18 +39,3 @@ migrateFromVURLToURL oldkey newbackend _af _
(keyData oldkey)
{ keyVariety = URLKey }
| otherwise = return Nothing
-- The Backend must use a cryptographically secure hash.
generateEquivilantKey :: Backend -> OsPath -> Annex (Maybe Key)
generateEquivilantKey b f =
case genKey b of
Just genkey -> do
showSideAction (UnquotedString Backend.Hash.descChecksum)
Just <$> genkey source nullMeterUpdate
Nothing -> return Nothing
where
source = KeySource
{ keyFilename = mempty -- avoid adding any extension
, contentLocation = f
, inodeCache = Nothing
}

View file

@ -1,5 +1,9 @@
git-annex (10.20250116) UNRELEASED; urgency=medium
* Added the compute special remote.
* addcomputed: New command, adds a file that is generated by a compute
special remote.
* recompute: New command, recomputes computed files.
* Support help.autocorrect settings "prompt", "never", and "immediate".
* Allow setting remote.foo.annex-tracking-branch to a branch name
that contains "/", as long as it's not a remote tracking branch.

View file

@ -133,6 +133,8 @@ import qualified Command.ExtendCluster
import qualified Command.UpdateProxy
import qualified Command.MaxSize
import qualified Command.Sim
import qualified Command.AddComputed
import qualified Command.Recompute
import qualified Command.Version
import qualified Command.RemoteDaemon
#ifdef WITH_ASSISTANT
@ -265,6 +267,8 @@ cmds testoptparser testrunner mkbenchmarkgenerator = map addGitAnnexCommonOption
, Command.UpdateProxy.cmd
, Command.MaxSize.cmd
, Command.Sim.cmd
, Command.AddComputed.cmd
, Command.Recompute.cmd
, Command.Version.cmd
, Command.RemoteDaemon.cmd
#ifdef WITH_ASSISTANT

View file

@ -927,7 +927,7 @@ downloadGitBundle rmt k = getKeyExportLocations rmt k >>= \case
getexport loc = catchNonAsync (getexport' loc) (const (pure False))
getexport' loc =
getViaTmp rsp vc k (AssociatedFile Nothing) Nothing $ \tmp -> do
getViaTmp rsp vc k Nothing $ \tmp -> do
v <- Remote.retrieveExport (Remote.exportActions rmt)
k loc tmp nullMeterUpdate
return (True, v)
@ -986,7 +986,7 @@ generateGitBundle rmt bs manifest =
tmp nullMeterUpdate
if (bundlekey `notElem` inManifest manifest)
then do
unlessM (moveAnnex bundlekey (AssociatedFile Nothing) tmp) $
unlessM (moveAnnex bundlekey tmp) $
giveup "Unable to push"
return (bundlekey, uploadaction bundlekey)
else return (bundlekey, noop)

236
Command/AddComputed.hs Normal file
View file

@ -0,0 +1,236 @@
{- git-annex command
-
- Copyright 2025 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE OverloadedStrings #-}
module Command.AddComputed where
import Command
import qualified Git
import qualified Git.Types as Git
import qualified Git.Ref as Git
import qualified Annex
import qualified Remote.Compute
import qualified Types.Remote as Remote
import Backend
import Annex.CatFile
import Annex.Content.Presence
import Annex.Ingest
import Annex.UUID
import Annex.GitShaKey
import Types.KeySource
import Types.Key
import Messages.Progress
import Logs.Location
import Logs.EquivilantKeys
import Utility.Metered
import Backend.URL (fromUrl)
import qualified Data.Map as M
import Data.Time.Clock
cmd :: Command
cmd = notBareRepo $ withAnnexOptions [backendOption] $
command "addcomputed" SectionCommon "add computed files to annex"
(paramRepeating paramExpression)
(seek <$$> optParser)
data AddComputedOptions = AddComputedOptions
{ computeParams :: CmdParams
, computeRemote :: DeferredParse Remote
, reproducible :: Maybe Reproducible
}
optParser :: CmdParamsDesc -> Parser AddComputedOptions
optParser desc = AddComputedOptions
<$> cmdParams desc
<*> (mkParseRemoteOption <$> parseToOption)
<*> parseReproducible
newtype Reproducible = Reproducible { isReproducible :: Bool }
parseReproducible :: Parser (Maybe Reproducible)
parseReproducible = r <|> unr
where
r = flag Nothing (Just (Reproducible True))
( long "reproducible"
<> short 'r'
<> help "computation is fully reproducible"
)
unr = flag Nothing (Just (Reproducible False))
( long "unreproducible"
<> short 'u'
<> help "computation is not fully reproducible"
)
seek :: AddComputedOptions -> CommandSeek
seek o = startConcurrency commandStages (seek' o)
seek' :: AddComputedOptions -> CommandSeek
seek' o = do
r <- getParsed (computeRemote o)
unless (Remote.Compute.isComputeRemote r) $
giveup "That is not a compute remote."
commandAction $ start o r
start :: AddComputedOptions -> Remote -> CommandStart
start o r = starting "addcomputed" ai si $ perform o r
where
ai = ActionItemUUID (Remote.uuid r) (UnquotedString (Remote.name r))
si = SeekInput (computeParams o)
perform :: AddComputedOptions -> Remote -> CommandPerform
perform o r = do
program <- Remote.Compute.getComputeProgram r
repopath <- fromRepo Git.repoPath
subdir <- liftIO $ relPathDirToFile repopath (literalOsPath ".")
let state = Remote.Compute.ComputeState
{ Remote.Compute.computeParams = computeParams o ++
Remote.Compute.defaultComputeParams r
, Remote.Compute.computeInputs = mempty
, Remote.Compute.computeOutputs = mempty
, Remote.Compute.computeSubdir = subdir
}
fast <- Annex.getRead Annex.fast
Remote.Compute.runComputeProgram program state
(Remote.Compute.ImmutableState False)
(getInputContent fast)
Nothing
(addComputed (Just "adding") True r (reproducible o) chooseBackend Just fast)
next $ return True
addComputed
:: Maybe StringContainingQuotedPath
-> Bool
-> Remote
-> Maybe Reproducible
-> (OsPath -> Annex Backend)
-> (OsPath -> Maybe OsPath)
-> Bool
-> Remote.Compute.ComputeProgramResult
-> OsPath
-> NominalDiffTime
-> Annex ()
addComputed maddaction stagefiles r reproducibleconfig choosebackend destfile fast result tmpdir ts = do
when (M.null outputs) $
giveup "The computation succeeded, but it did not generate any files."
oks <- forM (M.keys outputs) $ \outputfile -> do
case maddaction of
Just addaction -> showAction $
addaction <> " " <> QuotedPath outputfile
Nothing -> noop
k <- catchNonAsync (addfile outputfile)
(\err -> giveup $ "Failed to ingest output file " ++ fromOsPath outputfile ++ ": " ++ show err)
return (outputfile, Just k)
let state' = state
{ Remote.Compute.computeOutputs = M.fromList oks
}
forM_ (mapMaybe snd oks) $ \k -> do
Remote.Compute.setComputeState
(Remote.remoteStateHandle r)
k ts state'
let u = Remote.uuid r
unlessM (elem u <$> loggedLocations k) $
logChange NoLiveUpdate k u InfoPresent
where
state = Remote.Compute.computeState result
outputs = Remote.Compute.computeOutputs state
addfile outputfile
| fast = do
case destfile outputfile of
Nothing -> noop
Just f
| stagefiles -> addSymlink f stateurlk Nothing
| otherwise -> makelink f stateurlk
return stateurlk
| isreproducible = do
sz <- liftIO $ getFileSize outputfile'
metered Nothing sz Nothing $ \_ p ->
case destfile outputfile of
Just f -> ingesthelper f p Nothing
Nothing -> genkey outputfile p
| otherwise = case destfile outputfile of
Just f -> ingesthelper f nullMeterUpdate
(Just stateurlk)
Nothing -> return stateurlk
where
stateurl = Remote.Compute.computeStateUrl r state outputfile
stateurlk = fromUrl stateurl Nothing True
outputfile' = tmpdir </> outputfile
ld f = LockedDown ldc (ks f)
ks f = KeySource
{ keyFilename = f
, contentLocation = outputfile'
, inodeCache = Nothing
}
genkey f p = do
backend <- choosebackend outputfile
fst <$> genKey (ks f) p backend
makelink f k = void $ makeLink f k Nothing
ingesthelper f p mk
| stagefiles = ingestwith $ do
k <- maybe (genkey f p) return mk
ingestAdd' p (Just (ld f)) (Just k)
| otherwise = ingestwith $ do
k <- maybe (genkey f p) return mk
mk' <- fst <$> ingest p (Just (ld f)) (Just k)
maybe noop (makelink f) mk'
return mk'
ingestwith a = a >>= \case
Nothing -> giveup "ingestion failed"
Just k -> do
u <- getUUID
unlessM (elem u <$> loggedLocations k) $
logStatus NoLiveUpdate k InfoPresent
when (fromKey keyVariety k == VURLKey) $ do
hb <- hashBackend
void $ addEquivilantKey hb k
=<< calcRepo (gitAnnexLocation k)
return k
ldc = LockDownConfig
{ lockingFile = True
, hardlinkFileTmpDir = Nothing
, checkWritePerms = True
}
isreproducible = case reproducibleconfig of
Just v -> isReproducible v
Nothing -> Remote.Compute.computeReproducible result
getInputContent :: Bool -> OsPath -> Annex (Key, Maybe (Either Git.Sha OsPath))
getInputContent fast p = catKeyFile p >>= \case
Just inputkey -> getInputContent' fast inputkey filedesc
Nothing -> inRepo (Git.fileRef p) >>= \case
Just fileref -> catObjectMetaData fileref >>= \case
Just (sha, _, t)
| t == Git.BlobObject ->
getInputContent' fast (gitShaKey sha) filedesc
| otherwise ->
badinput $ ", not a git " ++ decodeBS (Git.fmtObjectType t)
Nothing -> notcheckedin
Nothing -> notcheckedin
where
filedesc = fromOsPath p
badinput s = giveup $ "The computation needs an input file " ++ s ++ ": " ++ fromOsPath p
notcheckedin = badinput "that is not checked into the git repository"
getInputContent' :: Bool -> Key -> String -> Annex (Key, Maybe (Either Git.Sha OsPath))
getInputContent' fast inputkey filedesc
| fast = return (inputkey, Nothing)
| otherwise = case keyGitSha inputkey of
Nothing -> ifM (inAnnex inputkey)
( do
obj <- calcRepo (gitAnnexLocation inputkey)
return (inputkey, Just (Right obj))
, giveup $ "The computation needs the content of an annexed file which is not present: " ++ filedesc
)
Just sha -> return (inputkey, Just (Left sha))

View file

@ -26,6 +26,7 @@ import Types.Remote
import Types.Export
import Annex.Export
import Annex.Content
import Annex.GitShaKey
import Annex.Transfer
import Annex.CatFile
import Annex.FileMatcher

View file

@ -213,7 +213,7 @@ storeReceived f = do
warning $ "Received a file " <> QuotedPath f <> " that is not a git-annex key. Deleting this file."
liftIO $ removeWhenExistsWith removeFile f
Just k -> void $ logStatusAfter NoLiveUpdate k $
getViaTmpFromDisk RetrievalVerifiableKeysSecure AlwaysVerify k (AssociatedFile Nothing) $ \dest -> unVerified $
getViaTmpFromDisk RetrievalVerifiableKeysSecure AlwaysVerify k $ \dest -> unVerified $
liftIO $ catchBoolIO $ do
renameFile f dest
return True

View file

@ -128,7 +128,7 @@ linkKey file oldkey newkey = ifM (isJust <$> isAnnexLink file)
- and vulnerable to corruption. -}
linkKey' :: VerifyConfig -> Key -> Key -> Annex Bool
linkKey' v oldkey newkey =
getViaTmpFromDisk RetrievalAllKeysSecure v newkey (AssociatedFile Nothing) $ \tmp -> unVerified $ do
getViaTmpFromDisk RetrievalAllKeysSecure v newkey $ \tmp -> unVerified $ do
oldobj <- calcRepo (gitAnnexLocation oldkey)
isJust <$> linkOrCopy' (return True) newkey oldobj tmp Nothing

209
Command/Recompute.hs Normal file
View file

@ -0,0 +1,209 @@
{- git-annex command
-
- Copyright 2025 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE OverloadedStrings #-}
module Command.Recompute where
import Command
import qualified Remote.Compute
import qualified Remote
import qualified Types.Remote as Remote
import qualified Git.Ref as Git
import Annex.Content
import Annex.UUID
import Annex.CatFile
import Annex.GitShaKey
import Git.FilePath
import Logs.Location
import Command.AddComputed (Reproducible(..), parseReproducible, getInputContent, getInputContent', addComputed)
import Backend (maybeLookupBackendVariety, unknownBackendVarietyMessage)
import Types.Key
import qualified Data.Map as M
cmd :: Command
cmd = notBareRepo $
command "recompute" SectionCommon "recompute computed files"
paramPaths (seek <$$> optParser)
data RecomputeOptions = RecomputeOptions
{ recomputeThese :: CmdParams
, originalOption :: Bool
, reproducible :: Maybe Reproducible
, computeRemote :: Maybe (DeferredParse Remote)
}
optParser :: CmdParamsDesc -> Parser RecomputeOptions
optParser desc = RecomputeOptions
<$> cmdParams desc
<*> switch
( long "original"
<> help "recompute using original content of input files"
)
<*> parseReproducible
<*> optional (mkParseRemoteOption <$> parseRemoteOption)
seek :: RecomputeOptions -> CommandSeek
seek o = startConcurrency commandStages (seek' o)
seek' :: RecomputeOptions -> CommandSeek
seek' o = do
computeremote <- maybe (pure Nothing) (Just <$$> getParsed)
(computeRemote o)
let seeker = AnnexedFileSeeker
{ startAction = const $ start o computeremote
, checkContentPresent = Nothing
, usesLocationLog = True
}
withFilesInGitAnnex ww seeker
=<< workTreeItems ww (recomputeThese o)
where
ww = WarnUnmatchLsFiles "recompute"
start :: RecomputeOptions -> Maybe Remote -> SeekInput -> OsPath -> Key -> CommandStart
start o (Just computeremote) si file key =
stopUnless (elem (Remote.uuid computeremote) <$> loggedLocations key) $
start' o computeremote si file key
start o Nothing si file key = do
rs <- catMaybes <$> (mapM Remote.byUUID =<< loggedLocations key)
case sortOn Remote.cost $ filter Remote.Compute.isComputeRemote rs of
[] -> stop
(r:_) -> start' o r si file key
start' :: RecomputeOptions -> Remote -> SeekInput -> OsPath -> Key -> CommandStart
start' o r si file key =
Remote.Compute.getComputeState
(Remote.remoteStateHandle r) key >>= \case
Nothing -> stop
Just state -> shouldrecompute state >>= \case
Nothing -> stop
Just mreason -> starting "recompute" ai si $ do
maybe noop showNote mreason
perform o r file key state
where
ai = mkActionItem (key, file)
shouldrecompute state
| originalOption o = return (Just Nothing)
| otherwise = firstM (inputchanged state)
(M.toList (Remote.Compute.computeInputs state))
>>= return . \case
Nothing -> Nothing
Just (inputfile, _) -> Just $ Just $
QuotedPath inputfile <> " changed"
inputchanged state (inputfile, inputkey) = do
-- Note that the paths from the remote state are not to be
-- trusted to point to a file in the repository, but using
-- the path with git cat-file will only succeed if it
-- is checked into the repository.
p <- fromRepo $ fromTopFilePath $ asTopFilePath $
Remote.Compute.computeSubdir state </> inputfile
case keyGitSha inputkey of
Nothing ->
catKeyFile p >>= return . \case
Just k -> k /= inputkey
Nothing -> inputfilemissing
Just inputgitsha -> inRepo (Git.fileRef p) >>= \case
Just fileref -> catObjectMetaData fileref >>= return . \case
Just (sha, _, _) -> sha /= inputgitsha
Nothing -> inputfilemissing
Nothing -> return inputfilemissing
where
-- When an input file is missing, go ahead and recompute.
-- This way, the user will see the computation fail,
-- with an error message that explains the problem.
-- Or, if the input file is only optionally used by the
-- computation, it might succeed.
inputfilemissing = True
perform :: RecomputeOptions -> Remote -> OsPath -> Key -> Remote.Compute.ComputeState -> CommandPerform
perform o r file origkey origstate = do
program <- Remote.Compute.getComputeProgram r
reproducibleconfig <- getreproducibleconfig
showOutput
Remote.Compute.runComputeProgram program origstate
(Remote.Compute.ImmutableState False)
(getinputcontent program)
Nothing
(go program reproducibleconfig)
next cleanup
where
go program reproducibleconfig result tmpdir ts = do
checkbehaviorchange program
(Remote.Compute.computeState result)
addComputed Nothing False r reproducibleconfig
choosebackend destfile False result tmpdir ts
checkbehaviorchange program state = do
let check s w a b = forM_ (M.keys (w a)) $ \f ->
unless (M.member f (w b)) $
Remote.Compute.computationBehaviorChangeError program s f
check "not using input file"
Remote.Compute.computeInputs origstate state
check "outputting"
Remote.Compute.computeOutputs state origstate
check "not outputting"
Remote.Compute.computeOutputs origstate state
getinputcontent program p
| originalOption o =
case M.lookup p (Remote.Compute.computeInputs origstate) of
Just inputkey -> getInputContent' False inputkey
(fromOsPath p ++ "(key " ++ serializeKey inputkey ++ ")")
Nothing -> Remote.Compute.computationBehaviorChangeError program
"requesting a new input file" p
| otherwise = getInputContent False p
destfile outputfile
| Just outputfile == origfile = Just file
| otherwise = Nothing
origfile = headMaybe $ M.keys $ M.filter (== Just origkey)
(Remote.Compute.computeOutputs origstate)
origbackendvariety = fromKey keyVariety origkey
recomputingvurl = case origbackendvariety of
VURLKey -> True
_ -> False
getreproducibleconfig = case reproducible o of
Just (Reproducible True) -> return (Just (Reproducible True))
-- A VURL key is used when the computation was
-- unreproducible. So recomputing should too, but that
-- will result in the same VURL key. Since moveAnnex
-- will prefer the current annex object to a new one,
-- delete the annex object first, so that if recomputing
-- generates a new version of the file, it replaces
-- the old version.
v -> if recomputingvurl
then do
lockContentForRemoval origkey noop removeAnnex
return (Just (Reproducible False))
else return v
cleanup = do
case reproducible o of
Just (Reproducible True) -> noop
-- in case computation failed, update
-- location log for removal done earlier
_ -> when recomputingvurl $ do
u <- getUUID
unlessM (elem u <$> loggedLocations origkey) $
logStatus NoLiveUpdate origkey InfoMissing
return True
choosebackend _outputfile
-- Use the same backend as was used to compute it before,
-- so if the computed file is the same, there will be
-- no change.
| otherwise = maybeLookupBackendVariety origbackendvariety >>= \case
Just b -> return b
Nothing -> giveup $ unknownBackendVarietyMessage origbackendvariety

View file

@ -28,7 +28,7 @@ start :: (SeekInput, Key) -> CommandStart
start (_, key) = fieldTransfer Download key $ \_p -> do
-- This matches the retrievalSecurityPolicy of Remote.Git
let rsp = RetrievalAllKeysSecure
ifM (getViaTmp rsp DefaultVerify key (AssociatedFile Nothing) Nothing go)
ifM (getViaTmp rsp DefaultVerify key Nothing go)
( do
logStatus NoLiveUpdate key InfoPresent
_ <- quiesce True

View file

@ -129,7 +129,7 @@ perform src key = do
)
where
move = checkDiskSpaceToGet key Nothing False $
moveAnnex key (AssociatedFile Nothing) src
moveAnnex key src
cleanup :: Key -> CommandCleanup
cleanup key = do

View file

@ -36,7 +36,7 @@ perform file key = do
-- the file might be on a different filesystem, so moveFile is used
-- rather than simply calling moveAnnex; disk space is also
-- checked this way.
ok <- getViaTmp RetrievalAllKeysSecure DefaultVerify key (AssociatedFile Nothing) Nothing $ \dest -> unVerified $
ok <- getViaTmp RetrievalAllKeysSecure DefaultVerify key Nothing $ \dest -> unVerified $
if dest /= file
then liftIO $ catchBoolIO $ do
moveFile file dest

View file

@ -301,7 +301,7 @@ test runannex mkr mkk =
Just verifier -> do
loc <- Annex.calcRepo (gitAnnexLocation k)
verifier k loc
get r k = logStatusAfter NoLiveUpdate k $ getViaTmp (Remote.retrievalSecurityPolicy r) (RemoteVerify r) k (AssociatedFile Nothing) Nothing $ \dest ->
get r k = logStatusAfter NoLiveUpdate k $ getViaTmp (Remote.retrievalSecurityPolicy r) (RemoteVerify r) k Nothing $ \dest ->
tryNonAsync (Remote.retrieveKeyFile r k (AssociatedFile Nothing) dest nullMeterUpdate (RemoteVerify r)) >>= \case
Right v -> return (True, v)
Left _ -> return (False, UnVerified)
@ -375,13 +375,13 @@ testUnavailable runannex mkr mkk =
, check (`notElem` [Right True, Right False]) "checkPresent" $ \r k ->
Remote.checkPresent r k
, check (== Right False) "retrieveKeyFile" $ \r k ->
logStatusAfter NoLiveUpdate k $ getViaTmp (Remote.retrievalSecurityPolicy r) (RemoteVerify r) k (AssociatedFile Nothing) Nothing $ \dest ->
logStatusAfter NoLiveUpdate k $ getViaTmp (Remote.retrievalSecurityPolicy r) (RemoteVerify r) k Nothing $ \dest ->
tryNonAsync (Remote.retrieveKeyFile r k (AssociatedFile Nothing) dest nullMeterUpdate (RemoteVerify r)) >>= \case
Right v -> return (True, v)
Left _ -> return (False, UnVerified)
, check (== Right False) "retrieveKeyFileCheap" $ \r k -> case Remote.retrieveKeyFileCheap r of
Nothing -> return False
Just a -> logStatusAfter NoLiveUpdate k $ getViaTmp (Remote.retrievalSecurityPolicy r) (RemoteVerify r) k (AssociatedFile Nothing) Nothing $ \dest ->
Just a -> logStatusAfter NoLiveUpdate k $ getViaTmp (Remote.retrievalSecurityPolicy r) (RemoteVerify r) k Nothing $ \dest ->
unVerified $ isRight
<$> tryNonAsync (a k (AssociatedFile Nothing) dest)
]
@ -443,7 +443,7 @@ randKey sz = withTmpFile (literalOsPath "randkey") $ \f h -> do
k <- case Types.Backend.genKey Backend.Hash.testKeyBackend of
Just a -> a ks nullMeterUpdate
Nothing -> giveup "failed to generate random key (backend problem)"
_ <- moveAnnex k (AssociatedFile Nothing) f
_ <- moveAnnex k f
return k
getReadonlyKey :: Remote -> OsPath -> Annex Key

View file

@ -63,7 +63,7 @@ toPerform key af remote = go Upload af $
fromPerform :: Key -> AssociatedFile -> Remote -> CommandPerform
fromPerform key af remote = go Upload af $
download' (uuid remote) key af Nothing stdRetry $ \p ->
logStatusAfter NoLiveUpdate key $ getViaTmp (retrievalSecurityPolicy remote) vc key af Nothing $ \t ->
logStatusAfter NoLiveUpdate key $ getViaTmp (retrievalSecurityPolicy remote) vc key Nothing $ \t ->
tryNonAsync (Remote.retrieveKeyFile remote key af t p vc) >>= \case
Right v -> return (True, v)
Left e -> do

View file

@ -50,7 +50,7 @@ start = do
return True
| otherwise = notifyTransfer direction af $
download' (Remote.uuid remote) key af Nothing stdRetry $ \p ->
logStatusAfter NoLiveUpdate key $ getViaTmp (Remote.retrievalSecurityPolicy remote) (RemoteVerify remote) key af Nothing $ \t -> do
logStatusAfter NoLiveUpdate key $ getViaTmp (Remote.retrievalSecurityPolicy remote) (RemoteVerify remote) key Nothing $ \t -> do
r <- tryNonAsync (Remote.retrieveKeyFile remote key af t p (RemoteVerify remote)) >>= \case
Left e -> do
warning (UnquotedString (show e))

View file

@ -55,7 +55,7 @@ start = do
-- so caller is responsible for doing notification
-- and for retrying, and updating location log,
-- and stall canceling.
let go p = getViaTmp (Remote.retrievalSecurityPolicy remote) (RemoteVerify remote) key af Nothing $ \t -> do
let go p = getViaTmp (Remote.retrievalSecurityPolicy remote) (RemoteVerify remote) key Nothing $ \t -> do
Remote.verifiedAction (Remote.retrieveKeyFile remote key af t p (RemoteVerify remote))
in download' (Remote.uuid remote) key af Nothing noRetry go
noNotification
@ -72,7 +72,7 @@ start = do
runner (AssistantDownloadRequest _ key (TransferAssociatedFile file)) remote =
notifyTransfer Download file $
download' (Remote.uuid remote) key file Nothing stdRetry $ \p ->
logStatusAfter NoLiveUpdate key $ getViaTmp (Remote.retrievalSecurityPolicy remote) (RemoteVerify remote) key file Nothing $ \t -> do
logStatusAfter NoLiveUpdate key $ getViaTmp (Remote.retrievalSecurityPolicy remote) (RemoteVerify remote) key Nothing $ \t -> do
r <- tryNonAsync (Remote.retrieveKeyFile remote key file t p (RemoteVerify remote)) >>= \case
Left e -> do
warning (UnquotedString (show e))

View file

@ -145,7 +145,7 @@ newtype RefDate = RefDate String
{- Types of objects that can be stored in git. -}
data ObjectType = BlobObject | CommitObject | TreeObject
deriving (Show)
deriving (Show, Eq)
readObjectType :: S.ByteString -> Maybe ObjectType
readObjectType "blob" = Just BlobObject

View file

@ -1,6 +1,6 @@
{- Logs listing keys that are equivalent to a key.
-
- Copyright 2024 Joey Hess <id@joeyh.name>
- Copyright 2024-2025 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -10,6 +10,9 @@
module Logs.EquivilantKeys (
getEquivilantKeys,
setEquivilantKey,
updateEquivilantKeys,
addEquivilantKey,
generateEquivilantKey,
) where
import Annex.Common
@ -17,6 +20,11 @@ import qualified Annex
import Logs
import Logs.Presence
import qualified Annex.Branch
import qualified Backend.Hash
import Types.KeySource
import Types.Backend
import Types.Remote (Verification(..))
import Utility.Metered
getEquivilantKeys :: Key -> Annex [Key]
getEquivilantKeys key = do
@ -29,3 +37,34 @@ setEquivilantKey key equivkey = do
config <- Annex.getGitConfig
addLog (Annex.Branch.RegardingUUID []) (equivilantKeysLogFile config key)
InfoPresent (LogInfo (serializeKey' equivkey))
-- This returns Verified when when an equivilant key has been added to the
-- log (or was already in the log). This is to avoid hashing the object
-- again later.
updateEquivilantKeys :: Backend -> OsPath -> Key -> [Key] -> Annex (Maybe Verification)
updateEquivilantKeys b obj key eks = generateEquivilantKey b obj >>= \case
Nothing -> return Nothing
Just ek -> do
unless (ek `elem` eks) $
setEquivilantKey key ek
return (Just Verified)
addEquivilantKey :: Backend -> Key -> OsPath -> Annex (Maybe Verification)
addEquivilantKey b key obj =
updateEquivilantKeys b obj key
=<< getEquivilantKeys key
-- The Backend must use a cryptographically secure hash.
generateEquivilantKey :: Backend -> OsPath -> Annex (Maybe Key)
generateEquivilantKey b obj =
case genKey b of
Just genkey -> do
showSideAction (UnquotedString Backend.Hash.descChecksum)
Just <$> genkey source nullMeterUpdate
Nothing -> return Nothing
where
source = KeySource
{ keyFilename = mempty -- avoid adding any extension
, contentLocation = obj
, inodeCache = Nothing
}

View file

@ -81,7 +81,7 @@ runLocal runst runner a = case a of
iv <- startVerifyKeyContentIncrementally DefaultVerify k
let runtransfer ti =
Right <$> transfer download' k af Nothing (\p ->
logStatusAfter NoLiveUpdate k $ getViaTmp rsp DefaultVerify k af Nothing $ \tmp ->
logStatusAfter NoLiveUpdate k $ getViaTmp rsp DefaultVerify k Nothing $ \tmp ->
storefile tmp o l getb iv validitycheck p ti)
let fallback = return $ Left $
ProtoFailureMessage "transfer already in progress, or unable to take transfer lock"

View file

@ -319,22 +319,11 @@ remoteFromUUID u = ifM ((==) u <$> getUUID)
remotesChanged
findinmap
{- Filters a list of remotes to ones that have the listed uuids. -}
remotesWithUUID :: [Remote] -> [UUID] -> [Remote]
remotesWithUUID rs us = filter (\r -> uuid r `elem` us) rs
{- Filters a list of remotes to ones that do not have the listed uuids. -}
remotesWithoutUUID :: [Remote] -> [UUID] -> [Remote]
remotesWithoutUUID rs us = filter (\r -> uuid r `notElem` us) rs
{- List of repository UUIDs that the location log indicates may have a key.
- Dead repositories are excluded. -}
keyLocations :: Key -> Annex [UUID]
keyLocations key = trustExclude DeadTrusted =<< loggedLocations key
{- Whether to include remotes that have annex-ignore set. -}
newtype IncludeIgnored = IncludeIgnored Bool
{- Cost ordered lists of remotes that the location log indicates
- may have a key.
-
@ -342,33 +331,16 @@ newtype IncludeIgnored = IncludeIgnored Bool
-}
keyPossibilities :: IncludeIgnored -> Key -> Annex [Remote]
keyPossibilities ii key = do
u <- getUUID
-- uuids of all remotes that are recorded to have the key
locations <- filter (/= u) <$> keyLocations key
speclocations <- map uuid
. filter (remoteAnnexSpeculatePresent . gitconfig)
<$> remoteList
-- there are unlikely to be many speclocations, so building a Set
-- is not worth the expense
let locations' = speclocations ++ filter (`notElem` speclocations) locations
fst <$> remoteLocations ii locations' []
locations <- keyLocations key
keyPossibilities' ii key locations =<< remoteList
{- Given a list of locations of a key, and a list of all
- trusted repositories, generates a cost-ordered list of
- remotes that contain the key, and a list of trusted locations of the key.
-}
remoteLocations :: IncludeIgnored -> [UUID] -> [UUID] -> Annex ([Remote], [UUID])
remoteLocations (IncludeIgnored ii) locations trusted = do
let validtrustedlocations = nub locations `intersect` trusted
-- remotes that match uuids that have the key
allremotes <- remoteList
>>= if not ii
then filterM (not <$$> liftIO . getDynamicConfig . remoteAnnexIgnore . gitconfig)
else return
let validremotes = remotesWithUUID allremotes locations
return (sortBy (comparing cost) validremotes, validtrustedlocations)
remoteLocations ii locations trusted =
remoteLocations' ii locations trusted =<< remoteList
{- Displays known locations of a key and helps the user take action
- to make them accessible. -}

709
Remote/Compute.hs Normal file
View file

@ -0,0 +1,709 @@
{- Compute remote.
-
- Copyright 2025 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE OverloadedStrings #-}
module Remote.Compute (
remote,
isComputeRemote,
ComputeState(..),
setComputeState,
getComputeState,
computeStateUrl,
ComputeProgram,
getComputeProgram,
runComputeProgram,
ImmutableState(..),
ComputeProgramResult(..),
computationBehaviorChangeError,
defaultComputeParams,
) where
import Annex.Common
import qualified Annex
import Types.Remote
import Types.ProposedAccepted
import Types.MetaData
import Types.Creds
import Config
import Config.Cost
import Remote.Helper.Special
import Remote.Helper.ExportImport
import Remote.List.Util
import Annex.SpecialRemote.Config
import Annex.UUID
import Annex.Content
import Annex.Tmp
import Annex.GitShaKey
import Annex.CatFile
import Annex.RepoSize.LiveUpdate
import qualified Annex.Transfer
import Logs.MetaData
import Logs.EquivilantKeys
import Logs.Location
import Messages.Progress
import Utility.Metered
import Utility.TimeStamp
import Utility.Env
import Utility.Tmp.Dir
import Utility.Url
import Utility.MonotonicClock
import Types.Key
import Backend
import qualified Git
import qualified Utility.FileIO as F
import qualified Utility.SimpleProtocol as Proto
import Network.HTTP.Types.URI
import Data.Time.Clock
import Text.Read
import Control.Concurrent.STM
import Control.Concurrent.Async
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.ByteString as B
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
remote :: RemoteType
remote = RemoteType
{ typename = "compute"
, enumerate = const $ findSpecialRemotes "compute"
, generate = gen
, configParser = computeConfigParser
, setup = setupInstance
, exportSupported = exportUnsupported
, importSupported = importUnsupported
, thirdPartyPopulated = False
}
isComputeRemote :: Remote -> Bool
isComputeRemote r = typename (remotetype r) == typename remote
gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> RemoteStateHandle -> Annex (Maybe Remote)
gen r u rc gc rs = case getComputeProgram' rc of
Left _err -> return Nothing
Right program -> do
c <- parsedRemoteConfig remote rc
cst <- remoteCost gc c veryExpensiveRemoteCost
return $ Just $ mk program c cst
where
mk program c cst = Remote
{ uuid = u
, cost = cst
, name = Git.repoDescribe r
, storeKey = storeKeyUnsupported
, retrieveKeyFile = computeKey rs program
, retrieveKeyFileInOrder = pure True
, retrieveKeyFileCheap = Nothing
, retrievalSecurityPolicy = RetrievalAllKeysSecure
, removeKey = dropKey rs
, lockContent = Nothing
, checkPresent = checkKey rs
, checkPresentCheap = False
, exportActions = exportUnsupported
, importActions = importUnsupported
, whereisKey = Nothing
, remoteFsck = Nothing
, repairRepo = Nothing
, config = c
, gitconfig = gc
, localpath = Nothing
, getRepo = return r
, readonly = True
, appendonly = False
, untrustworthy = False
, availability = pure LocallyAvailable
, remotetype = remote
, mkUnavailable = return Nothing
, getInfo = return []
, claimUrl = Nothing
, checkUrl = Nothing
, remoteStateHandle = rs
}
setupInstance :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> Annex (RemoteConfig, UUID)
setupInstance ss mu _ c _ = do
ComputeProgram program <- either giveup return $ getComputeProgram' c
allowedprograms <- maybe [] words . annexAllowedComputePrograms
<$> Annex.getGitConfig
case ss of
Init -> noop
_ -> unless (program `elem` allowedprograms) $ do
let remotename = fromMaybe "(unknown)" (lookupName c)
giveup $ unwords
[ "Unable to enable compute special remote"
, remotename
, "because its compute program"
, program
, "is not listed in annex.security-allowed-compute-programs"
]
unlessM (liftIO $ inSearchPath program) $
giveup $ "Cannot find " ++ program ++ " in PATH"
u <- maybe (liftIO genUUID) return mu
gitConfigSpecialRemote u c [("compute", "true")]
return (c, u)
computeConfigParser :: RemoteConfig -> Annex RemoteConfigParser
computeConfigParser _ = return $ RemoteConfigParser
{ remoteConfigFieldParsers =
[ optionalStringParser programField
(FieldDesc $ "compute program (must start with \"" ++ safetyPrefix ++ "\")")
]
-- Pass through all other params, which git-annex addcomputed adds
-- to the input params.
, remoteConfigRestPassthrough = Just
( const True
, [("*", FieldDesc "all other parameters are passed to compute program")]
)
}
defaultComputeParams :: Remote -> [String]
defaultComputeParams = map mk . M.toList . getRemoteConfigPassedThrough . config
where
mk (f, v) = fromProposedAccepted f ++ '=' : v
newtype ComputeProgram = ComputeProgram String
deriving (Show)
getComputeProgram :: Remote -> Annex ComputeProgram
getComputeProgram r =
case getComputeProgram' (unparsedRemoteConfig (config r)) of
Right program -> return program
Left err -> giveup $
"Problem with the configuration of compute remote " ++ name r ++ ": " ++ err
getComputeProgram' :: RemoteConfig -> Either String ComputeProgram
getComputeProgram' c = case fromProposedAccepted <$> M.lookup programField c of
Just program
| safetyPrefix `isPrefixOf` program ->
Right (ComputeProgram program)
| otherwise -> Left $
"The program's name must begin with \"" ++ safetyPrefix ++ "\""
Nothing -> Left "Specify program="
-- Limiting the program to "git-annex-compute-" prefix is important for
-- security, it prevents autoenabled compute remotes from running arbitrary
-- programs.
safetyPrefix :: String
safetyPrefix = "git-annex-compute-"
programField :: RemoteConfigField
programField = Accepted "program"
data ProcessCommand
= ProcessInput FilePath
| ProcessOutput FilePath
| ProcessReproducible
| ProcessProgress PercentFloat
deriving (Show, Eq)
instance Proto.Receivable ProcessCommand where
parseCommand "INPUT" = Proto.parse1 ProcessInput
parseCommand "OUTPUT" = Proto.parse1 ProcessOutput
parseCommand "REPRODUCIBLE" = Proto.parse0 ProcessReproducible
parseCommand "PROGRESS" = Proto.parse1 ProcessProgress
parseCommand _ = Proto.parseFail
newtype PercentFloat = PercentFloat Float
deriving (Show, Eq)
instance Proto.Serializable PercentFloat where
serialize (PercentFloat p) = show p ++ "%"
deserialize s = do
s' <- reverse <$> stripPrefix "%" (reverse s)
PercentFloat <$> readMaybe s'
data ComputeState = ComputeState
{ computeParams :: [String]
, computeInputs :: M.Map OsPath Key
, computeOutputs :: M.Map OsPath (Maybe Key)
, computeSubdir :: OsPath
}
deriving (Show, Eq)
{- Formats a ComputeState as an URL query string.
-
- Prefixes computeParams with 'p', computeInputs with 'i',
- and computeOutputs with 'o'. Uses "d" for computeSubdir.
-
- When the passed Key is an output, rather than duplicate it
- in the query string, that output has no value.
-
- Example: "psomefile&pdestfile&pbaz&isomefile=WORM--foo&odestfile=&d=subdir"
-
- The computeParams are in the order they were given. The computeInputs
- and computeOutputs are sorted in ascending order for stability.
-}
formatComputeState :: Key -> ComputeState -> B.ByteString
formatComputeState k = formatComputeState' (Just k)
formatComputeState' :: Maybe Key -> ComputeState -> B.ByteString
formatComputeState' mk st = renderQuery False $ concat
[ map formatparam (computeParams st)
, map formatinput (M.toAscList (computeInputs st))
, mapMaybe formatoutput (M.toAscList (computeOutputs st))
, [("d", Just (fromOsPath (computeSubdir st)))]
]
where
formatparam p = ("p" <> encodeBS p, Nothing)
formatinput (file, key) =
("i" <> fromOsPath file, Just (serializeKey' key))
formatoutput (file, (Just key)) = Just $
("o" <> fromOsPath file,
if Just key == mk
then Nothing
else Just (serializeKey' key)
)
formatoutput (_, Nothing) = Nothing
parseComputeState :: Key -> B.ByteString -> Maybe ComputeState
parseComputeState k b =
let st = go emptycomputestate (parseQuery b)
in if st == emptycomputestate then Nothing else Just st
where
emptycomputestate = ComputeState
{ computeParams = mempty
, computeInputs = mempty
, computeOutputs = mempty
, computeSubdir = literalOsPath "."
}
go :: ComputeState -> [QueryItem] -> ComputeState
go c [] = c { computeParams = reverse (computeParams c) }
go c ((f, v):rest) =
let c' = fromMaybe c $ case decodeBS f of
('p':p) -> Just $ c
{ computeParams = p : computeParams c
}
('i':i) -> do
key <- deserializeKey' =<< v
Just $ c
{ computeInputs =
M.insert (toOsPath i) key
(computeInputs c)
}
('o':o) -> case v of
Just kv -> do
key <- deserializeKey' kv
Just $ c
{ computeOutputs =
M.insert (toOsPath o)
(Just key)
(computeOutputs c)
}
Nothing -> Just $ c
{ computeOutputs =
M.insert (toOsPath o)
(Just k)
(computeOutputs c)
}
('d':[]) -> do
subdir <- v
Just $ c
{ computeSubdir = toOsPath subdir
}
_ -> Nothing
in go c' rest
{- A compute: url for a given output file of a computation. -}
computeStateUrl :: Remote -> ComputeState -> OsPath -> URLString
computeStateUrl r st p =
"annex-compute:" ++ fromUUID (uuid r) ++ "/" ++ fromOsPath p ++ "?"
++ decodeBS (formatComputeState' Nothing st')
where
-- Omit computeOutputs, so this gives the same result whether
-- it's called on a ComputeState with the computeOutputs
-- Keys populated or not.
st' = st { computeOutputs = mempty }
{- The per remote metadata is used to store ComputeState. This allows
- recording multiple ComputeStates that generate the same key.
-
- The metadata fields are numbers (prefixed with "t" to make them legal
- field names), which are estimates of how long it might take to run
- the computation (in seconds).
-
- Avoids redundantly recording a ComputeState when the per remote metadata
- already contains it.
-}
setComputeState :: RemoteStateHandle -> Key -> NominalDiffTime -> ComputeState -> Annex ()
setComputeState rs k ts st = do
l <- map snd <$> getComputeStatesUnsorted rs k
unless (st `elem` l) go
where
go = addRemoteMetaData k rs $ MetaData $ M.singleton
(mkMetaFieldUnchecked $ T.pack ('t':show (truncateResolution 1 ts)))
(S.singleton (MetaValue (CurrentlySet True) (formatComputeState k st)))
{- When multiple ComputeStates have been recorded for the same key,
- this returns one that is probably less expensive to compute,
- based on the original time it took to compute it. -}
getComputeState :: RemoteStateHandle -> Key -> Annex (Maybe ComputeState)
getComputeState rs k = headMaybe . map snd . sortOn fst
<$> getComputeStatesUnsorted rs k
getComputeStatesUnsorted :: RemoteStateHandle -> Key -> Annex [(NominalDiffTime, ComputeState)]
getComputeStatesUnsorted rs k = do
RemoteMetaData _ (MetaData m) <- getCurrentRemoteMetaData rs k
return $ go [] (M.toList m)
where
go c [] = concat c
go c ((f, s) : rest) =
let sts = mapMaybe (parseComputeState k . fromMetaValue)
(S.toList s)
in case parsePOSIXTime (T.encodeUtf8 (T.drop 1 (fromMetaField f))) of
Just ts -> go (zip (repeat ts) sts : c) rest
Nothing -> go c rest
computeProgramEnvironment :: ComputeState -> Annex [(String, String)]
computeProgramEnvironment st = do
environ <- filter (caninherit . fst) <$> liftIO getEnvironment
let addenv = mapMaybe go (computeParams st)
return $ environ ++ addenv
where
envprefix = "ANNEX_COMPUTE_"
caninherit v = not (envprefix `isPrefixOf` v)
go p
| '=' `elem` p =
let (f, v) = separate (== '=') p
in Just (envprefix ++ f, v)
| otherwise = Nothing
newtype ImmutableState = ImmutableState Bool
data ComputeProgramResult = ComputeProgramResult
{ computeState :: ComputeState
, computeInputsUnavailable :: Bool
, computeReproducible :: Bool
}
runComputeProgram
:: ComputeProgram
-> ComputeState
-> ImmutableState
-> (OsPath -> Annex (Key, Maybe (Either Git.Sha OsPath)))
-- ^ get input file's content, or Nothing the input file's
-- content is not available
-> Maybe (Key, MeterUpdate)
-- ^ update meter for this key
-> (ComputeProgramResult -> OsPath -> NominalDiffTime -> Annex v)
-> Annex v
runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate) getinputcontent meterkey cont =
withOtherTmp $ \othertmpdir ->
withTmpDirIn othertmpdir (literalOsPath "compute") go
where
go tmpdir = do
environ <- computeProgramEnvironment state
subdir <- liftIO $ getsubdir tmpdir
let pr = (proc program (computeParams state))
{ cwd = Just (fromOsPath subdir)
, std_in = CreatePipe
, std_out = CreatePipe
, env = Just environ
}
showOutput
starttime <- liftIO currentMonotonicTimestamp
let startresult = ComputeProgramResult state False False
result <- withmeterfile $ \meterfile -> bracket
(liftIO $ createProcess pr)
(liftIO . cleanupProcess)
(getinput tmpdir subdir startresult meterfile)
endtime <- liftIO currentMonotonicTimestamp
cont result subdir (calcduration starttime endtime)
getsubdir tmpdir = do
let subdir = tmpdir </> computeSubdir state
ifM (dirContains <$> absPath tmpdir <*> absPath subdir)
( do
createDirectoryIfMissing True subdir
return subdir
-- Ignore unsafe value in state.
, return tmpdir
)
getinput tmpdir subdir result meterfile p =
liftIO (hGetLineUntilExitOrEOF (processHandle p) (stdoutHandle p)) >>= \case
Just l
| null l -> getinput tmpdir subdir result meterfile p
| otherwise -> do
result' <- parseoutput p tmpdir subdir result meterfile l
getinput tmpdir subdir result' meterfile p
Nothing -> do
liftIO $ hClose (stdoutHandle p)
liftIO $ hClose (stdinHandle p)
unlessM (liftIO $ checkSuccessProcess (processHandle p)) $
giveup $ program ++ " exited unsuccessfully"
return result
parseoutput p tmpdir subdir result meterfile l = case Proto.parseMessage l of
Just (ProcessInput f) -> do
let f' = toOsPath f
let knowninput = M.member f'
(computeInputs (computeState result))
checksafefile tmpdir subdir f' "input"
checkimmutable knowninput "inputting" f' $ do
(k, inputcontent) <- getinputcontent f'
mp <- case inputcontent of
Nothing -> pure Nothing
Just (Right f'') -> liftIO $
Just <$> relPathDirToFile subdir f''
Just (Left gitsha) ->
Just <$> (liftIO . relPathDirToFile subdir
=<< populategitsha gitsha tmpdir)
liftIO $ hPutStrLn (stdinHandle p) $
maybe "" fromOsPath mp
liftIO $ hFlush (stdinHandle p)
let result' = result
{ computeInputsUnavailable =
isNothing mp || computeInputsUnavailable result
}
return $ if immutablestate
then result'
else modresultstate result' $ \s -> s
{ computeInputs =
M.insert f' k
(computeInputs s)
}
Just (ProcessOutput f) -> do
let f' = toOsPath f
checksafefile tmpdir subdir f' "output"
knownoutput <- case M.lookup f' (computeOutputs $ computeState result) of
Nothing -> return False
Just mk -> do
when (mk == fmap fst meterkey) $
meterfile (subdir </> f')
return True
checkimmutable knownoutput "outputting" f' $
return $ if immutablestate
then result
else modresultstate result $ \s -> s
{ computeOutputs =
M.insert f' Nothing
(computeOutputs s)
}
Just (ProcessProgress percent) -> do
liftIO $ updatepercent percent
return result
Just ProcessReproducible ->
return $ result { computeReproducible = True }
Nothing -> giveup $
program ++ " output an unparseable line: \"" ++ l ++ "\""
modresultstate result f =
result { computeState = f (computeState result) }
checksafefile tmpdir subdir f fileaction = do
let err problem = giveup $
program ++ " tried to " ++ fileaction ++ " a file that is " ++ problem ++ ": " ++ fromOsPath f
unlessM (liftIO $ dirContains <$> absPath tmpdir <*> absPath (subdir </> f)) $
err "outside the git repository"
when (any (\p -> dropTrailingPathSeparator p == literalOsPath ".git") (splitPath f)) $
err "inside the .git directory"
checkimmutable True _ _ a = a
checkimmutable False requestdesc p a
| not immutablestate = a
| otherwise = computationBehaviorChangeError (ComputeProgram program) requestdesc p
calcduration (MonotonicTimestamp starttime) (MonotonicTimestamp endtime) =
fromIntegral (endtime - starttime) :: NominalDiffTime
-- Writes to a .git/objects/ file in the tmpdir, rather than
-- using the input filename, to avoid exposing the input filename
-- to the program as a parameter, which could parse it as a dashed
-- option or other special parameter.
populategitsha gitsha tmpdir = do
let f = tmpdir </> literalOsPath ".git" </> literalOsPath "objects"
</> toOsPath (Git.fromRef' gitsha)
liftIO $ createDirectoryIfMissing True $ takeDirectory f
liftIO . F.writeFile f =<< catObject gitsha
return f
withmeterfile a = case meterkey of
Nothing -> a (const noop)
Just (_, progress) -> do
filev <- liftIO newEmptyTMVarIO
endv <- liftIO $ newEmptyTMVarIO
let meterfile = void . liftIO
. atomically . tryPutTMVar filev
let endmeterfile = atomically $ putTMVar endv ()
tid <- liftIO $ async $ do
v <- liftIO $ atomically $
(Right <$> takeTMVar filev)
`orElse`
(Left <$> takeTMVar endv)
case v of
Right f -> watchFileSize f progress $ \_ ->
void $ liftIO $ atomically $
takeTMVar endv
Left () -> return ()
a meterfile
`finally` liftIO (endmeterfile >> wait tid)
updatepercent (PercentFloat percent) = case meterkey of
Nothing -> noop
Just (k, progress) -> case fromKey keySize k of
Nothing -> noop
Just sz ->
progress $ BytesProcessed $ floor $
fromIntegral sz * percent / 100
computationBehaviorChangeError :: ComputeProgram -> String -> OsPath -> Annex a
computationBehaviorChangeError (ComputeProgram program) requestdesc p =
giveup $ program ++ " is not behaving the same way it used to, now " ++ requestdesc ++ ": " ++ fromOsPath p
computeKey :: RemoteStateHandle -> ComputeProgram -> Key -> AssociatedFile -> OsPath -> MeterUpdate -> VerifyConfig -> Annex Verification
computeKey rs (ComputeProgram program) k _af dest meterupdate vc =
getComputeState rs k >>= \case
Just state ->
case computeskey state of
Just keyfile -> go state keyfile
Nothing -> missingstate
Nothing -> missingstate
where
missingstate = giveup "Missing compute state"
go state keyfile = metered (Just meterupdate) k Nothing $ \_ p ->
runComputeProgram (ComputeProgram program) state
(ImmutableState True)
(getinputcontent state)
(Just (k, p))
(postcompute keyfile)
getinputcontent state f =
case M.lookup f (computeInputs state) of
Just inputkey -> case keyGitSha inputkey of
Nothing ->
let retkey = do
obj <- calcRepo (gitAnnexLocation inputkey)
return (inputkey, Just (Right obj))
in ifM (inAnnex inputkey)
( retkey
, ifM (getinputcontent' f inputkey)
( retkey
, return (inputkey, Nothing)
)
)
Just gitsha ->
return (inputkey, Just (Left gitsha))
Nothing -> error "internal"
getinputcontent' f inputkey = do
remotes <- avoidCycles [k] inputkey
=<< keyPossibilities inputkey
anyM (getinputcontentfrom f inputkey) remotes
getinputcontentfrom f inputkey r = do
showAction $ "getting input " <> QuotedPath f
<> " from " <> UnquotedString (name r)
lu <- prepareLiveUpdate Nothing inputkey AddingKey
logStatusAfter lu inputkey $
Annex.Transfer.download r inputkey (AssociatedFile (Just f))
Annex.Transfer.stdRetry Annex.Transfer.noNotification
computeskey state =
case M.keys $ M.filter (== Just k) (computeOutputs state) of
(keyfile : _) -> Just keyfile
[] -> Nothing
postcompute keyfile result tmpdir _ts
| computeInputsUnavailable result =
giveup "Input file(s) unavailable."
| otherwise =
postcompute' keyfile (computeState result) tmpdir
postcompute' keyfile state tmpdir = do
hb <- hashBackend
let updatevurl key getobj =
if (fromKey keyVariety key == VURLKey)
then addEquivilantKey hb key =<< getobj
else return Nothing
let keyfile' = tmpdir </> keyfile
unlessM (liftIO $ doesFileExist keyfile') $
giveup $ program ++ " exited sucessfully, but failed to write the computed file"
catchNonAsync (liftIO $ moveFile keyfile' dest)
(\err -> giveup $ "failed to move the computed file: " ++ show err)
mverification <- updatevurl k (pure dest)
-- Try to move any other computed object files into the annex.
forM_ (M.toList $ computeOutputs state) $ \case
(file, (Just key)) ->
when (k /= key) $ do
let file' = tmpdir </> file
whenM (liftIO $ doesFileExist file') $ do
whenM (verifyKeyContentPostRetrieval RetrievalAllKeysSecure vc MustVerify key file') $ do
moved <- moveAnnex key file' `catchNonAsync` const (pure False)
when moved $
void $ updatevurl key (calcRepo (gitAnnexLocation key))
_ -> noop
-- The program might not be reproducible,
-- so require strong verification.
return $ fromMaybe MustVerify mverification
keyPossibilities :: Key -> Annex [Remote]
keyPossibilities key = do
remotelist <- Annex.getState Annex.remotes
locs <- loggedLocations key
keyPossibilities' (IncludeIgnored False) key locs remotelist
{- Filter out any remotes that, in order to compute the inputkey, would
- need to get the outputkey from some remote.
-
- This only finds cycles of compute special remotes, not any other
- similar type of special remote that might have its own input keys.
- There are no other such special remotes in git-annex itself, so this
- is the best that can be done.
-
- Note that, in a case where a compute special remote needs the outputkey
- to compute the inputkey, but could get the outputkey from either this
- remote, or some other, non-compute remote, that is filtered out as a
- cycle because it's not possible to prevent that remote getting from this
- remote.
-}
avoidCycles :: [Key] -> Key -> [Remote] -> Annex [Remote]
avoidCycles outputkeys inputkey = filterM go
where
go r
| iscomputeremote r =
getComputeState (remoteStateHandle r) inputkey >>= \case
Nothing -> return True
Just state
| inputsoutput state -> return False
| otherwise -> checkdeeper state
| otherwise = return True
iscomputeremote r = remotetype r == remote
inputsoutput state = not $ M.null $
M.filter (`elem` outputkeys)
(computeInputs state)
checkdeeper state =
flip allM (M.elems (computeInputs state)) $ \inputkey' -> do
rs <- keyPossibilities inputkey'
rs' <- avoidCycles (inputkey:outputkeys) inputkey' rs
return (rs' == rs)
-- Make sure that the compute state exists.
checkKey :: RemoteStateHandle -> Key -> Annex Bool
checkKey rs k = do
states <- getComputeStatesUnsorted rs k
if null states
then giveup "Missing compute state"
else return True
-- Unsetting the compute state will prevent computing the key.
dropKey :: RemoteStateHandle -> Maybe SafeDropProof -> Key -> Annex ()
dropKey rs _ k = do
RemoteMetaData _ old <- getCurrentRemoteMetaData rs k
addRemoteMetaData k rs (modMeta old DelAllMeta)
storeKeyUnsupported :: Key -> AssociatedFile -> Maybe OsPath -> MeterUpdate -> Annex ()
storeKeyUnsupported _ _ _ _ = giveup "transfer to compute remote not supported; use git-annex addcomputed instead"

View file

@ -682,7 +682,7 @@ copyToRemote' repo r st@(State connpool duc _ _ _) key af o meterupdate
let checksuccess = liftIO checkio >>= \case
Just err -> giveup err
Nothing -> return True
logStatusAfter NoLiveUpdate key $ Annex.Content.getViaTmp rsp verify key af (Just sz) $ \dest ->
logStatusAfter NoLiveUpdate key $ Annex.Content.getViaTmp rsp verify key (Just sz) $ \dest ->
metered (Just (combineMeterUpdate meterupdate p)) key bwlimit $ \_ p' ->
copier object dest key p' checksuccess verify
)

View file

@ -40,6 +40,7 @@ import qualified Remote.Borg
import qualified Remote.Rclone
import qualified Remote.Hook
import qualified Remote.External
import qualified Remote.Compute
remoteTypes :: [RemoteType]
remoteTypes = map adjustExportImportRemoteType
@ -63,6 +64,7 @@ remoteTypes = map adjustExportImportRemoteType
, Remote.Rclone.remote
, Remote.Hook.remote
, Remote.External.remote
, Remote.Compute.remote
]
{- Builds a list of all Remotes.

View file

@ -1,6 +1,6 @@
{- git-annex remote list utils
-
- Copyright 2011-2020 Joey Hess <id@joeyh.name>
- Copyright 2011-2025 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -10,6 +10,11 @@ module Remote.List.Util where
import Annex.Common
import qualified Annex
import qualified Git.Config
import Annex.UUID
import Types.Remote
import Config.DynamicConfig
import Data.Ord
{- Call when remotes have changed. Re-reads the git config, and
- invalidates the cache so the remoteList will be re-generated next time
@ -22,3 +27,44 @@ remotesChanged = do
, Annex.gitremotes = Nothing
, Annex.repo = newg
}
{- Whether to include remotes that have annex-ignore set. -}
newtype IncludeIgnored = IncludeIgnored Bool
keyPossibilities'
:: IncludeIgnored
-> Key
-> [UUID]
-- ^ uuids of remotes that are recorded to have the key
-> [Remote]
-- ^ all remotes
-> Annex [Remote]
keyPossibilities' ii _key remotelocations rs = do
u <- getUUID
let locations = filter (/= u) remotelocations
let speclocations = map uuid
$ filter (remoteAnnexSpeculatePresent . gitconfig) rs
-- there are unlikely to be many speclocations, so building a Set
-- is not worth the expense
let locations' = speclocations ++ filter (`notElem` speclocations) locations
fst <$> remoteLocations' ii locations' [] rs
remoteLocations' :: IncludeIgnored -> [UUID] -> [UUID] -> [Remote] -> Annex ([Remote], [UUID])
remoteLocations' (IncludeIgnored ii) locations trusted rs = do
let validtrustedlocations = nub locations `intersect` trusted
-- remotes that match uuids that have the key
allremotes <- if not ii
then filterM (not <$$> liftIO . getDynamicConfig . remoteAnnexIgnore . gitconfig) rs
else return rs
let validremotes = remotesWithUUID allremotes locations
return (sortBy (comparing cost) validremotes, validtrustedlocations)
{- Filters a list of remotes to ones that have the listed uuids. -}
remotesWithUUID :: [Remote] -> [UUID] -> [Remote]
remotesWithUUID rs us = filter (\r -> uuid r `elem` us) rs
{- Filters a list of remotes to ones that do not have the listed uuids. -}
remotesWithoutUUID :: [Remote] -> [UUID] -> [Remote]
remotesWithoutUUID rs us = filter (\r -> uuid r `notElem` us) rs

View file

@ -30,7 +30,6 @@ import Annex.SpecialRemote.Config
import Logs.Remote
import Logs.EquivilantKeys
import Backend
import Backend.VURL.Utilities (generateEquivilantKey)
import qualified Data.Map as M
@ -169,18 +168,8 @@ downloadKey urlincludeexclude key _af dest p vc =
| otherwise = return (Just v)
recordvurlkey eks = do
-- Make sure to pick a backend that is cryptographically
-- secure.
db <- defaultBackend
let b = if isCryptographicallySecure db
then db
else defaultHashBackend
generateEquivilantKey b dest >>= \case
Nothing -> return Nothing
Just ek -> do
unless (ek `elem` eks) $
setEquivilantKey key ek
return (Just Verified)
b <- hashBackend
updateEquivilantKeys b dest key eks
uploadKey :: Key -> AssociatedFile -> Maybe OsPath -> MeterUpdate -> Annex ()
uploadKey _ _ _ _ = giveup "upload to web not supported"

View file

@ -146,6 +146,7 @@ data GitConfig = GitConfig
, annexAllowedUrlSchemes :: S.Set Scheme
, annexAllowedIPAddresses :: String
, annexAllowUnverifiedDownloads :: Bool
, annexAllowedComputePrograms :: Maybe String
, annexMaxExtensionLength :: Maybe Int
, annexMaxExtensions :: Maybe Int
, annexJobs :: Concurrency
@ -261,6 +262,8 @@ extractGitConfig configsource r = GitConfig
getmaybe (annexConfig "security.allowed-http-addresses") -- old name
, annexAllowUnverifiedDownloads = (== Just "ACKTHPPT") $
getmaybe (annexConfig "security.allow-unverified-downloads")
, annexAllowedComputePrograms =
getmaybe (annexConfig "security.allowed-compute-programs")
, annexMaxExtensionLength = getmayberead (annexConfig "maxextensionlength")
, annexMaxExtensions = getmayberead (annexConfig "maxextensions")
, annexJobs = fromMaybe NonConcurrent $

View file

@ -25,8 +25,7 @@ upgrade = do
olddir <- fromRepo gitAnnexDir
keys <- getKeysPresent0 olddir
forM_ keys $ \k ->
moveAnnex k (AssociatedFile Nothing)
(olddir </> toOsPath (keyFile0 k))
moveAnnex k (olddir </> toOsPath (keyFile0 k))
-- update the symlinks to the key files
-- No longer needed here; V1.upgrade does the same thing

View file

@ -85,7 +85,7 @@ moveContent = do
let d = parentDir f
liftIO $ allowWrite d
liftIO $ allowWrite f
_ <- moveAnnex k (AssociatedFile Nothing) f
_ <- moveAnnex k f
liftIO $ removeDirectory d
updateSymlinks :: Annex ()

View file

@ -23,14 +23,17 @@ that is in the form "foo=bar" will also result in an environment variable
being set, eg `ANNEX_COMPUTE_passes=10` or `ANNEX_COMPUTE_--level=9`.
For security, the program should avoid exposing user input to the shell
unprotected, or otherwise executing it.
unprotected, or otherwise executing it. And when running a command, make
sure that whatever user input is passed to it can result in only safe and
expected behavior.
The program is run in a temporary directory, which will be cleaned up after
it exits.
it exits. Note that it may be run in a subdirectory of a temporary
directory. This is done when `git-annex addcomputed` was run in a subdirectory
of the git repository.
The content of any annexed file in the repository can be an input
to the computation. The program requests an input by writing a line to
stdout:
The content of any file in the repository can be an input to the
computation. The program requests an input by writing a line to stdout:
INPUT file.raw
@ -38,8 +41,8 @@ Then it can read a line from stdin, which will be the path to the content
(eg a `.git/annex/objects/` path).
If the program needs multiple input files, it should output multiple
`INPUT` lines at once, and then read multiple paths from stdin. This
allows retrival of the inputs to potentially run in parallel.
`INPUT` lines first, and then read multiple paths from stdin. This
allows retrieval of the inputs to potentially run in parallel.
If an input file is not available, the program's stdin will be closed
without a path being written to it. So when reading from stdin fails,
@ -90,16 +93,17 @@ An example `git-annex-compute-foo` shell script follows:
#!/bin/sh
set -e
if [ "$1" != "convert" ]; then
echo "Usage: convert input output [passes=n]" >&2
exit 1
echo "Usage: convert input output [passes=n]" >&2
exit 1
fi
if [ -z "$ANNEX_COMPUTE_passes" ];
ANNEX_COMPUTE_passes=1
if [ -z "$ANNEX_COMPUTE_passes" ]; then
ANNEX_COMPUTE_passes=1
fi
echo "INPUT "$2"
echo "INPUT $2"
read input
echo "OUTPUT $3"
echo REPRODUCIBLE
if [ -n "$input" ]; then
frobnicate --passes="$ANNEX_COMPUTE_passes" <"$input" >"$3"
mkdir -p "$(dirname "$3")"
frobnicate --passes="$ANNEX_COMPUTE_passes" <"$input" >"$3"
fi

View file

@ -0,0 +1,103 @@
# NAME
git-annex addcomputed - adds computed files to the repository
# SYNOPSIS
git annex addcomputed `--to=remote -- ...`
# DESCRIPTION
Adds files to the annex that are computed from input files in the
repository, using a compute special remote.
Once a file has been added to a compute remote, commands
like `git-annex get` will use it to compute the content of the file.
The syntax of this command after the `--` is up to the program that
the compute special remote is set up to run to perform the comuptation.
To see the program's usage, you can run:
git-annex addcomputed --to=foo
Generally you will provide an input file (or files), and often also an
output filename, and additional parameters to control the computation.
There can be more than one input file that are combined to compute an
output file. And multiple output files can be computed at the same time.
Some examples of how this might look:
git-annex addcomputed --to=x -- convert file.raw file.jpeg passes=10
git-annex addcomputed --to=y -- compress foo --level=9
git-annex addcomputed --to=z -- clip foo 2:01-3:00 combine with bar to baz
Note that parameters that were passed to `git-annex initremote`
when setting up the compute special remote will be appended to the end of
the parameters provided to `git-annex addcomputed`.
# OPTIONS
* `--to=remote`
Specify which remote will compute the files.
This must be a compute remote. For example, one can be
initialized with:
git-annex initremote photoconv type=compute \
program=git-annex-compute-photoconv
For details about compute remotes, and a list of some
of the programs that are available, see
<https://git-annex.branchable.com/special_remotes/compute/>
* `--fast`
Adds computed files to the repository, without doing any work yet to
compute their content.
This implies `--unreproducible`, because even if the compute remote
produces reproducible output, it's not available.
* `--unreproducible`, `-u`
Indicate that the computation is not expected to be fully reproducible.
It can vary, in ways that produce files that equivilant enough to
be interchangeable, but are not necessarily identical.
This is the default unless the compute remote indicates that it produces
reproducible output.
* `--reproducible`, `-r`
Indicate that the computation is expected to be fully reproducible.
This is the default when the compute remote indicates that it produces
reproducible output (except when using `--fast`).
If a computation turns out not to be fully reproducible, then getting
a computed file from the compute remote will later fail with a
checksum verification error. One thing that can be done then is to use
`git-annex recompute --original --unreproducible`.
* `--backend`
Specifies which key-value backend to use.
* Also the [[git-annex-common-options]](1) can be used.
# SEE ALSO
[[git-annex]](1)
[[git-annex-recompute]](1)
[[git-annex-initremote]](1)
# AUTHOR
Joey Hess <id@joeyh.name>
Warning: Automatically converted into a man page by mdwn2man. Edit with care.

View file

@ -52,7 +52,7 @@ want to use `git annex renameremote`.
git annex initremote mys3 type=S3 --whatelse
For a machine-readable list of the parameters, use this with --json.
For a machine-readable list of the parameters, use this with `--json`.
* `--fast`

View file

@ -0,0 +1,69 @@
# NAME
git-annex recompute - recompute computed files
# SYNOPSIS
git-annex recompute [path ...]`
# DESCRIPTION
This updates computed files that were added with
[[git-annex-addcomputed]](1).
By default, this only recomputes files whose input files have changed.
The new contents of the input files are used to re-run the computation.
When the output of the computation is different, the computed file is
updated with the new content. The updated file is written to the worktree,
but is not staged, in order to avoid overwriting any staged changes.
# OPTIONS
* `--original`
Re-run the computation with the original input files.
* `--remote=name`
Only recompute files that were computed by this compute remote.
When this option is not used, all computed files are recomputed using
whatever compute remote was originally used to add them. (In cases where
a file can be computed by multiple remotes, the one with the lowest
configured cost is used.)
* `--unreproducible`, `-u`
Indicate that the computation is not expected to be fully reproducible.
It can vary, in ways that produce files that equivilant enough to
be interchangeable, but are not necessarily identical.
This is the default unless the compute remote indicates that it produces
reproducible output.
* `--reproducible`, `-r`
Indicate that the computation is expected to be fully reproducible.
This is the default when the compute remote indicates that it produces
reproducible output.
* matching options
The [[git-annex-matching-options]](1) can be used to control what
files to recompute.
* Also the [[git-annex-common-options]](1) can be used.
# SEE ALSO
[[git-annex]](1)
[[git-annex-addcomputed]](1)
# AUTHOR
Joey Hess <id@joeyh.name>
Warning: Automatically converted into a man page by mdwn2man. Edit with care.

View file

@ -186,6 +186,18 @@ content from the key-value store.
See [[git-annex-undo]](1) for details.
* `addcomputed`
Adds computed files to the repository.
See [[git-annex-addcomputed]](1) for details.
* `recompute`
Recomputes computed files.
See [[git-annex-recompute]](1) for details.
* `multicast`
Multicast file distribution.
@ -1945,6 +1957,11 @@ Remotes are configured using these settings in `.git/config`.
the location of the borg repository to use. Normally this is automatically
set up by `git annex initremote`, but you can change it if needed.
* `remote.<name>.annex-compute`
Used to identify compute special remotes.
Normally this is automatically set up by `git annex initremote`.
* `remote.<name>.annex-ddarrepo`
Used by ddar special remotes, this configures
@ -2184,6 +2201,13 @@ Remotes are configured using these settings in `.git/config`.
Per-remote configuration of annex.security.allow-unverified-downloads.
* `annex.security.allowed-compute-programs`
This is a space separated list of compute programs eg
"git-annex-compute-foo git-annex-compute-bar". Listing a compute
program here allows compute special remotes that use that program to be
enabled by `git-annex enableremote` or autoenabled.
# CONFIGURATION OF ASSISTANT
* `annex.delayadd`

View file

@ -11,6 +11,7 @@ the content of files.
* [[Amazon_Glacier|glacier]]
* [[bittorrent]]
* [[bup]]
* [[compute]]
* [[ddar]]
* [[directory]]
* [[gcrypt]] (encrypted git repositories!)

View file

@ -0,0 +1,37 @@
While other remotes store the contents of annexed files somewhere,
this special remote uses a program to compute the contents of annexed
files.
To add a file to a compute special remote, use the [[git-annex-addcomputed]]
command. Once a file has been added to a compute special remote, commands
like `git-annex get` will use it to compute the content of the file.
To enable an instance of this special remote:
# git-annex initremote myremote type=compute program=git-annex-compute-foo
The `program` parameter is the only required parameter. It is the name of the
program to use to compute the contents of annexed files. It must start with
"git-annex-compute-". The program needs to be installed somewhere in the
`PATH`.
Any program can be passed to `git-annex initremote`. However, when enabling
a compute special remote later with `git-annex enableremote` or due to
"autoenable=true", the program must be listed in the git config
`annex.security.allowed-compute-programs`.
All other "field=value" parameters passed to `initremote` will be passed
to the program when running [[git-annex-addcomputed]]. Note that when the
program takes a dashed option, it can be provided after "--":
# git-annex initremote myremote type=compute program=git-annex-compute-foo -- --level=9
## compute programs
To write programs used by the compute special remote, see the
[[design/compute_special_remote_interface]].
Have you written a generally useful (and secure) compute program?
List it here!
* ...

View file

@ -0,0 +1,22 @@
[[!comment format=mdwn
username="joey"
subject="""Re: DataLad exploration of the compute on demand space"""
date="2025-03-06T17:39:04Z"
content="""
Thanks for explaining the design points of datalad-remake. Some
different design choices than I have made, but mostly they strike me as
implementing what is easier/possible from outside git-annex.
Eg, storing the compute inputs under `.datalad` in the branch is fine --
and might even be useful if you want to make a branch that changes
something in there -- but of course in the git-annex implementation it
stores the equvilant thing in the git-annex branch.
I do hope I'm not closing off the design space from such differences
by dropping a compute special remote right into git-annex. But I also
expect that having a standard and easy way for at least simple
computations will lead to a lot of contributions as others use it.
Your fMRI case seems like one that my compute remote could handle well
and easily.
"""]]

View file

@ -0,0 +1,69 @@
[[!comment format=mdwn
username="joey"
subject="""comment 22"""
date="2025-03-06T17:54:50Z"
content="""
I've merged the compute special remote now.
See [[special_remotes/compute]], [[git-annex-addcomputed]]
and [[git-annex-recompute]].
I have opened [[todo/compute_special_remote_remaining_todos]] with
some various ways that I want to improve it further. Including, notably,
computing on inputs from submodules, which is not currently supported at
all.
----
Here I'll go down mih's original and quite useful design criteria and see
how the compute special remote applies to them:
### Generate annex keys (that have never existed)
`git-annex addcomputed --fast`
### Re-generate annex keys
`git-annex addcomputed` optionally with the --reproducible option,
followed by a later `git-annex get`
Another thing that fits under this heading is when one of the original
input files has gotten modified, and you want to compute a new version of
the output file from it, using the same method as was used to compute it
before. That's `git-annex recompute $output_file`
### Worktree provisioning?
This is the main thing I didn't implement. Given that git-annex is working
with large files and needs to support various filesystems and OS's that
lack hardlinks and softlinks, it's hard to do this inexpensively.
Also, it turned out to make sense for the compute program to request
the input files it needs, since this lets git-annex learn what the input
files are, so it can make them available when regenerating a computed file
later. And so the protocol just has git-annex respond with the path to
the content of the file.
### Request one key, receive many
This is supported. (So is using multiple inputs to produce one (or more)
outputs.)
### Instruction deposition
`git-annex addcomputed`
### Storage redundancy tests
It did make sense to have it automatically `git-annex get` the inputs.
Well, I think it makes sense in most cases, this may become a tunable
setting of the compute special remote.
### Trust
Handled by requiring the user install a `git-annex-compute-foo` command
in PATH, and provide the name of the command to `initremote`.
And for later `enableremote` or `autoenable=true`, it will only
allow programs that are listed in the annex.security.allowed-compute-programs
git config.
"""]]

View file

@ -0,0 +1,69 @@
This is the remainder of my todo list while I was building the
compute special remote. --[[Joey]]
* write a tip showing how to use this
* Write some simple compute programs so we have something to start with.
- convert between images eg jpeg to png
- run a command in a singularity container (that is one of the inputs)
- run a wasm binary (that is one of the inputs)
* compute on input files in submodules
* annex.diskreserve can be violated if getting a file computes it but also
some other output files, which get added to the annex.
* would be nice to have a way to see what computations are used by a
compute remote for a file. Put it in `whereis` output? But it's not an
url. Maybe a separate command? That would also allow querying for eg,
what files are inputs for another file. Or it could be exposed in the
Remote interface, and made into a file matching option.
* "getting input from <file>" message uses the original filename,
but that file might have been renamed. Would be more clear to use
whatever file in the tree currently points to the key it's getting
(what if there is not one?)
* allow git-annex enableremote with program= explicitly specified,
without checking annex.security.allowed-compute-programs
* addcomputed should honor annex.addunlocked.
What about recompute? It seems it should either write the new version of
the file as an unlocked file when the old version was unlocked, or also
honor annex.addunlocked.
Problem: Since recompute does not stage the file, it would have to write
the content to the working tree. And then the user would need to
git-annex add. But then, if the key was a VURL key, it would add it with
the default backend instead, and the file would no longer use a computed
key.
So it, seems that, for this to be done, recompute would need to stage the
pointer file.
* recompute could ingest keys for other files than the one being
recomputed, and remember them. Then recomputing those files could just
use those keys, without re-running a computation. (Better than --others
which got removed.)
* `git-annex recompute foo bar baz`, when foo depends on bar which depends
on baz, and when baz has changed, will not recompute foo, because bar has
not changed. It then recomputes bar. So running the command again is
needed to recompute foo.
What it could do is, after it recomputes bar, notice that it already
considered foo, and revisit foo, and recompute it then. It could either
use a bloom filter to remember the files it considered but did not
compute, or it could just notice that the command line includes foo
(or includes a directory that contains foo), and then foo is not
modified.
Or it could build a DAG and traverse it, but building a DAG of a large
directory tree has its own problems.
* Should addcomputed honor annex.smallfiles? That would seem to imply
that recompute should also support recomputing non-annexed files.
Otherwise, adding a file and then recomputing it would vary in
what the content of the file is, depending on annex.smallfiles setting.

View file

@ -562,6 +562,7 @@ Executable git-annex
Annex.FileMatcher
Annex.Fixup
Annex.GitOverlay
Annex.GitShaKey
Annex.HashObject
Annex.Hook
Annex.Import
@ -654,6 +655,7 @@ Executable git-annex
CmdLine.Usage
Command
Command.Add
Command.AddComputed
Command.AddUnused
Command.AddUrl
Command.Adjust
@ -727,6 +729,7 @@ Executable git-annex
Command.Proxy
Command.Pull
Command.Push
Command.Recompute
Command.ReKey
Command.ReadPresentKey
Command.RecvKey
@ -930,6 +933,7 @@ Executable git-annex
Remote.BitTorrent
Remote.Borg
Remote.Bup
Remote.Compute
Remote.Ddar
Remote.Directory
Remote.Directory.LegacyChunked