diff --git a/Annex/AdjustedBranch.hs b/Annex/AdjustedBranch.hs index 99cd40e835..95bd8cfc34 100644 --- a/Annex/AdjustedBranch.hs +++ b/Annex/AdjustedBranch.hs @@ -308,16 +308,12 @@ updateAdjustedBranch adj (AdjBranch currbranch) origbranch -- adjustment is stable. return True -{- Passed an action that, if it succeeds may get or drop the Key associated - - with the file. When the adjusted branch needs to be refreshed to reflect +{- Passed an action that, if it succeeds may get or drop a key. + - When the adjusted branch needs to be refreshed to reflect - those changes, it's handled here. - - - - Note that the AssociatedFile must be verified by this to point to the - - Key. In some cases, the value was provided by the user and might not - - really be an associated file. -} -adjustedBranchRefresh :: AssociatedFile -> Annex a -> Annex a -adjustedBranchRefresh _af a = do +adjustedBranchRefresh :: Annex a -> Annex a +adjustedBranchRefresh a = do r <- a go return r diff --git a/Annex/Content.hs b/Annex/Content.hs index c4a0f8490c..f01432669e 100644 --- a/Annex/Content.hs +++ b/Annex/Content.hs @@ -376,16 +376,16 @@ lockContentUsing contentlocker key fallback a = withContentLockFile key $ \mlock {- Runs an action, passing it the temp file to get, - and if the action succeeds, verifies the file matches - the key and moves the file into the annex as a key's content. -} -getViaTmp :: RetrievalSecurityPolicy -> VerifyConfig -> Key -> AssociatedFile -> Maybe FileSize -> (OsPath -> Annex (Bool, Verification)) -> Annex Bool -getViaTmp rsp v key af sz action = +getViaTmp :: RetrievalSecurityPolicy -> VerifyConfig -> Key -> Maybe FileSize -> (OsPath -> Annex (Bool, Verification)) -> Annex Bool +getViaTmp rsp v key sz action = checkDiskSpaceToGet key sz False $ - getViaTmpFromDisk rsp v key af action + getViaTmpFromDisk rsp v key action {- Like getViaTmp, but does not check that there is enough disk space - for the incoming key. For use when the key content is already on disk - and not being copied into place. -} -getViaTmpFromDisk :: RetrievalSecurityPolicy -> VerifyConfig -> Key -> AssociatedFile -> (OsPath -> Annex (Bool, Verification)) -> Annex Bool -getViaTmpFromDisk rsp v key af action = checkallowed $ do +getViaTmpFromDisk :: RetrievalSecurityPolicy -> VerifyConfig -> Key -> (OsPath -> Annex (Bool, Verification)) -> Annex Bool +getViaTmpFromDisk rsp v key action = checkallowed $ do tmpfile <- prepTmp key resuming <- liftIO $ doesPathExist tmpfile (ok, verification) <- action tmpfile @@ -400,7 +400,7 @@ getViaTmpFromDisk rsp v key af action = checkallowed $ do else verification if ok then ifM (verifyKeyContentPostRetrieval rsp v verification' key tmpfile) - ( pruneTmpWorkDirBefore tmpfile (moveAnnex key af) + ( pruneTmpWorkDirBefore tmpfile (moveAnnex key) , do verificationOfContentFailed tmpfile return False @@ -507,8 +507,8 @@ withTmp key action = do - accepted into the repository. Will display a warning message in this - case. May also throw exceptions in some cases. 
-} -moveAnnex :: Key -> AssociatedFile -> OsPath -> Annex Bool -moveAnnex key af src = ifM (checkSecureHashes' key) +moveAnnex :: Key -> OsPath -> Annex Bool +moveAnnex key src = ifM (checkSecureHashes' key) ( do #ifdef mingw32_HOST_OS {- Windows prevents deletion of files that are not @@ -523,7 +523,7 @@ moveAnnex key af src = ifM (checkSecureHashes' key) where storeobject dest = ifM (liftIO $ doesPathExist dest) ( alreadyhave - , adjustedBranchRefresh af $ modifyContentDir dest $ do + , adjustedBranchRefresh $ modifyContentDir dest $ do liftIO $ moveFile src dest -- Freeze the object file now that it is in place. -- Waiting until now to freeze it allows for freeze @@ -776,7 +776,7 @@ removeAnnex (ContentRemovalLock key) = withObjectLoc key $ \file -> -- it's unmodified. resetpointer file = unlessM (liftIO $ isSymbolicLink <$> R.getSymbolicLinkStatus (fromOsPath file)) $ ifM (isUnmodified key file) - ( adjustedBranchRefresh (AssociatedFile (Just file)) $ + ( adjustedBranchRefresh $ depopulatePointerFile key file -- Modified file, so leave it alone. -- If it was a hard link to the annex object, diff --git a/Annex/Export.hs b/Annex/Export.hs index 60039ef3b9..4ce30e2fca 100644 --- a/Annex/Export.hs +++ b/Annex/Export.hs @@ -11,16 +11,13 @@ module Annex.Export where import Annex import Annex.CatFile +import Annex.GitShaKey import Types -import Types.Key import qualified Git import qualified Types.Remote as Remote import Git.Quote import Messages -import Data.Maybe -import qualified Data.ByteString.Short as S (fromShort, toShort) - -- From a sha pointing to the content of a file to the key -- to use to export it. When the file is annexed, it's the annexed key. -- When the file is stored in git, it's a special type of key to indicate @@ -31,31 +28,6 @@ exportKey sha = mk <$> catKey sha mk (Just k) = k mk Nothing = gitShaKey sha --- Encodes a git sha as a key. This is used to represent a non-annexed --- file that is stored on a special remote, which necessarily needs a --- key. --- --- This is not the same as a SHA1 key, because the mapping needs to be --- bijective, also because git may not always use SHA1, and because git --- takes a SHA1 of the file size + content, while git-annex SHA1 keys --- only checksum the content. -gitShaKey :: Git.Sha -> Key -gitShaKey (Git.Ref s) = mkKey $ \kd -> kd - { keyName = S.toShort s - , keyVariety = OtherKey "GIT" - } - --- Reverse of gitShaKey -keyGitSha :: Key -> Maybe Git.Sha -keyGitSha k - | fromKey keyVariety k == OtherKey "GIT" = - Just (Git.Ref (S.fromShort (fromKey keyName k))) - | otherwise = Nothing - --- Is a key storing a git sha, and not used for an annexed file? -isGitShaKey :: Key -> Bool -isGitShaKey = isJust . keyGitSha - warnExportImportConflict :: Remote -> Annex () warnExportImportConflict r = do isimport <- Remote.isImportSupported r diff --git a/Annex/GitShaKey.hs b/Annex/GitShaKey.hs new file mode 100644 index 0000000000..1413039c51 --- /dev/null +++ b/Annex/GitShaKey.hs @@ -0,0 +1,41 @@ +{- Encoding a git sha as a Key + - + - Copyright 2017-2025 Joey Hess + - + - Licensed under the GNU AGPL version 3 or higher. + -} + +{-# LANGUAGE OverloadedStrings #-} + +module Annex.GitShaKey where + +import Types +import Types.Key +import qualified Git + +import Data.Maybe +import qualified Data.ByteString.Short as S (fromShort, toShort) + +-- Encodes a git sha as a Key. This is used to represent a non-annexed +-- file. For example, when storing a git sha on a special remote. 
+-- +-- This is not the same as a SHA1 key, because the mapping needs to be +-- bijective, also because git may not always use SHA1, and because git +-- takes a SHA1 of the file size + content, while git-annex SHA1 keys +-- only checksum the content. +gitShaKey :: Git.Sha -> Key +gitShaKey (Git.Ref s) = mkKey $ \kd -> kd + { keyName = S.toShort s + , keyVariety = OtherKey "GIT" + } + +-- Reverse of gitShaKey +keyGitSha :: Key -> Maybe Git.Sha +keyGitSha k + | fromKey keyVariety k == OtherKey "GIT" = + Just (Git.Ref (S.fromShort (fromKey keyName k))) + | otherwise = Nothing + +-- Is a key storing a git sha, and not used for an annexed file? +isGitShaKey :: Key -> Bool +isGitShaKey = isJust . keyGitSha diff --git a/Annex/Import.hs b/Annex/Import.hs index b351504ace..2d2526a544 100644 --- a/Annex/Import.hs +++ b/Annex/Import.hs @@ -38,12 +38,12 @@ import qualified Annex import Annex.Link import Annex.LockFile import Annex.Content -import Annex.Export import Annex.RemoteTrackingBranch import Annex.HashObject import Annex.Transfer import Annex.CheckIgnore import Annex.CatFile +import Annex.GitShaKey import Annex.VectorClock import Annex.SpecialRemote.Config import Command @@ -863,7 +863,7 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec ia loc [cid] tmpfile (Left k) (combineMeterUpdate p' p) - ok <- moveAnnex k af tmpfile + ok <- moveAnnex k tmpfile when ok $ logStatus NoLiveUpdate k InfoPresent return (Just (k, ok)) @@ -906,7 +906,7 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec p case keyGitSha k of Nothing -> do - ok <- moveAnnex k af tmpfile + ok <- moveAnnex k tmpfile when ok $ do recordcidkey cidmap cid k logStatus NoLiveUpdate k InfoPresent diff --git a/Annex/Ingest.hs b/Annex/Ingest.hs index 695a0cb063..07b5dad282 100644 --- a/Annex/Ingest.hs +++ b/Annex/Ingest.hs @@ -198,17 +198,11 @@ ingest' preferredbackend meterupdate (Just (LockedDown cfg source)) mk restage = | otherwise = gounlocked key mcache golocked key mcache = - tryNonAsync (moveAnnex key naf (contentLocation source)) >>= \case + tryNonAsync (moveAnnex key (contentLocation source)) >>= \case Right True -> success key mcache Right False -> giveup "failed to add content to annex" Left e -> restoreFile (keyFilename source) key e - -- moveAnnex uses the AssociatedFile provided to it to unlock - -- locked files when getting a file in an adjusted branch. - -- That case does not apply here, where we're adding an unlocked - -- file, so provide it nothing. 
- naf = AssociatedFile Nothing - gounlocked key (Just cache) = do -- Remove temp directory hard link first because -- linkToAnnex falls back to copying if a file @@ -377,7 +371,7 @@ addAnnexedFile matcher file key mtmp = ifM (addUnlocked matcher mi (isJust mtmp) stagePointerFile file mode =<< hashPointerFile key Database.Keys.addAssociatedFile key =<< inRepo (toTopFilePath file) case mtmp of - Just tmp -> ifM (moveAnnex key af tmp) + Just tmp -> ifM (moveAnnex key tmp) ( linkunlocked mode >> return True , writepointer mode >> return False ) @@ -388,11 +382,10 @@ addAnnexedFile matcher file key mtmp = ifM (addUnlocked matcher mi (isJust mtmp) , do addSymlink file key Nothing case mtmp of - Just tmp -> moveAnnex key af tmp + Just tmp -> moveAnnex key tmp Nothing -> return True ) where - af = AssociatedFile (Just file) mi = case mtmp of Just tmp -> MatchingFile $ FileInfo { contentFile = tmp diff --git a/Annex/Transfer.hs b/Annex/Transfer.hs index 45969003ae..7ec629e442 100644 --- a/Annex/Transfer.hs +++ b/Annex/Transfer.hs @@ -78,7 +78,7 @@ download r key f d witness = Just StallDetectionDisabled -> go Nothing Just sd -> runTransferrer sd r key f d Download witness where - go sd = getViaTmp (Remote.retrievalSecurityPolicy r) vc key f Nothing $ \dest -> + go sd = getViaTmp (Remote.retrievalSecurityPolicy r) vc key Nothing $ \dest -> download' (Remote.uuid r) key f sd d (go' dest) witness go' dest p = verifiedAction $ Remote.retrieveKeyFile r key f dest p vc diff --git a/Backend.hs b/Backend.hs index 4a7ace6524..de4c7bbee8 100644 --- a/Backend.hs +++ b/Backend.hs @@ -10,13 +10,14 @@ module Backend ( builtinList, defaultBackend, - defaultHashBackend, + hashBackend, genKey, getBackend, chooseBackend, lookupBackendVariety, lookupBuiltinBackendVariety, maybeLookupBackendVariety, + unknownBackendVarietyMessage, isStableKey, isCryptographicallySecureKey, isCryptographicallySecure, @@ -54,6 +55,15 @@ defaultBackend = maybe cache return =<< Annex.getState Annex.backend valid name = not (null name) lookupname = lookupBackendVariety . parseKeyVariety . encodeBS +{- A hashing backend. Takes git config into account, but + - guarantees the backend is cryptographically secure. -} +hashBackend :: Annex Backend +hashBackend = do + db <- defaultBackend + return $ if isCryptographicallySecure db + then db + else defaultHashBackend + {- Generates a key for a file. -} genKey :: KeySource -> MeterUpdate -> Backend -> Annex (Key, Backend) genKey source meterupdate b = case B.genKey b of diff --git a/Backend/VURL/Utilities.hs b/Backend/VURL/Utilities.hs index 82e5939e7c..46b06c41b8 100644 --- a/Backend/VURL/Utilities.hs +++ b/Backend/VURL/Utilities.hs @@ -10,10 +10,8 @@ module Backend.VURL.Utilities where import Annex.Common import Types.Key import Types.Backend -import Types.KeySource import Logs.EquivilantKeys import qualified Backend.Hash -import Utility.Metered migrateFromURLToVURL :: Key -> Backend -> AssociatedFile -> Bool -> Annex (Maybe Key) migrateFromURLToVURL oldkey newbackend _af inannex @@ -41,18 +39,3 @@ migrateFromVURLToURL oldkey newbackend _af _ (keyData oldkey) { keyVariety = URLKey } | otherwise = return Nothing - --- The Backend must use a cryptographically secure hash. 
-generateEquivilantKey :: Backend -> OsPath -> Annex (Maybe Key) -generateEquivilantKey b f = - case genKey b of - Just genkey -> do - showSideAction (UnquotedString Backend.Hash.descChecksum) - Just <$> genkey source nullMeterUpdate - Nothing -> return Nothing - where - source = KeySource - { keyFilename = mempty -- avoid adding any extension - , contentLocation = f - , inodeCache = Nothing - } diff --git a/CHANGELOG b/CHANGELOG index 475277f8f4..8c944a4bfb 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,9 @@ git-annex (10.20250116) UNRELEASED; urgency=medium + * Added the compute special remote. + * addcomputed: New command, adds a file that is generated by a compute + special remote. + * recompute: New command, recomputes computed files. * Support help.autocorrect settings "prompt", "never", and "immediate". * Allow setting remote.foo.annex-tracking-branch to a branch name that contains "/", as long as it's not a remote tracking branch. diff --git a/CmdLine/GitAnnex.hs b/CmdLine/GitAnnex.hs index 6596e269e9..8dc64f8b7b 100644 --- a/CmdLine/GitAnnex.hs +++ b/CmdLine/GitAnnex.hs @@ -133,6 +133,8 @@ import qualified Command.ExtendCluster import qualified Command.UpdateProxy import qualified Command.MaxSize import qualified Command.Sim +import qualified Command.AddComputed +import qualified Command.Recompute import qualified Command.Version import qualified Command.RemoteDaemon #ifdef WITH_ASSISTANT @@ -265,6 +267,8 @@ cmds testoptparser testrunner mkbenchmarkgenerator = map addGitAnnexCommonOption , Command.UpdateProxy.cmd , Command.MaxSize.cmd , Command.Sim.cmd + , Command.AddComputed.cmd + , Command.Recompute.cmd , Command.Version.cmd , Command.RemoteDaemon.cmd #ifdef WITH_ASSISTANT diff --git a/CmdLine/GitRemoteAnnex.hs b/CmdLine/GitRemoteAnnex.hs index 79d6befd5b..beacd137a3 100644 --- a/CmdLine/GitRemoteAnnex.hs +++ b/CmdLine/GitRemoteAnnex.hs @@ -927,7 +927,7 @@ downloadGitBundle rmt k = getKeyExportLocations rmt k >>= \case getexport loc = catchNonAsync (getexport' loc) (const (pure False)) getexport' loc = - getViaTmp rsp vc k (AssociatedFile Nothing) Nothing $ \tmp -> do + getViaTmp rsp vc k Nothing $ \tmp -> do v <- Remote.retrieveExport (Remote.exportActions rmt) k loc tmp nullMeterUpdate return (True, v) @@ -986,7 +986,7 @@ generateGitBundle rmt bs manifest = tmp nullMeterUpdate if (bundlekey `notElem` inManifest manifest) then do - unlessM (moveAnnex bundlekey (AssociatedFile Nothing) tmp) $ + unlessM (moveAnnex bundlekey tmp) $ giveup "Unable to push" return (bundlekey, uploadaction bundlekey) else return (bundlekey, noop) diff --git a/Command/AddComputed.hs b/Command/AddComputed.hs new file mode 100644 index 0000000000..4774caae9b --- /dev/null +++ b/Command/AddComputed.hs @@ -0,0 +1,236 @@ +{- git-annex command + - + - Copyright 2025 Joey Hess + - + - Licensed under the GNU AGPL version 3 or higher. 
+ -} + +{-# LANGUAGE OverloadedStrings #-} + +module Command.AddComputed where + +import Command +import qualified Git +import qualified Git.Types as Git +import qualified Git.Ref as Git +import qualified Annex +import qualified Remote.Compute +import qualified Types.Remote as Remote +import Backend +import Annex.CatFile +import Annex.Content.Presence +import Annex.Ingest +import Annex.UUID +import Annex.GitShaKey +import Types.KeySource +import Types.Key +import Messages.Progress +import Logs.Location +import Logs.EquivilantKeys +import Utility.Metered +import Backend.URL (fromUrl) + +import qualified Data.Map as M +import Data.Time.Clock + +cmd :: Command +cmd = notBareRepo $ withAnnexOptions [backendOption] $ + command "addcomputed" SectionCommon "add computed files to annex" + (paramRepeating paramExpression) + (seek <$$> optParser) + +data AddComputedOptions = AddComputedOptions + { computeParams :: CmdParams + , computeRemote :: DeferredParse Remote + , reproducible :: Maybe Reproducible + } + +optParser :: CmdParamsDesc -> Parser AddComputedOptions +optParser desc = AddComputedOptions + <$> cmdParams desc + <*> (mkParseRemoteOption <$> parseToOption) + <*> parseReproducible + +newtype Reproducible = Reproducible { isReproducible :: Bool } + +parseReproducible :: Parser (Maybe Reproducible) +parseReproducible = r <|> unr + where + r = flag Nothing (Just (Reproducible True)) + ( long "reproducible" + <> short 'r' + <> help "computation is fully reproducible" + ) + unr = flag Nothing (Just (Reproducible False)) + ( long "unreproducible" + <> short 'u' + <> help "computation is not fully reproducible" + ) + +seek :: AddComputedOptions -> CommandSeek +seek o = startConcurrency commandStages (seek' o) + +seek' :: AddComputedOptions -> CommandSeek +seek' o = do + r <- getParsed (computeRemote o) + unless (Remote.Compute.isComputeRemote r) $ + giveup "That is not a compute remote." + + commandAction $ start o r + +start :: AddComputedOptions -> Remote -> CommandStart +start o r = starting "addcomputed" ai si $ perform o r + where + ai = ActionItemUUID (Remote.uuid r) (UnquotedString (Remote.name r)) + si = SeekInput (computeParams o) + +perform :: AddComputedOptions -> Remote -> CommandPerform +perform o r = do + program <- Remote.Compute.getComputeProgram r + repopath <- fromRepo Git.repoPath + subdir <- liftIO $ relPathDirToFile repopath (literalOsPath ".") + let state = Remote.Compute.ComputeState + { Remote.Compute.computeParams = computeParams o ++ + Remote.Compute.defaultComputeParams r + , Remote.Compute.computeInputs = mempty + , Remote.Compute.computeOutputs = mempty + , Remote.Compute.computeSubdir = subdir + } + fast <- Annex.getRead Annex.fast + Remote.Compute.runComputeProgram program state + (Remote.Compute.ImmutableState False) + (getInputContent fast) + Nothing + (addComputed (Just "adding") True r (reproducible o) chooseBackend Just fast) + next $ return True + +addComputed + :: Maybe StringContainingQuotedPath + -> Bool + -> Remote + -> Maybe Reproducible + -> (OsPath -> Annex Backend) + -> (OsPath -> Maybe OsPath) + -> Bool + -> Remote.Compute.ComputeProgramResult + -> OsPath + -> NominalDiffTime + -> Annex () +addComputed maddaction stagefiles r reproducibleconfig choosebackend destfile fast result tmpdir ts = do + when (M.null outputs) $ + giveup "The computation succeeded, but it did not generate any files." 
+ oks <- forM (M.keys outputs) $ \outputfile -> do + case maddaction of + Just addaction -> showAction $ + addaction <> " " <> QuotedPath outputfile + Nothing -> noop + k <- catchNonAsync (addfile outputfile) + (\err -> giveup $ "Failed to ingest output file " ++ fromOsPath outputfile ++ ": " ++ show err) + return (outputfile, Just k) + let state' = state + { Remote.Compute.computeOutputs = M.fromList oks + } + forM_ (mapMaybe snd oks) $ \k -> do + Remote.Compute.setComputeState + (Remote.remoteStateHandle r) + k ts state' + + let u = Remote.uuid r + unlessM (elem u <$> loggedLocations k) $ + logChange NoLiveUpdate k u InfoPresent + where + state = Remote.Compute.computeState result + + outputs = Remote.Compute.computeOutputs state + + addfile outputfile + | fast = do + case destfile outputfile of + Nothing -> noop + Just f + | stagefiles -> addSymlink f stateurlk Nothing + | otherwise -> makelink f stateurlk + return stateurlk + | isreproducible = do + sz <- liftIO $ getFileSize outputfile' + metered Nothing sz Nothing $ \_ p -> + case destfile outputfile of + Just f -> ingesthelper f p Nothing + Nothing -> genkey outputfile p + | otherwise = case destfile outputfile of + Just f -> ingesthelper f nullMeterUpdate + (Just stateurlk) + Nothing -> return stateurlk + where + stateurl = Remote.Compute.computeStateUrl r state outputfile + stateurlk = fromUrl stateurl Nothing True + outputfile' = tmpdir </> outputfile + ld f = LockedDown ldc (ks f) + ks f = KeySource + { keyFilename = f + , contentLocation = outputfile' + , inodeCache = Nothing + } + genkey f p = do + backend <- choosebackend outputfile + fst <$> genKey (ks f) p backend + makelink f k = void $ makeLink f k Nothing + ingesthelper f p mk + | stagefiles = ingestwith $ do + k <- maybe (genkey f p) return mk + ingestAdd' p (Just (ld f)) (Just k) + | otherwise = ingestwith $ do + k <- maybe (genkey f p) return mk + mk' <- fst <$> ingest p (Just (ld f)) (Just k) + maybe noop (makelink f) mk' + return mk' + ingestwith a = a >>= \case + Nothing -> giveup "ingestion failed" + Just k -> do + u <- getUUID + unlessM (elem u <$> loggedLocations k) $ + logStatus NoLiveUpdate k InfoPresent + when (fromKey keyVariety k == VURLKey) $ do + hb <- hashBackend + void $ addEquivilantKey hb k + =<< calcRepo (gitAnnexLocation k) + return k + + ldc = LockDownConfig + { lockingFile = True + , hardlinkFileTmpDir = Nothing + , checkWritePerms = True + } + + isreproducible = case reproducibleconfig of + Just v -> isReproducible v + Nothing -> Remote.Compute.computeReproducible result + +getInputContent :: Bool -> OsPath -> Annex (Key, Maybe (Either Git.Sha OsPath)) +getInputContent fast p = catKeyFile p >>= \case + Just inputkey -> getInputContent' fast inputkey filedesc + Nothing -> inRepo (Git.fileRef p) >>= \case + Just fileref -> catObjectMetaData fileref >>= \case + Just (sha, _, t) + | t == Git.BlobObject -> + getInputContent' fast (gitShaKey sha) filedesc + | otherwise -> + badinput $ ", not a git " ++ decodeBS (Git.fmtObjectType t) + Nothing -> notcheckedin + Nothing -> notcheckedin + where + filedesc = fromOsPath p + badinput s = giveup $ "The computation needs an input file " ++ s ++ ": " ++ fromOsPath p + notcheckedin = badinput "that is not checked into the git repository" + +getInputContent' :: Bool -> Key -> String -> Annex (Key, Maybe (Either Git.Sha OsPath)) +getInputContent' fast inputkey filedesc + | fast = return (inputkey, Nothing) + | otherwise = case keyGitSha inputkey of + Nothing -> ifM (inAnnex inputkey) + ( do + obj <- calcRepo
(gitAnnexLocation inputkey) + return (inputkey, Just (Right obj)) + , giveup $ "The computation needs the content of an annexed file which is not present: " ++ filedesc + ) + Just sha -> return (inputkey, Just (Left sha)) diff --git a/Command/Export.hs b/Command/Export.hs index b4acaac401..3be1a67c93 100644 --- a/Command/Export.hs +++ b/Command/Export.hs @@ -26,6 +26,7 @@ import Types.Remote import Types.Export import Annex.Export import Annex.Content +import Annex.GitShaKey import Annex.Transfer import Annex.CatFile import Annex.FileMatcher diff --git a/Command/Multicast.hs b/Command/Multicast.hs index 280f862fe4..f29db57e47 100644 --- a/Command/Multicast.hs +++ b/Command/Multicast.hs @@ -213,7 +213,7 @@ storeReceived f = do warning $ "Received a file " <> QuotedPath f <> " that is not a git-annex key. Deleting this file." liftIO $ removeWhenExistsWith removeFile f Just k -> void $ logStatusAfter NoLiveUpdate k $ - getViaTmpFromDisk RetrievalVerifiableKeysSecure AlwaysVerify k (AssociatedFile Nothing) $ \dest -> unVerified $ + getViaTmpFromDisk RetrievalVerifiableKeysSecure AlwaysVerify k $ \dest -> unVerified $ liftIO $ catchBoolIO $ do renameFile f dest return True diff --git a/Command/ReKey.hs b/Command/ReKey.hs index 3f02f2ab60..8688dff25c 100644 --- a/Command/ReKey.hs +++ b/Command/ReKey.hs @@ -128,7 +128,7 @@ linkKey file oldkey newkey = ifM (isJust <$> isAnnexLink file) - and vulnerable to corruption. -} linkKey' :: VerifyConfig -> Key -> Key -> Annex Bool linkKey' v oldkey newkey = - getViaTmpFromDisk RetrievalAllKeysSecure v newkey (AssociatedFile Nothing) $ \tmp -> unVerified $ do + getViaTmpFromDisk RetrievalAllKeysSecure v newkey $ \tmp -> unVerified $ do oldobj <- calcRepo (gitAnnexLocation oldkey) isJust <$> linkOrCopy' (return True) newkey oldobj tmp Nothing diff --git a/Command/Recompute.hs b/Command/Recompute.hs new file mode 100644 index 0000000000..6b21ce8ee7 --- /dev/null +++ b/Command/Recompute.hs @@ -0,0 +1,209 @@ +{- git-annex command + - + - Copyright 2025 Joey Hess + - + - Licensed under the GNU AGPL version 3 or higher. 
+ -} + +{-# LANGUAGE OverloadedStrings #-} + +module Command.Recompute where + +import Command +import qualified Remote.Compute +import qualified Remote +import qualified Types.Remote as Remote +import qualified Git.Ref as Git +import Annex.Content +import Annex.UUID +import Annex.CatFile +import Annex.GitShaKey +import Git.FilePath +import Logs.Location +import Command.AddComputed (Reproducible(..), parseReproducible, getInputContent, getInputContent', addComputed) +import Backend (maybeLookupBackendVariety, unknownBackendVarietyMessage) +import Types.Key + +import qualified Data.Map as M + +cmd :: Command +cmd = notBareRepo $ + command "recompute" SectionCommon "recompute computed files" + paramPaths (seek <$$> optParser) + +data RecomputeOptions = RecomputeOptions + { recomputeThese :: CmdParams + , originalOption :: Bool + , reproducible :: Maybe Reproducible + , computeRemote :: Maybe (DeferredParse Remote) + } + +optParser :: CmdParamsDesc -> Parser RecomputeOptions +optParser desc = RecomputeOptions + <$> cmdParams desc + <*> switch + ( long "original" + <> help "recompute using original content of input files" + ) + <*> parseReproducible + <*> optional (mkParseRemoteOption <$> parseRemoteOption) + +seek :: RecomputeOptions -> CommandSeek +seek o = startConcurrency commandStages (seek' o) + +seek' :: RecomputeOptions -> CommandSeek +seek' o = do + computeremote <- maybe (pure Nothing) (Just <$$> getParsed) + (computeRemote o) + let seeker = AnnexedFileSeeker + { startAction = const $ start o computeremote + , checkContentPresent = Nothing + , usesLocationLog = True + } + withFilesInGitAnnex ww seeker + =<< workTreeItems ww (recomputeThese o) + where + ww = WarnUnmatchLsFiles "recompute" + +start :: RecomputeOptions -> Maybe Remote -> SeekInput -> OsPath -> Key -> CommandStart +start o (Just computeremote) si file key = + stopUnless (elem (Remote.uuid computeremote) <$> loggedLocations key) $ + start' o computeremote si file key +start o Nothing si file key = do + rs <- catMaybes <$> (mapM Remote.byUUID =<< loggedLocations key) + case sortOn Remote.cost $ filter Remote.Compute.isComputeRemote rs of + [] -> stop + (r:_) -> start' o r si file key + +start' :: RecomputeOptions -> Remote -> SeekInput -> OsPath -> Key -> CommandStart +start' o r si file key = + Remote.Compute.getComputeState + (Remote.remoteStateHandle r) key >>= \case + Nothing -> stop + Just state -> shouldrecompute state >>= \case + Nothing -> stop + Just mreason -> starting "recompute" ai si $ do + maybe noop showNote mreason + perform o r file key state + where + ai = mkActionItem (key, file) + + shouldrecompute state + | originalOption o = return (Just Nothing) + | otherwise = firstM (inputchanged state) + (M.toList (Remote.Compute.computeInputs state)) + >>= return . \case + Nothing -> Nothing + Just (inputfile, _) -> Just $ Just $ + QuotedPath inputfile <> " changed" + + inputchanged state (inputfile, inputkey) = do + -- Note that the paths from the remote state are not to be + -- trusted to point to a file in the repository, but using + -- the path with git cat-file will only succeed if it + -- is checked into the repository. + p <- fromRepo $ fromTopFilePath $ asTopFilePath $ + Remote.Compute.computeSubdir state </> inputfile + case keyGitSha inputkey of + Nothing -> + catKeyFile p >>= return . \case + Just k -> k /= inputkey + Nothing -> inputfilemissing + Just inputgitsha -> inRepo (Git.fileRef p) >>= \case + Just fileref -> catObjectMetaData fileref >>= return .
\case + Just (sha, _, _) -> sha /= inputgitsha + Nothing -> inputfilemissing + Nothing -> return inputfilemissing + where + -- When an input file is missing, go ahead and recompute. + -- This way, the user will see the computation fail, + -- with an error message that explains the problem. + -- Or, if the input file is only optionally used by the + -- computation, it might succeed. + inputfilemissing = True + +perform :: RecomputeOptions -> Remote -> OsPath -> Key -> Remote.Compute.ComputeState -> CommandPerform +perform o r file origkey origstate = do + program <- Remote.Compute.getComputeProgram r + reproducibleconfig <- getreproducibleconfig + showOutput + Remote.Compute.runComputeProgram program origstate + (Remote.Compute.ImmutableState False) + (getinputcontent program) + Nothing + (go program reproducibleconfig) + next cleanup + where + go program reproducibleconfig result tmpdir ts = do + checkbehaviorchange program + (Remote.Compute.computeState result) + addComputed Nothing False r reproducibleconfig + choosebackend destfile False result tmpdir ts + + checkbehaviorchange program state = do + let check s w a b = forM_ (M.keys (w a)) $ \f -> + unless (M.member f (w b)) $ + Remote.Compute.computationBehaviorChangeError program s f + + check "not using input file" + Remote.Compute.computeInputs origstate state + check "outputting" + Remote.Compute.computeOutputs state origstate + check "not outputting" + Remote.Compute.computeOutputs origstate state + + getinputcontent program p + | originalOption o = + case M.lookup p (Remote.Compute.computeInputs origstate) of + Just inputkey -> getInputContent' False inputkey + (fromOsPath p ++ "(key " ++ serializeKey inputkey ++ ")") + Nothing -> Remote.Compute.computationBehaviorChangeError program + "requesting a new input file" p + | otherwise = getInputContent False p + + destfile outputfile + | Just outputfile == origfile = Just file + | otherwise = Nothing + + origfile = headMaybe $ M.keys $ M.filter (== Just origkey) + (Remote.Compute.computeOutputs origstate) + + origbackendvariety = fromKey keyVariety origkey + + recomputingvurl = case origbackendvariety of + VURLKey -> True + _ -> False + + getreproducibleconfig = case reproducible o of + Just (Reproducible True) -> return (Just (Reproducible True)) + -- A VURL key is used when the computation was + -- unreproducible. So recomputing should too, but that + -- will result in the same VURL key. Since moveAnnex + -- will prefer the current annex object to a new one, + -- delete the annex object first, so that if recomputing + -- generates a new version of the file, it replaces + -- the old version. + v -> if recomputingvurl + then do + lockContentForRemoval origkey noop removeAnnex + return (Just (Reproducible False)) + else return v + + cleanup = do + case reproducible o of + Just (Reproducible True) -> noop + -- in case computation failed, update + -- location log for removal done earlier + _ -> when recomputingvurl $ do + u <- getUUID + unlessM (elem u <$> loggedLocations origkey) $ + logStatus NoLiveUpdate origkey InfoMissing + return True + + choosebackend _outputfile + -- Use the same backend as was used to compute it before, + -- so if the computed file is the same, there will be + -- no change. 
+ | otherwise = maybeLookupBackendVariety origbackendvariety >>= \case + Just b -> return b + Nothing -> giveup $ unknownBackendVarietyMessage origbackendvariety diff --git a/Command/RecvKey.hs b/Command/RecvKey.hs index b1cd926236..c3f0eb3289 100644 --- a/Command/RecvKey.hs +++ b/Command/RecvKey.hs @@ -28,7 +28,7 @@ start :: (SeekInput, Key) -> CommandStart start (_, key) = fieldTransfer Download key $ \_p -> do -- This matches the retrievalSecurityPolicy of Remote.Git let rsp = RetrievalAllKeysSecure - ifM (getViaTmp rsp DefaultVerify key (AssociatedFile Nothing) Nothing go) + ifM (getViaTmp rsp DefaultVerify key Nothing go) ( do logStatus NoLiveUpdate key InfoPresent _ <- quiesce True diff --git a/Command/Reinject.hs b/Command/Reinject.hs index 7ea45623fb..0e5d2651d3 100644 --- a/Command/Reinject.hs +++ b/Command/Reinject.hs @@ -129,7 +129,7 @@ perform src key = do ) where move = checkDiskSpaceToGet key Nothing False $ - moveAnnex key (AssociatedFile Nothing) src + moveAnnex key src cleanup :: Key -> CommandCleanup cleanup key = do diff --git a/Command/SetKey.hs b/Command/SetKey.hs index b7db0200df..0026f82295 100644 --- a/Command/SetKey.hs +++ b/Command/SetKey.hs @@ -36,7 +36,7 @@ perform file key = do -- the file might be on a different filesystem, so moveFile is used -- rather than simply calling moveAnnex; disk space is also -- checked this way. - ok <- getViaTmp RetrievalAllKeysSecure DefaultVerify key (AssociatedFile Nothing) Nothing $ \dest -> unVerified $ + ok <- getViaTmp RetrievalAllKeysSecure DefaultVerify key Nothing $ \dest -> unVerified $ if dest /= file then liftIO $ catchBoolIO $ do moveFile file dest diff --git a/Command/TestRemote.hs b/Command/TestRemote.hs index b35ee6ecb2..3bc161d3fe 100644 --- a/Command/TestRemote.hs +++ b/Command/TestRemote.hs @@ -301,7 +301,7 @@ test runannex mkr mkk = Just verifier -> do loc <- Annex.calcRepo (gitAnnexLocation k) verifier k loc - get r k = logStatusAfter NoLiveUpdate k $ getViaTmp (Remote.retrievalSecurityPolicy r) (RemoteVerify r) k (AssociatedFile Nothing) Nothing $ \dest -> + get r k = logStatusAfter NoLiveUpdate k $ getViaTmp (Remote.retrievalSecurityPolicy r) (RemoteVerify r) k Nothing $ \dest -> tryNonAsync (Remote.retrieveKeyFile r k (AssociatedFile Nothing) dest nullMeterUpdate (RemoteVerify r)) >>= \case Right v -> return (True, v) Left _ -> return (False, UnVerified) @@ -375,13 +375,13 @@ testUnavailable runannex mkr mkk = , check (`notElem` [Right True, Right False]) "checkPresent" $ \r k -> Remote.checkPresent r k , check (== Right False) "retrieveKeyFile" $ \r k -> - logStatusAfter NoLiveUpdate k $ getViaTmp (Remote.retrievalSecurityPolicy r) (RemoteVerify r) k (AssociatedFile Nothing) Nothing $ \dest -> + logStatusAfter NoLiveUpdate k $ getViaTmp (Remote.retrievalSecurityPolicy r) (RemoteVerify r) k Nothing $ \dest -> tryNonAsync (Remote.retrieveKeyFile r k (AssociatedFile Nothing) dest nullMeterUpdate (RemoteVerify r)) >>= \case Right v -> return (True, v) Left _ -> return (False, UnVerified) , check (== Right False) "retrieveKeyFileCheap" $ \r k -> case Remote.retrieveKeyFileCheap r of Nothing -> return False - Just a -> logStatusAfter NoLiveUpdate k $ getViaTmp (Remote.retrievalSecurityPolicy r) (RemoteVerify r) k (AssociatedFile Nothing) Nothing $ \dest -> + Just a -> logStatusAfter NoLiveUpdate k $ getViaTmp (Remote.retrievalSecurityPolicy r) (RemoteVerify r) k Nothing $ \dest -> unVerified $ isRight <$> tryNonAsync (a k (AssociatedFile Nothing) dest) ] @@ -443,7 +443,7 @@ randKey sz = withTmpFile 
(literalOsPath "randkey") $ \f h -> do k <- case Types.Backend.genKey Backend.Hash.testKeyBackend of Just a -> a ks nullMeterUpdate Nothing -> giveup "failed to generate random key (backend problem)" - _ <- moveAnnex k (AssociatedFile Nothing) f + _ <- moveAnnex k f return k getReadonlyKey :: Remote -> OsPath -> Annex Key diff --git a/Command/TransferKey.hs b/Command/TransferKey.hs index 9732e7d656..2425082305 100644 --- a/Command/TransferKey.hs +++ b/Command/TransferKey.hs @@ -63,7 +63,7 @@ toPerform key af remote = go Upload af $ fromPerform :: Key -> AssociatedFile -> Remote -> CommandPerform fromPerform key af remote = go Upload af $ download' (uuid remote) key af Nothing stdRetry $ \p -> - logStatusAfter NoLiveUpdate key $ getViaTmp (retrievalSecurityPolicy remote) vc key af Nothing $ \t -> + logStatusAfter NoLiveUpdate key $ getViaTmp (retrievalSecurityPolicy remote) vc key Nothing $ \t -> tryNonAsync (Remote.retrieveKeyFile remote key af t p vc) >>= \case Right v -> return (True, v) Left e -> do diff --git a/Command/TransferKeys.hs b/Command/TransferKeys.hs index f06a687c71..07a0051ed0 100644 --- a/Command/TransferKeys.hs +++ b/Command/TransferKeys.hs @@ -50,7 +50,7 @@ start = do return True | otherwise = notifyTransfer direction af $ download' (Remote.uuid remote) key af Nothing stdRetry $ \p -> - logStatusAfter NoLiveUpdate key $ getViaTmp (Remote.retrievalSecurityPolicy remote) (RemoteVerify remote) key af Nothing $ \t -> do + logStatusAfter NoLiveUpdate key $ getViaTmp (Remote.retrievalSecurityPolicy remote) (RemoteVerify remote) key Nothing $ \t -> do r <- tryNonAsync (Remote.retrieveKeyFile remote key af t p (RemoteVerify remote)) >>= \case Left e -> do warning (UnquotedString (show e)) diff --git a/Command/Transferrer.hs b/Command/Transferrer.hs index f84f783597..a87fedd2b2 100644 --- a/Command/Transferrer.hs +++ b/Command/Transferrer.hs @@ -55,7 +55,7 @@ start = do -- so caller is responsible for doing notification -- and for retrying, and updating location log, -- and stall canceling. - let go p = getViaTmp (Remote.retrievalSecurityPolicy remote) (RemoteVerify remote) key af Nothing $ \t -> do + let go p = getViaTmp (Remote.retrievalSecurityPolicy remote) (RemoteVerify remote) key Nothing $ \t -> do Remote.verifiedAction (Remote.retrieveKeyFile remote key af t p (RemoteVerify remote)) in download' (Remote.uuid remote) key af Nothing noRetry go noNotification @@ -72,7 +72,7 @@ start = do runner (AssistantDownloadRequest _ key (TransferAssociatedFile file)) remote = notifyTransfer Download file $ download' (Remote.uuid remote) key file Nothing stdRetry $ \p -> - logStatusAfter NoLiveUpdate key $ getViaTmp (Remote.retrievalSecurityPolicy remote) (RemoteVerify remote) key file Nothing $ \t -> do + logStatusAfter NoLiveUpdate key $ getViaTmp (Remote.retrievalSecurityPolicy remote) (RemoteVerify remote) key Nothing $ \t -> do r <- tryNonAsync (Remote.retrieveKeyFile remote key file t p (RemoteVerify remote)) >>= \case Left e -> do warning (UnquotedString (show e)) diff --git a/Git/Types.hs b/Git/Types.hs index a32d07d4f7..1ad145452b 100644 --- a/Git/Types.hs +++ b/Git/Types.hs @@ -145,7 +145,7 @@ newtype RefDate = RefDate String {- Types of objects that can be stored in git. 
-} data ObjectType = BlobObject | CommitObject | TreeObject - deriving (Show) + deriving (Show, Eq) readObjectType :: S.ByteString -> Maybe ObjectType readObjectType "blob" = Just BlobObject diff --git a/Logs/EquivilantKeys.hs b/Logs/EquivilantKeys.hs index 0a0117301e..b238675724 100644 --- a/Logs/EquivilantKeys.hs +++ b/Logs/EquivilantKeys.hs @@ -1,6 +1,6 @@ {- Logs listing keys that are equivalent to a key. - - - Copyright 2024 Joey Hess + - Copyright 2024-2025 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -10,6 +10,9 @@ module Logs.EquivilantKeys ( getEquivilantKeys, setEquivilantKey, + updateEquivilantKeys, + addEquivilantKey, + generateEquivilantKey, ) where import Annex.Common @@ -17,6 +20,11 @@ import qualified Annex import Logs import Logs.Presence import qualified Annex.Branch +import qualified Backend.Hash +import Types.KeySource +import Types.Backend +import Types.Remote (Verification(..)) +import Utility.Metered getEquivilantKeys :: Key -> Annex [Key] getEquivilantKeys key = do @@ -29,3 +37,34 @@ setEquivilantKey key equivkey = do config <- Annex.getGitConfig addLog (Annex.Branch.RegardingUUID []) (equivilantKeysLogFile config key) InfoPresent (LogInfo (serializeKey' equivkey)) + +-- This returns Verified when an equivilant key has been added to the +-- log (or was already in the log). This is to avoid hashing the object +-- again later. +updateEquivilantKeys :: Backend -> OsPath -> Key -> [Key] -> Annex (Maybe Verification) +updateEquivilantKeys b obj key eks = generateEquivilantKey b obj >>= \case + Nothing -> return Nothing + Just ek -> do + unless (ek `elem` eks) $ + setEquivilantKey key ek + return (Just Verified) + +addEquivilantKey :: Backend -> Key -> OsPath -> Annex (Maybe Verification) +addEquivilantKey b key obj = + updateEquivilantKeys b obj key + =<< getEquivilantKeys key + +-- The Backend must use a cryptographically secure hash. +generateEquivilantKey :: Backend -> OsPath -> Annex (Maybe Key) +generateEquivilantKey b obj = + case genKey b of + Just genkey -> do + showSideAction (UnquotedString Backend.Hash.descChecksum) + Just <$> genkey source nullMeterUpdate + Nothing -> return Nothing + where + source = KeySource + { keyFilename = mempty -- avoid adding any extension + , contentLocation = obj + , inodeCache = Nothing + } diff --git a/P2P/Annex.hs b/P2P/Annex.hs index a6beb64eb3..15a829550b 100644 --- a/P2P/Annex.hs +++ b/P2P/Annex.hs @@ -81,7 +81,7 @@ runLocal runst runner a = case a of iv <- startVerifyKeyContentIncrementally DefaultVerify k let runtransfer ti = Right <$> transfer download' k af Nothing (\p -> - logStatusAfter NoLiveUpdate k $ getViaTmp rsp DefaultVerify k af Nothing $ \tmp -> + logStatusAfter NoLiveUpdate k $ getViaTmp rsp DefaultVerify k Nothing $ \tmp -> storefile tmp o l getb iv validitycheck p ti) let fallback = return $ Left $ ProtoFailureMessage "transfer already in progress, or unable to take transfer lock" diff --git a/Remote.hs b/Remote.hs index cfe771bb12..ab75383cfa 100644 --- a/Remote.hs +++ b/Remote.hs @@ -319,22 +319,11 @@ remoteFromUUID u = ifM ((==) u <$> getUUID) remotesChanged findinmap -{- Filters a list of remotes to ones that have the listed uuids. -} -remotesWithUUID :: [Remote] -> [UUID] -> [Remote] -remotesWithUUID rs us = filter (\r -> uuid r `elem` us) rs - -{- Filters a list of remotes to ones that do not have the listed uuids.
-} -remotesWithoutUUID :: [Remote] -> [UUID] -> [Remote] -remotesWithoutUUID rs us = filter (\r -> uuid r `notElem` us) rs - {- List of repository UUIDs that the location log indicates may have a key. - Dead repositories are excluded. -} keyLocations :: Key -> Annex [UUID] keyLocations key = trustExclude DeadTrusted =<< loggedLocations key -{- Whether to include remotes that have annex-ignore set. -} -newtype IncludeIgnored = IncludeIgnored Bool - {- Cost ordered lists of remotes that the location log indicates - may have a key. - @@ -342,33 +331,16 @@ newtype IncludeIgnored = IncludeIgnored Bool -} keyPossibilities :: IncludeIgnored -> Key -> Annex [Remote] keyPossibilities ii key = do - u <- getUUID - -- uuids of all remotes that are recorded to have the key - locations <- filter (/= u) <$> keyLocations key - speclocations <- map uuid - . filter (remoteAnnexSpeculatePresent . gitconfig) - <$> remoteList - -- there are unlikely to be many speclocations, so building a Set - -- is not worth the expense - let locations' = speclocations ++ filter (`notElem` speclocations) locations - fst <$> remoteLocations ii locations' [] + locations <- keyLocations key + keyPossibilities' ii key locations =<< remoteList {- Given a list of locations of a key, and a list of all - trusted repositories, generates a cost-ordered list of - remotes that contain the key, and a list of trusted locations of the key. -} remoteLocations :: IncludeIgnored -> [UUID] -> [UUID] -> Annex ([Remote], [UUID]) -remoteLocations (IncludeIgnored ii) locations trusted = do - let validtrustedlocations = nub locations `intersect` trusted - - -- remotes that match uuids that have the key - allremotes <- remoteList - >>= if not ii - then filterM (not <$$> liftIO . getDynamicConfig . remoteAnnexIgnore . gitconfig) - else return - let validremotes = remotesWithUUID allremotes locations - - return (sortBy (comparing cost) validremotes, validtrustedlocations) +remoteLocations ii locations trusted = + remoteLocations' ii locations trusted =<< remoteList {- Displays known locations of a key and helps the user take action - to make them accessible. -} diff --git a/Remote/Compute.hs b/Remote/Compute.hs new file mode 100644 index 0000000000..b6ec907bda --- /dev/null +++ b/Remote/Compute.hs @@ -0,0 +1,709 @@ +{- Compute remote. + - + - Copyright 2025 Joey Hess + - + - Licensed under the GNU AGPL version 3 or higher. 
+ -} + +{-# LANGUAGE OverloadedStrings #-} + +module Remote.Compute ( + remote, + isComputeRemote, + ComputeState(..), + setComputeState, + getComputeState, + computeStateUrl, + ComputeProgram, + getComputeProgram, + runComputeProgram, + ImmutableState(..), + ComputeProgramResult(..), + computationBehaviorChangeError, + defaultComputeParams, +) where + +import Annex.Common +import qualified Annex +import Types.Remote +import Types.ProposedAccepted +import Types.MetaData +import Types.Creds +import Config +import Config.Cost +import Remote.Helper.Special +import Remote.Helper.ExportImport +import Remote.List.Util +import Annex.SpecialRemote.Config +import Annex.UUID +import Annex.Content +import Annex.Tmp +import Annex.GitShaKey +import Annex.CatFile +import Annex.RepoSize.LiveUpdate +import qualified Annex.Transfer +import Logs.MetaData +import Logs.EquivilantKeys +import Logs.Location +import Messages.Progress +import Utility.Metered +import Utility.TimeStamp +import Utility.Env +import Utility.Tmp.Dir +import Utility.Url +import Utility.MonotonicClock +import Types.Key +import Backend +import qualified Git +import qualified Utility.FileIO as F +import qualified Utility.SimpleProtocol as Proto + +import Network.HTTP.Types.URI +import Data.Time.Clock +import Text.Read +import Control.Concurrent.STM +import Control.Concurrent.Async +import qualified Data.Map as M +import qualified Data.Set as S +import qualified Data.ByteString as B +import qualified Data.Text as T +import qualified Data.Text.Encoding as T + +remote :: RemoteType +remote = RemoteType + { typename = "compute" + , enumerate = const $ findSpecialRemotes "compute" + , generate = gen + , configParser = computeConfigParser + , setup = setupInstance + , exportSupported = exportUnsupported + , importSupported = importUnsupported + , thirdPartyPopulated = False + } + +isComputeRemote :: Remote -> Bool +isComputeRemote r = typename (remotetype r) == typename remote + +gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> RemoteStateHandle -> Annex (Maybe Remote) +gen r u rc gc rs = case getComputeProgram' rc of + Left _err -> return Nothing + Right program -> do + c <- parsedRemoteConfig remote rc + cst <- remoteCost gc c veryExpensiveRemoteCost + return $ Just $ mk program c cst + where + mk program c cst = Remote + { uuid = u + , cost = cst + , name = Git.repoDescribe r + , storeKey = storeKeyUnsupported + , retrieveKeyFile = computeKey rs program + , retrieveKeyFileInOrder = pure True + , retrieveKeyFileCheap = Nothing + , retrievalSecurityPolicy = RetrievalAllKeysSecure + , removeKey = dropKey rs + , lockContent = Nothing + , checkPresent = checkKey rs + , checkPresentCheap = False + , exportActions = exportUnsupported + , importActions = importUnsupported + , whereisKey = Nothing + , remoteFsck = Nothing + , repairRepo = Nothing + , config = c + , gitconfig = gc + , localpath = Nothing + , getRepo = return r + , readonly = True + , appendonly = False + , untrustworthy = False + , availability = pure LocallyAvailable + , remotetype = remote + , mkUnavailable = return Nothing + , getInfo = return [] + , claimUrl = Nothing + , checkUrl = Nothing + , remoteStateHandle = rs + } + +setupInstance :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> Annex (RemoteConfig, UUID) +setupInstance ss mu _ c _ = do + ComputeProgram program <- either giveup return $ getComputeProgram' c + allowedprograms <- maybe [] words . 
annexAllowedComputePrograms + <$> Annex.getGitConfig + case ss of + Init -> noop + _ -> unless (program `elem` allowedprograms) $ do + let remotename = fromMaybe "(unknown)" (lookupName c) + giveup $ unwords + [ "Unable to enable compute special remote" + , remotename + , "because its compute program" + , program + , "is not listed in annex.security-allowed-compute-programs" + ] + unlessM (liftIO $ inSearchPath program) $ + giveup $ "Cannot find " ++ program ++ " in PATH" + u <- maybe (liftIO genUUID) return mu + gitConfigSpecialRemote u c [("compute", "true")] + return (c, u) + +computeConfigParser :: RemoteConfig -> Annex RemoteConfigParser +computeConfigParser _ = return $ RemoteConfigParser + { remoteConfigFieldParsers = + [ optionalStringParser programField + (FieldDesc $ "compute program (must start with \"" ++ safetyPrefix ++ "\")") + ] + -- Pass through all other params, which git-annex addcomputed adds + -- to the input params. + , remoteConfigRestPassthrough = Just + ( const True + , [("*", FieldDesc "all other parameters are passed to compute program")] + ) + } + +defaultComputeParams :: Remote -> [String] +defaultComputeParams = map mk . M.toList . getRemoteConfigPassedThrough . config + where + mk (f, v) = fromProposedAccepted f ++ '=' : v + +newtype ComputeProgram = ComputeProgram String + deriving (Show) + +getComputeProgram :: Remote -> Annex ComputeProgram +getComputeProgram r = + case getComputeProgram' (unparsedRemoteConfig (config r)) of + Right program -> return program + Left err -> giveup $ + "Problem with the configuration of compute remote " ++ name r ++ ": " ++ err + +getComputeProgram' :: RemoteConfig -> Either String ComputeProgram +getComputeProgram' c = case fromProposedAccepted <$> M.lookup programField c of + Just program + | safetyPrefix `isPrefixOf` program -> + Right (ComputeProgram program) + | otherwise -> Left $ + "The program's name must begin with \"" ++ safetyPrefix ++ "\"" + Nothing -> Left "Specify program=" + +-- Limiting the program to "git-annex-compute-" prefix is important for +-- security, it prevents autoenabled compute remotes from running arbitrary +-- programs. +safetyPrefix :: String +safetyPrefix = "git-annex-compute-" + +programField :: RemoteConfigField +programField = Accepted "program" + +data ProcessCommand + = ProcessInput FilePath + | ProcessOutput FilePath + | ProcessReproducible + | ProcessProgress PercentFloat + deriving (Show, Eq) + +instance Proto.Receivable ProcessCommand where + parseCommand "INPUT" = Proto.parse1 ProcessInput + parseCommand "OUTPUT" = Proto.parse1 ProcessOutput + parseCommand "REPRODUCIBLE" = Proto.parse0 ProcessReproducible + parseCommand "PROGRESS" = Proto.parse1 ProcessProgress + parseCommand _ = Proto.parseFail + +newtype PercentFloat = PercentFloat Float + deriving (Show, Eq) + +instance Proto.Serializable PercentFloat where + serialize (PercentFloat p) = show p ++ "%" + deserialize s = do + s' <- reverse <$> stripPrefix "%" (reverse s) + PercentFloat <$> readMaybe s' + +data ComputeState = ComputeState + { computeParams :: [String] + , computeInputs :: M.Map OsPath Key + , computeOutputs :: M.Map OsPath (Maybe Key) + , computeSubdir :: OsPath + } + deriving (Show, Eq) + +{- Formats a ComputeState as an URL query string. + - + - Prefixes computeParams with 'p', computeInputs with 'i', + - and computeOutputs with 'o'. Uses "d" for computeSubdir. + - + - When the passed Key is an output, rather than duplicate it + - in the query string, that output has no value. 
+ - + - Example: "psomefile&pdestfile&pbaz&isomefile=WORM--foo&odestfile=&d=subdir" + - + - The computeParams are in the order they were given. The computeInputs + - and computeOutputs are sorted in ascending order for stability. + -} +formatComputeState :: Key -> ComputeState -> B.ByteString +formatComputeState k = formatComputeState' (Just k) + +formatComputeState' :: Maybe Key -> ComputeState -> B.ByteString +formatComputeState' mk st = renderQuery False $ concat + [ map formatparam (computeParams st) + , map formatinput (M.toAscList (computeInputs st)) + , mapMaybe formatoutput (M.toAscList (computeOutputs st)) + , [("d", Just (fromOsPath (computeSubdir st)))] + ] + where + formatparam p = ("p" <> encodeBS p, Nothing) + formatinput (file, key) = + ("i" <> fromOsPath file, Just (serializeKey' key)) + formatoutput (file, (Just key)) = Just $ + ("o" <> fromOsPath file, + if Just key == mk + then Nothing + else Just (serializeKey' key) + ) + formatoutput (_, Nothing) = Nothing + +parseComputeState :: Key -> B.ByteString -> Maybe ComputeState +parseComputeState k b = + let st = go emptycomputestate (parseQuery b) + in if st == emptycomputestate then Nothing else Just st + where + emptycomputestate = ComputeState + { computeParams = mempty + , computeInputs = mempty + , computeOutputs = mempty + , computeSubdir = literalOsPath "." + } + + go :: ComputeState -> [QueryItem] -> ComputeState + go c [] = c { computeParams = reverse (computeParams c) } + go c ((f, v):rest) = + let c' = fromMaybe c $ case decodeBS f of + ('p':p) -> Just $ c + { computeParams = p : computeParams c + } + ('i':i) -> do + key <- deserializeKey' =<< v + Just $ c + { computeInputs = + M.insert (toOsPath i) key + (computeInputs c) + } + ('o':o) -> case v of + Just kv -> do + key <- deserializeKey' kv + Just $ c + { computeOutputs = + M.insert (toOsPath o) + (Just key) + (computeOutputs c) + } + Nothing -> Just $ c + { computeOutputs = + M.insert (toOsPath o) + (Just k) + (computeOutputs c) + } + ('d':[]) -> do + subdir <- v + Just $ c + { computeSubdir = toOsPath subdir + } + _ -> Nothing + in go c' rest + +{- A compute: url for a given output file of a computation. -} +computeStateUrl :: Remote -> ComputeState -> OsPath -> URLString +computeStateUrl r st p = + "annex-compute:" ++ fromUUID (uuid r) ++ "/" ++ fromOsPath p ++ "?" + ++ decodeBS (formatComputeState' Nothing st') + where + -- Omit computeOutputs, so this gives the same result whether + -- it's called on a ComputeState with the computeOutputs + -- Keys populated or not. + st' = st { computeOutputs = mempty } + +{- The per remote metadata is used to store ComputeState. This allows + - recording multiple ComputeStates that generate the same key. + - + - The metadata fields are numbers (prefixed with "t" to make them legal + - field names), which are estimates of how long it might take to run + - the computation (in seconds). + - + - Avoids redundantly recording a ComputeState when the per remote metadata + - already contains it. 
+ -} +setComputeState :: RemoteStateHandle -> Key -> NominalDiffTime -> ComputeState -> Annex () +setComputeState rs k ts st = do + l <- map snd <$> getComputeStatesUnsorted rs k + unless (st `elem` l) go + where + go = addRemoteMetaData k rs $ MetaData $ M.singleton + (mkMetaFieldUnchecked $ T.pack ('t':show (truncateResolution 1 ts))) + (S.singleton (MetaValue (CurrentlySet True) (formatComputeState k st))) + +{- When multiple ComputeStates have been recorded for the same key, + - this returns one that is probably less expensive to compute, + - based on the original time it took to compute it. -} +getComputeState :: RemoteStateHandle -> Key -> Annex (Maybe ComputeState) +getComputeState rs k = headMaybe . map snd . sortOn fst + <$> getComputeStatesUnsorted rs k + +getComputeStatesUnsorted :: RemoteStateHandle -> Key -> Annex [(NominalDiffTime, ComputeState)] +getComputeStatesUnsorted rs k = do + RemoteMetaData _ (MetaData m) <- getCurrentRemoteMetaData rs k + return $ go [] (M.toList m) + where + go c [] = concat c + go c ((f, s) : rest) = + let sts = mapMaybe (parseComputeState k . fromMetaValue) + (S.toList s) + in case parsePOSIXTime (T.encodeUtf8 (T.drop 1 (fromMetaField f))) of + Just ts -> go (zip (repeat ts) sts : c) rest + Nothing -> go c rest + +computeProgramEnvironment :: ComputeState -> Annex [(String, String)] +computeProgramEnvironment st = do + environ <- filter (caninherit . fst) <$> liftIO getEnvironment + let addenv = mapMaybe go (computeParams st) + return $ environ ++ addenv + where + envprefix = "ANNEX_COMPUTE_" + caninherit v = not (envprefix `isPrefixOf` v) + go p + | '=' `elem` p = + let (f, v) = separate (== '=') p + in Just (envprefix ++ f, v) + | otherwise = Nothing + +newtype ImmutableState = ImmutableState Bool + +data ComputeProgramResult = ComputeProgramResult + { computeState :: ComputeState + , computeInputsUnavailable :: Bool + , computeReproducible :: Bool + } + +runComputeProgram + :: ComputeProgram + -> ComputeState + -> ImmutableState + -> (OsPath -> Annex (Key, Maybe (Either Git.Sha OsPath))) + -- ^ get input file's content, or Nothing if the input file's + -- content is not available + -> Maybe (Key, MeterUpdate) + -- ^ update meter for this key + -> (ComputeProgramResult -> OsPath -> NominalDiffTime -> Annex v) + -> Annex v +runComputeProgram (ComputeProgram program) state (ImmutableState immutablestate) getinputcontent meterkey cont = + withOtherTmp $ \othertmpdir -> + withTmpDirIn othertmpdir (literalOsPath "compute") go + where + go tmpdir = do + environ <- computeProgramEnvironment state + subdir <- liftIO $ getsubdir tmpdir + let pr = (proc program (computeParams state)) + { cwd = Just (fromOsPath subdir) + , std_in = CreatePipe + , std_out = CreatePipe + , env = Just environ + } + showOutput + starttime <- liftIO currentMonotonicTimestamp + let startresult = ComputeProgramResult state False False + result <- withmeterfile $ \meterfile -> bracket + (liftIO $ createProcess pr) + (liftIO . cleanupProcess) + (getinput tmpdir subdir startresult meterfile) + endtime <- liftIO currentMonotonicTimestamp + cont result subdir (calcduration starttime endtime) + + getsubdir tmpdir = do + let subdir = tmpdir </> computeSubdir state + ifM (dirContains <$> absPath tmpdir <*> absPath subdir) + ( do + createDirectoryIfMissing True subdir + return subdir + -- Ignore unsafe value in state.
+ , return tmpdir + ) + + getinput tmpdir subdir result meterfile p = + liftIO (hGetLineUntilExitOrEOF (processHandle p) (stdoutHandle p)) >>= \case + Just l + | null l -> getinput tmpdir subdir result meterfile p + | otherwise -> do + result' <- parseoutput p tmpdir subdir result meterfile l + getinput tmpdir subdir result' meterfile p + Nothing -> do + liftIO $ hClose (stdoutHandle p) + liftIO $ hClose (stdinHandle p) + unlessM (liftIO $ checkSuccessProcess (processHandle p)) $ + giveup $ program ++ " exited unsuccessfully" + return result + + parseoutput p tmpdir subdir result meterfile l = case Proto.parseMessage l of + Just (ProcessInput f) -> do + let f' = toOsPath f + let knowninput = M.member f' + (computeInputs (computeState result)) + checksafefile tmpdir subdir f' "input" + checkimmutable knowninput "inputting" f' $ do + (k, inputcontent) <- getinputcontent f' + mp <- case inputcontent of + Nothing -> pure Nothing + Just (Right f'') -> liftIO $ + Just <$> relPathDirToFile subdir f'' + Just (Left gitsha) -> + Just <$> (liftIO . relPathDirToFile subdir + =<< populategitsha gitsha tmpdir) + liftIO $ hPutStrLn (stdinHandle p) $ + maybe "" fromOsPath mp + liftIO $ hFlush (stdinHandle p) + let result' = result + { computeInputsUnavailable = + isNothing mp || computeInputsUnavailable result + } + return $ if immutablestate + then result' + else modresultstate result' $ \s -> s + { computeInputs = + M.insert f' k + (computeInputs s) + } + Just (ProcessOutput f) -> do + let f' = toOsPath f + checksafefile tmpdir subdir f' "output" + knownoutput <- case M.lookup f' (computeOutputs $ computeState result) of + Nothing -> return False + Just mk -> do + when (mk == fmap fst meterkey) $ + meterfile (subdir </> f') + return True + checkimmutable knownoutput "outputting" f' $ + return $ if immutablestate + then result + else modresultstate result $ \s -> s + { computeOutputs = + M.insert f' Nothing + (computeOutputs s) + } + Just (ProcessProgress percent) -> do + liftIO $ updatepercent percent + return result + Just ProcessReproducible -> + return $ result { computeReproducible = True } + Nothing -> giveup $ + program ++ " output an unparseable line: \"" ++ l ++ "\"" + + modresultstate result f = + result { computeState = f (computeState result) } + + checksafefile tmpdir subdir f fileaction = do + let err problem = giveup $ + program ++ " tried to " ++ fileaction ++ " a file that is " ++ problem ++ ": " ++ fromOsPath f + unlessM (liftIO $ dirContains <$> absPath tmpdir <*> absPath (subdir </> f)) $ + err "outside the git repository" + when (any (\p -> dropTrailingPathSeparator p == literalOsPath ".git") (splitPath f)) $ + err "inside the .git directory" + + checkimmutable True _ _ a = a + checkimmutable False requestdesc p a + | not immutablestate = a + | otherwise = computationBehaviorChangeError (ComputeProgram program) requestdesc p + + calcduration (MonotonicTimestamp starttime) (MonotonicTimestamp endtime) = + fromIntegral (endtime - starttime) :: NominalDiffTime + + -- Writes to a .git/objects/ file in the tmpdir, rather than + -- using the input filename, to avoid exposing the input filename + -- to the program as a parameter, which the program could parse as a + -- dashed option or other special parameter. + populategitsha gitsha tmpdir = do + let f = tmpdir </> literalOsPath ".git" </> literalOsPath "objects" </> toOsPath (Git.fromRef' gitsha) + liftIO $ createDirectoryIfMissing True $ takeDirectory f + liftIO .
F.writeFile f =<< catObject gitsha + return f + + withmeterfile a = case meterkey of + Nothing -> a (const noop) + Just (_, progress) -> do + filev <- liftIO newEmptyTMVarIO + endv <- liftIO $ newEmptyTMVarIO + let meterfile = void . liftIO + . atomically . tryPutTMVar filev + let endmeterfile = atomically $ putTMVar endv () + tid <- liftIO $ async $ do + v <- liftIO $ atomically $ + (Right <$> takeTMVar filev) + `orElse` + (Left <$> takeTMVar endv) + case v of + Right f -> watchFileSize f progress $ \_ -> + void $ liftIO $ atomically $ + takeTMVar endv + Left () -> return () + a meterfile + `finally` liftIO (endmeterfile >> wait tid) + + updatepercent (PercentFloat percent) = case meterkey of + Nothing -> noop + Just (k, progress) -> case fromKey keySize k of + Nothing -> noop + Just sz -> + progress $ BytesProcessed $ floor $ + fromIntegral sz * percent / 100 + +computationBehaviorChangeError :: ComputeProgram -> String -> OsPath -> Annex a +computationBehaviorChangeError (ComputeProgram program) requestdesc p = + giveup $ program ++ " is not behaving the same way it used to, now " ++ requestdesc ++ ": " ++ fromOsPath p + +computeKey :: RemoteStateHandle -> ComputeProgram -> Key -> AssociatedFile -> OsPath -> MeterUpdate -> VerifyConfig -> Annex Verification +computeKey rs (ComputeProgram program) k _af dest meterupdate vc = + getComputeState rs k >>= \case + Just state -> + case computeskey state of + Just keyfile -> go state keyfile + Nothing -> missingstate + Nothing -> missingstate + where + missingstate = giveup "Missing compute state" + + go state keyfile = metered (Just meterupdate) k Nothing $ \_ p -> + runComputeProgram (ComputeProgram program) state + (ImmutableState True) + (getinputcontent state) + (Just (k, p)) + (postcompute keyfile) + + getinputcontent state f = + case M.lookup f (computeInputs state) of + Just inputkey -> case keyGitSha inputkey of + Nothing -> + let retkey = do + obj <- calcRepo (gitAnnexLocation inputkey) + return (inputkey, Just (Right obj)) + in ifM (inAnnex inputkey) + ( retkey + , ifM (getinputcontent' f inputkey) + ( retkey + , return (inputkey, Nothing) + ) + ) + Just gitsha -> + return (inputkey, Just (Left gitsha)) + Nothing -> error "internal" + + getinputcontent' f inputkey = do + remotes <- avoidCycles [k] inputkey + =<< keyPossibilities inputkey + anyM (getinputcontentfrom f inputkey) remotes + + getinputcontentfrom f inputkey r = do + showAction $ "getting input " <> QuotedPath f + <> " from " <> UnquotedString (name r) + lu <- prepareLiveUpdate Nothing inputkey AddingKey + logStatusAfter lu inputkey $ + Annex.Transfer.download r inputkey (AssociatedFile (Just f)) + Annex.Transfer.stdRetry Annex.Transfer.noNotification + + computeskey state = + case M.keys $ M.filter (== Just k) (computeOutputs state) of + (keyfile : _) -> Just keyfile + [] -> Nothing + + postcompute keyfile result tmpdir _ts + | computeInputsUnavailable result = + giveup "Input file(s) unavailable." 
+	| otherwise =
+		postcompute' keyfile (computeState result) tmpdir
+
+	postcompute' keyfile state tmpdir = do
+		hb <- hashBackend
+		let updatevurl key getobj =
+			if (fromKey keyVariety key == VURLKey)
+				then addEquivilantKey hb key =<< getobj
+				else return Nothing
+
+		let keyfile' = tmpdir </> keyfile
+		unlessM (liftIO $ doesFileExist keyfile') $
+			giveup $ program ++ " exited successfully, but failed to write the computed file"
+		catchNonAsync (liftIO $ moveFile keyfile' dest)
+			(\err -> giveup $ "failed to move the computed file: " ++ show err)
+		mverification <- updatevurl k (pure dest)
+
+		-- Try to move any other computed object files into the annex.
+		forM_ (M.toList $ computeOutputs state) $ \case
+			(file, (Just key)) ->
+				when (k /= key) $ do
+					let file' = tmpdir </> file
+					whenM (liftIO $ doesFileExist file') $ do
+						whenM (verifyKeyContentPostRetrieval RetrievalAllKeysSecure vc MustVerify key file') $ do
+							moved <- moveAnnex key file' `catchNonAsync` const (pure False)
+							when moved $
+								void $ updatevurl key (calcRepo (gitAnnexLocation key))
+			_ -> noop
+
+		-- The program might not be reproducible,
+		-- so require strong verification.
+		return $ fromMaybe MustVerify mverification
+
+keyPossibilities :: Key -> Annex [Remote]
+keyPossibilities key = do
+	remotelist <- Annex.getState Annex.remotes
+	locs <- loggedLocations key
+	keyPossibilities' (IncludeIgnored False) key locs remotelist
+
+{- Filter out any remotes that, in order to compute the inputkey, would
+ - need to get the outputkey from some remote.
+ -
+ - This only finds cycles of compute special remotes, not any other
+ - similar type of special remote that might have its own input keys.
+ - There are no other such special remotes in git-annex itself, so this
+ - is the best that can be done.
+ -
+ - Note that, in a case where a compute special remote needs the outputkey
+ - to compute the inputkey, but could get the outputkey from either this
+ - remote, or some other, non-compute remote, that is filtered out as a
+ - cycle because it's not possible to prevent that remote getting from this
+ - remote.
+ -}
+avoidCycles :: [Key] -> Key -> [Remote] -> Annex [Remote]
+avoidCycles outputkeys inputkey = filterM go
+  where
+	go r
+		| iscomputeremote r =
+			getComputeState (remoteStateHandle r) inputkey >>= \case
+				Nothing -> return True
+				Just state
+					| inputsoutput state -> return False
+					| otherwise -> checkdeeper state
+		| otherwise = return True
+
+	iscomputeremote r = remotetype r == remote
+
+	inputsoutput state = not $ M.null $
+		M.filter (`elem` outputkeys)
+			(computeInputs state)
+
+	checkdeeper state =
+		flip allM (M.elems (computeInputs state)) $ \inputkey' -> do
+			rs <- keyPossibilities inputkey'
+			rs' <- avoidCycles (inputkey:outputkeys) inputkey' rs
+			return (rs' == rs)
+
+-- Make sure that the compute state exists.
+checkKey :: RemoteStateHandle -> Key -> Annex Bool
+checkKey rs k = do
+	states <- getComputeStatesUnsorted rs k
+	if null states
+		then giveup "Missing compute state"
+		else return True
+
+-- Unsetting the compute state will prevent computing the key.
+dropKey :: RemoteStateHandle -> Maybe SafeDropProof -> Key -> Annex () +dropKey rs _ k = do + RemoteMetaData _ old <- getCurrentRemoteMetaData rs k + addRemoteMetaData k rs (modMeta old DelAllMeta) + +storeKeyUnsupported :: Key -> AssociatedFile -> Maybe OsPath -> MeterUpdate -> Annex () +storeKeyUnsupported _ _ _ _ = giveup "transfer to compute remote not supported; use git-annex addcomputed instead" diff --git a/Remote/Git.hs b/Remote/Git.hs index 71c6571554..cda705cb0e 100644 --- a/Remote/Git.hs +++ b/Remote/Git.hs @@ -682,7 +682,7 @@ copyToRemote' repo r st@(State connpool duc _ _ _) key af o meterupdate let checksuccess = liftIO checkio >>= \case Just err -> giveup err Nothing -> return True - logStatusAfter NoLiveUpdate key $ Annex.Content.getViaTmp rsp verify key af (Just sz) $ \dest -> + logStatusAfter NoLiveUpdate key $ Annex.Content.getViaTmp rsp verify key (Just sz) $ \dest -> metered (Just (combineMeterUpdate meterupdate p)) key bwlimit $ \_ p' -> copier object dest key p' checksuccess verify ) diff --git a/Remote/List.hs b/Remote/List.hs index a266f2d2f2..9d39ddd81d 100644 --- a/Remote/List.hs +++ b/Remote/List.hs @@ -40,6 +40,7 @@ import qualified Remote.Borg import qualified Remote.Rclone import qualified Remote.Hook import qualified Remote.External +import qualified Remote.Compute remoteTypes :: [RemoteType] remoteTypes = map adjustExportImportRemoteType @@ -63,6 +64,7 @@ remoteTypes = map adjustExportImportRemoteType , Remote.Rclone.remote , Remote.Hook.remote , Remote.External.remote + , Remote.Compute.remote ] {- Builds a list of all Remotes. diff --git a/Remote/List/Util.hs b/Remote/List/Util.hs index 382a98fa5d..e022d23190 100644 --- a/Remote/List/Util.hs +++ b/Remote/List/Util.hs @@ -1,6 +1,6 @@ {- git-annex remote list utils - - - Copyright 2011-2020 Joey Hess + - Copyright 2011-2025 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -10,6 +10,11 @@ module Remote.List.Util where import Annex.Common import qualified Annex import qualified Git.Config +import Annex.UUID +import Types.Remote +import Config.DynamicConfig + +import Data.Ord {- Call when remotes have changed. Re-reads the git config, and - invalidates the cache so the remoteList will be re-generated next time @@ -22,3 +27,44 @@ remotesChanged = do , Annex.gitremotes = Nothing , Annex.repo = newg } + +{- Whether to include remotes that have annex-ignore set. -} +newtype IncludeIgnored = IncludeIgnored Bool + +keyPossibilities' + :: IncludeIgnored + -> Key + -> [UUID] + -- ^ uuids of remotes that are recorded to have the key + -> [Remote] + -- ^ all remotes + -> Annex [Remote] +keyPossibilities' ii _key remotelocations rs = do + u <- getUUID + let locations = filter (/= u) remotelocations + let speclocations = map uuid + $ filter (remoteAnnexSpeculatePresent . gitconfig) rs + -- there are unlikely to be many speclocations, so building a Set + -- is not worth the expense + let locations' = speclocations ++ filter (`notElem` speclocations) locations + fst <$> remoteLocations' ii locations' [] rs + +remoteLocations' :: IncludeIgnored -> [UUID] -> [UUID] -> [Remote] -> Annex ([Remote], [UUID]) +remoteLocations' (IncludeIgnored ii) locations trusted rs = do + let validtrustedlocations = nub locations `intersect` trusted + + -- remotes that match uuids that have the key + allremotes <- if not ii + then filterM (not <$$> liftIO . getDynamicConfig . remoteAnnexIgnore . 
gitconfig) rs + else return rs + let validremotes = remotesWithUUID allremotes locations + + return (sortBy (comparing cost) validremotes, validtrustedlocations) + +{- Filters a list of remotes to ones that have the listed uuids. -} +remotesWithUUID :: [Remote] -> [UUID] -> [Remote] +remotesWithUUID rs us = filter (\r -> uuid r `elem` us) rs + +{- Filters a list of remotes to ones that do not have the listed uuids. -} +remotesWithoutUUID :: [Remote] -> [UUID] -> [Remote] +remotesWithoutUUID rs us = filter (\r -> uuid r `notElem` us) rs diff --git a/Remote/Web.hs b/Remote/Web.hs index 4728a64c6a..a097782efe 100644 --- a/Remote/Web.hs +++ b/Remote/Web.hs @@ -30,7 +30,6 @@ import Annex.SpecialRemote.Config import Logs.Remote import Logs.EquivilantKeys import Backend -import Backend.VURL.Utilities (generateEquivilantKey) import qualified Data.Map as M @@ -169,18 +168,8 @@ downloadKey urlincludeexclude key _af dest p vc = | otherwise = return (Just v) recordvurlkey eks = do - -- Make sure to pick a backend that is cryptographically - -- secure. - db <- defaultBackend - let b = if isCryptographicallySecure db - then db - else defaultHashBackend - generateEquivilantKey b dest >>= \case - Nothing -> return Nothing - Just ek -> do - unless (ek `elem` eks) $ - setEquivilantKey key ek - return (Just Verified) + b <- hashBackend + updateEquivilantKeys b dest key eks uploadKey :: Key -> AssociatedFile -> Maybe OsPath -> MeterUpdate -> Annex () uploadKey _ _ _ _ = giveup "upload to web not supported" diff --git a/Types/GitConfig.hs b/Types/GitConfig.hs index 255778387f..eeae1a0c7e 100644 --- a/Types/GitConfig.hs +++ b/Types/GitConfig.hs @@ -146,6 +146,7 @@ data GitConfig = GitConfig , annexAllowedUrlSchemes :: S.Set Scheme , annexAllowedIPAddresses :: String , annexAllowUnverifiedDownloads :: Bool + , annexAllowedComputePrograms :: Maybe String , annexMaxExtensionLength :: Maybe Int , annexMaxExtensions :: Maybe Int , annexJobs :: Concurrency @@ -261,6 +262,8 @@ extractGitConfig configsource r = GitConfig getmaybe (annexConfig "security.allowed-http-addresses") -- old name , annexAllowUnverifiedDownloads = (== Just "ACKTHPPT") $ getmaybe (annexConfig "security.allow-unverified-downloads") + , annexAllowedComputePrograms = + getmaybe (annexConfig "security.allowed-compute-programs") , annexMaxExtensionLength = getmayberead (annexConfig "maxextensionlength") , annexMaxExtensions = getmayberead (annexConfig "maxextensions") , annexJobs = fromMaybe NonConcurrent $ diff --git a/Upgrade/V0.hs b/Upgrade/V0.hs index ea8c8e7de9..a5cf83e36e 100644 --- a/Upgrade/V0.hs +++ b/Upgrade/V0.hs @@ -25,8 +25,7 @@ upgrade = do olddir <- fromRepo gitAnnexDir keys <- getKeysPresent0 olddir forM_ keys $ \k -> - moveAnnex k (AssociatedFile Nothing) - (olddir toOsPath (keyFile0 k)) + moveAnnex k (olddir toOsPath (keyFile0 k)) -- update the symlinks to the key files -- No longer needed here; V1.upgrade does the same thing diff --git a/Upgrade/V1.hs b/Upgrade/V1.hs index b9ae3af8a8..d0aaba73a3 100644 --- a/Upgrade/V1.hs +++ b/Upgrade/V1.hs @@ -85,7 +85,7 @@ moveContent = do let d = parentDir f liftIO $ allowWrite d liftIO $ allowWrite f - _ <- moveAnnex k (AssociatedFile Nothing) f + _ <- moveAnnex k f liftIO $ removeDirectory d updateSymlinks :: Annex () diff --git a/doc/design/compute_special_remote_interface.mdwn b/doc/design/compute_special_remote_interface.mdwn index 5c771c17ad..0dfd93e314 100644 --- a/doc/design/compute_special_remote_interface.mdwn +++ b/doc/design/compute_special_remote_interface.mdwn @@ -23,14 +23,17 @@ 
that is in the form "foo=bar" will also result in an environment variable being set, eg `ANNEX_COMPUTE_passes=10` or `ANNEX_COMPUTE_--level=9`. For security, the program should avoid exposing user input to the shell -unprotected, or otherwise executing it. +unprotected, or otherwise executing it. And when running a command, make +sure that whatever user input is passed to it can result in only safe and +expected behavior. The program is run in a temporary directory, which will be cleaned up after -it exits. +it exits. Note that it may be run in a subdirectory of a temporary +directory. This is done when `git-annex addcomputed` was run in a subdirectory +of the git repository. -The content of any annexed file in the repository can be an input -to the computation. The program requests an input by writing a line to -stdout: +The content of any file in the repository can be an input to the +computation. The program requests an input by writing a line to stdout: INPUT file.raw @@ -38,8 +41,8 @@ Then it can read a line from stdin, which will be the path to the content (eg a `.git/annex/objects/` path). If the program needs multiple input files, it should output multiple -`INPUT` lines at once, and then read multiple paths from stdin. This -allows retrival of the inputs to potentially run in parallel. +`INPUT` lines first, and then read multiple paths from stdin. This +allows retrieval of the inputs to potentially run in parallel. If an input file is not available, the program's stdin will be closed without a path being written to it. So when reading from stdin fails, @@ -90,16 +93,17 @@ An example `git-annex-compute-foo` shell script follows: #!/bin/sh set -e if [ "$1" != "convert" ]; then - echo "Usage: convert input output [passes=n]" >&2 - exit 1 + echo "Usage: convert input output [passes=n]" >&2 + exit 1 fi - if [ -z "$ANNEX_COMPUTE_passes" ]; - ANNEX_COMPUTE_passes=1 + if [ -z "$ANNEX_COMPUTE_passes" ]; then + ANNEX_COMPUTE_passes=1 fi - echo "INPUT "$2" + echo "INPUT $2" read input echo "OUTPUT $3" echo REPRODUCIBLE if [ -n "$input" ]; then - frobnicate --passes="$ANNEX_COMPUTE_passes" <"$input" >"$3" + mkdir -p "$(dirname "$3")" + frobnicate --passes="$ANNEX_COMPUTE_passes" <"$input" >"$3" fi diff --git a/doc/git-annex-addcomputed.mdwn b/doc/git-annex-addcomputed.mdwn new file mode 100644 index 0000000000..faff1d96b6 --- /dev/null +++ b/doc/git-annex-addcomputed.mdwn @@ -0,0 +1,103 @@ +# NAME + +git-annex addcomputed - adds computed files to the repository + +# SYNOPSIS + +git annex addcomputed `--to=remote -- ...` + +# DESCRIPTION + +Adds files to the annex that are computed from input files in the +repository, using a compute special remote. + +Once a file has been added to a compute remote, commands +like `git-annex get` will use it to compute the content of the file. + +The syntax of this command after the `--` is up to the program that +the compute special remote is set up to run to perform the comuptation. + +To see the program's usage, you can run: + + git-annex addcomputed --to=foo + +Generally you will provide an input file (or files), and often also an +output filename, and additional parameters to control the computation. + +There can be more than one input file that are combined to compute an +output file. And multiple output files can be computed at the same time. 
+
+Some examples of how this might look:
+
+	git-annex addcomputed --to=x -- convert file.raw file.jpeg passes=10
+	git-annex addcomputed --to=y -- compress foo --level=9
+	git-annex addcomputed --to=z -- clip foo 2:01-3:00 combine with bar to baz
+
+Note that parameters that were passed to `git-annex initremote`
+when setting up the compute special remote will be appended to the end of
+the parameters provided to `git-annex addcomputed`.
+
+# OPTIONS
+
+* `--to=remote`
+
+  Specify which remote will compute the files.
+
+  This must be a compute remote. For example, one can be
+  initialized with:
+
+	git-annex initremote photoconv type=compute \
+		program=git-annex-compute-photoconv
+
+  For details about compute remotes, and a list of some
+  of the programs that are available, see
+
+* `--fast`
+
+  Adds computed files to the repository, without doing any work yet to
+  compute their content.
+
+  This implies `--unreproducible`, because even if the compute remote
+  produces reproducible output, it's not available.
+
+* `--unreproducible`, `-u`
+
+  Indicate that the computation is not expected to be fully reproducible.
+  It can vary, in ways that produce files that are equivalent enough to
+  be interchangeable, but are not necessarily identical.
+
+  This is the default unless the compute remote indicates that it produces
+  reproducible output.
+
+* `--reproducible`, `-r`
+
+  Indicate that the computation is expected to be fully reproducible.
+
+  This is the default when the compute remote indicates that it produces
+  reproducible output (except when using `--fast`).
+
+  If a computation turns out not to be fully reproducible, then getting
+  a computed file from the compute remote will later fail with a
+  checksum verification error. One thing that can be done then is to use
+  `git-annex recompute --original --unreproducible`.
+
+* `--backend`
+
+  Specifies which key-value backend to use.
+
+* Also the [[git-annex-common-options]](1) can be used.
+
+# SEE ALSO
+
+[[git-annex]](1)
+
+[[git-annex-recompute]](1)
+
+[[git-annex-initremote]](1)
+
+# AUTHOR
+
+Joey Hess
+
+Warning: Automatically converted into a man page by mdwn2man. Edit with care.
diff --git a/doc/git-annex-initremote.mdwn b/doc/git-annex-initremote.mdwn
index 0e4514e823..bcb3494b7f 100644
--- a/doc/git-annex-initremote.mdwn
+++ b/doc/git-annex-initremote.mdwn
@@ -52,7 +52,7 @@ want to use `git annex renameremote`.
 	git annex initremote mys3 type=S3 --whatelse
-  For a machine-readable list of the parameters, use this with --json.
+  For a machine-readable list of the parameters, use this with `--json`.
 * `--fast`
diff --git a/doc/git-annex-recompute.mdwn b/doc/git-annex-recompute.mdwn
new file mode 100644
index 0000000000..498c85e26c
--- /dev/null
+++ b/doc/git-annex-recompute.mdwn
@@ -0,0 +1,69 @@
+# NAME
+
+git-annex recompute - recompute computed files
+
+# SYNOPSIS
+
+git annex recompute `[path ...]`
+
+# DESCRIPTION
+
+This updates computed files that were added with
+[[git-annex-addcomputed]](1).
+
+By default, this only recomputes files whose input files have changed.
+The new contents of the input files are used to re-run the computation.
+
+When the output of the computation is different, the computed file is
+updated with the new content. The updated file is written to the worktree,
+but is not staged, in order to avoid overwriting any staged changes.
+
+# OPTIONS
+
+* `--original`
+
+  Re-run the computation with the original input files.
+
+* `--remote=name`
+
+  Only recompute files that were computed by this compute remote.
+
+  When this option is not used, all computed files are recomputed using
+  whatever compute remote was originally used to add them. (In cases where
+  a file can be computed by multiple remotes, the one with the lowest
+  configured cost is used.)
+
+* `--unreproducible`, `-u`
+
+  Indicate that the computation is not expected to be fully reproducible.
+  It can vary, in ways that produce files that are equivalent enough to
+  be interchangeable, but are not necessarily identical.
+
+  This is the default unless the compute remote indicates that it produces
+  reproducible output.
+
+* `--reproducible`, `-r`
+
+  Indicate that the computation is expected to be fully reproducible.
+
+  This is the default when the compute remote indicates that it produces
+  reproducible output.
+
+* matching options
+
+  The [[git-annex-matching-options]](1) can be used to control what
+  files to recompute.
+
+* Also the [[git-annex-common-options]](1) can be used.
+
+# SEE ALSO
+
+[[git-annex]](1)
+
+[[git-annex-addcomputed]](1)
+
+# AUTHOR
+
+Joey Hess
+
+Warning: Automatically converted into a man page by mdwn2man. Edit with care.
diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn
index 620aee61cd..5a39aa3bfa 100644
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@@ -186,6 +186,18 @@ content from the key-value store.
   See [[git-annex-undo]](1) for details.
+* `addcomputed`
+
+  Adds computed files to the repository.
+
+  See [[git-annex-addcomputed]](1) for details.
+
+* `recompute`
+
+  Recomputes computed files.
+
+  See [[git-annex-recompute]](1) for details.
+
 * `multicast`
   Multicast file distribution.
@@ -1945,6 +1957,11 @@ Remotes are configured using these settings in `.git/config`.
   the location of the borg repository to use. Normally this is
   automatically set up by `git annex initremote`, but you can change it
   if needed.
+* `remote.<name>.annex-compute`
+
+  Used to identify compute special remotes.
+  Normally this is automatically set up by `git annex initremote`.
+
 * `remote.<name>.annex-ddarrepo`
   Used by ddar special remotes, this configures
@@ -2184,6 +2201,13 @@ Remotes are configured using these settings in `.git/config`.
   Per-remote configuration of annex.security.allow-unverified-downloads.
+* `annex.security.allowed-compute-programs`
+
+  This is a space separated list of compute programs eg
+  "git-annex-compute-foo git-annex-compute-bar". Listing a compute
+  program here allows compute special remotes that use that program to be
+  enabled by `git-annex enableremote` or autoenabled.
+
 # CONFIGURATION OF ASSISTANT
 * `annex.delayadd`
diff --git a/doc/special_remotes.mdwn b/doc/special_remotes.mdwn
index 04f2feb9c6..0c4ff0131f 100644
--- a/doc/special_remotes.mdwn
+++ b/doc/special_remotes.mdwn
@@ -11,6 +11,7 @@ the content of files.
 * [[Amazon_Glacier|glacier]]
 * [[bittorrent]]
 * [[bup]]
+* [[compute]]
 * [[ddar]]
 * [[directory]]
 * [[gcrypt]] (encrypted git repositories!)
diff --git a/doc/special_remotes/compute.mdwn b/doc/special_remotes/compute.mdwn
new file mode 100644
index 0000000000..264cec825a
--- /dev/null
+++ b/doc/special_remotes/compute.mdwn
@@ -0,0 +1,37 @@
+While other remotes store the contents of annexed files somewhere,
+this special remote uses a program to compute the contents of annexed
+files.
+
+To add a file to a compute special remote, use the [[git-annex-addcomputed]]
+command. Once a file has been added to a compute special remote, commands
+like `git-annex get` will use it to compute the content of the file.
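+
+For example, with a file that was added using `git-annex addcomputed`
+(the file name below is only illustrative):
+
+	git-annex drop converted.jpeg
+	git-annex get converted.jpeg
+
+Dropping the content works as for any other annexed file, and getting it
+back re-runs the compute program to regenerate the content.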
+ +To enable an instance of this special remote: + + # git-annex initremote myremote type=compute program=git-annex-compute-foo + +The `program` parameter is the only required parameter. It is the name of the +program to use to compute the contents of annexed files. It must start with +"git-annex-compute-". The program needs to be installed somewhere in the +`PATH`. + +Any program can be passed to `git-annex initremote`. However, when enabling +a compute special remote later with `git-annex enableremote` or due to +"autoenable=true", the program must be listed in the git config +`annex.security.allowed-compute-programs`. + +All other "field=value" parameters passed to `initremote` will be passed +to the program when running [[git-annex-addcomputed]]. Note that when the +program takes a dashed option, it can be provided after "--": + + # git-annex initremote myremote type=compute program=git-annex-compute-foo -- --level=9 + +## compute programs + +To write programs used by the compute special remote, see the +[[design/compute_special_remote_interface]]. + +Have you written a generally useful (and secure) compute program? +List it here! + +* ... diff --git a/doc/todo/compute_special_remote/comment_21_2546562f7a00e082cd0500debc904cf3._comment b/doc/todo/compute_special_remote/comment_21_2546562f7a00e082cd0500debc904cf3._comment new file mode 100644 index 0000000000..1416d77bde --- /dev/null +++ b/doc/todo/compute_special_remote/comment_21_2546562f7a00e082cd0500debc904cf3._comment @@ -0,0 +1,22 @@ +[[!comment format=mdwn + username="joey" + subject="""Re: DataLad exploration of the compute on demand space""" + date="2025-03-06T17:39:04Z" + content=""" +Thanks for explaining the design points of datalad-remake. Some +different design choices than I have made, but mostly they strike me as +implementing what is easier/possible from outside git-annex. + +Eg, storing the compute inputs under `.datalad` in the branch is fine -- +and might even be useful if you want to make a branch that changes +something in there -- but of course in the git-annex implementation it +stores the equvilant thing in the git-annex branch. + +I do hope I'm not closing off the design space from such differences +by dropping a compute special remote right into git-annex. But I also +expect that having a standard and easy way for at least simple +computations will lead to a lot of contributions as others use it. + +Your fMRI case seems like one that my compute remote could handle well +and easily. +"""]] diff --git a/doc/todo/compute_special_remote/comment_22_d1561153a3916411ed8caa92fa53893c._comment b/doc/todo/compute_special_remote/comment_22_d1561153a3916411ed8caa92fa53893c._comment new file mode 100644 index 0000000000..bfacbdf57d --- /dev/null +++ b/doc/todo/compute_special_remote/comment_22_d1561153a3916411ed8caa92fa53893c._comment @@ -0,0 +1,69 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 22""" + date="2025-03-06T17:54:50Z" + content=""" +I've merged the compute special remote now. +See [[special_remotes/compute]], [[git-annex-addcomputed]] +and [[git-annex-recompute]]. + +I have opened [[todo/compute_special_remote_remaining_todos]] with +some various ways that I want to improve it further. Including, notably, +computing on inputs from submodules, which is not currently supported at +all. 
+ +---- + +Here I'll go down mih's original and quite useful design criteria and see +how the compute special remote applies to them: + +### Generate annex keys (that have never existed) + +`git-annex addcomputed --fast` + +### Re-generate annex keys + +`git-annex addcomputed` optionally with the --reproducible option, +followed by a later `git-annex get` + +Another thing that fits under this heading is when one of the original +input files has gotten modified, and you want to compute a new version of +the output file from it, using the same method as was used to compute it +before. That's `git-annex recompute $output_file` + +### Worktree provisioning? + +This is the main thing I didn't implement. Given that git-annex is working +with large files and needs to support various filesystems and OS's that +lack hardlinks and softlinks, it's hard to do this inexpensively. + +Also, it turned out to make sense for the compute program to request +the input files it needs, since this lets git-annex learn what the input +files are, so it can make them available when regenerating a computed file +later. And so the protocol just has git-annex respond with the path to +the content of the file. + +### Request one key, receive many + +This is supported. (So is using multiple inputs to produce one (or more) +outputs.) + +### Instruction deposition + +`git-annex addcomputed` + +### Storage redundancy tests + +It did make sense to have it automatically `git-annex get` the inputs. +Well, I think it makes sense in most cases, this may become a tunable +setting of the compute special remote. + +### Trust + +Handled by requiring the user install a `git-annex-compute-foo` command +in PATH, and provide the name of the command to `initremote`. + +And for later `enableremote` or `autoenable=true`, it will only +allow programs that are listed in the annex.security.allowed-compute-programs +git config. +"""]] diff --git a/doc/todo/compute_special_remote_remaining_todos.mdwn b/doc/todo/compute_special_remote_remaining_todos.mdwn new file mode 100644 index 0000000000..bb522398a4 --- /dev/null +++ b/doc/todo/compute_special_remote_remaining_todos.mdwn @@ -0,0 +1,69 @@ +This is the remainder of my todo list while I was building the +compute special remote. --[[Joey]] + +* write a tip showing how to use this + +* Write some simple compute programs so we have something to start with. + + - convert between images eg jpeg to png + - run a command in a singularity container (that is one of the inputs) + - run a wasm binary (that is one of the inputs) + +* compute on input files in submodules + +* annex.diskreserve can be violated if getting a file computes it but also + some other output files, which get added to the annex. + +* would be nice to have a way to see what computations are used by a + compute remote for a file. Put it in `whereis` output? But it's not an + url. Maybe a separate command? That would also allow querying for eg, + what files are inputs for another file. Or it could be exposed in the + Remote interface, and made into a file matching option. + +* "getting input from " message uses the original filename, + but that file might have been renamed. Would be more clear to use + whatever file in the tree currently points to the key it's getting + (what if there is not one?) + +* allow git-annex enableremote with program= explicitly specified, + without checking annex.security.allowed-compute-programs + +* addcomputed should honor annex.addunlocked. + + What about recompute? 
It seems it should either write the new version of + the file as an unlocked file when the old version was unlocked, or also + honor annex.addunlocked. + + Problem: Since recompute does not stage the file, it would have to write + the content to the working tree. And then the user would need to + git-annex add. But then, if the key was a VURL key, it would add it with + the default backend instead, and the file would no longer use a computed + key. + + So it, seems that, for this to be done, recompute would need to stage the + pointer file. + +* recompute could ingest keys for other files than the one being + recomputed, and remember them. Then recomputing those files could just + use those keys, without re-running a computation. (Better than --others + which got removed.) + +* `git-annex recompute foo bar baz`, when foo depends on bar which depends + on baz, and when baz has changed, will not recompute foo, because bar has + not changed. It then recomputes bar. So running the command again is + needed to recompute foo. + + What it could do is, after it recomputes bar, notice that it already + considered foo, and revisit foo, and recompute it then. It could either + use a bloom filter to remember the files it considered but did not + compute, or it could just notice that the command line includes foo + (or includes a directory that contains foo), and then foo is not + modified. + + Or it could build a DAG and traverse it, but building a DAG of a large + directory tree has its own problems. + +* Should addcomputed honor annex.smallfiles? That would seem to imply + that recompute should also support recomputing non-annexed files. + Otherwise, adding a file and then recomputing it would vary in + what the content of the file is, depending on annex.smallfiles setting. diff --git a/git-annex.cabal b/git-annex.cabal index fae2a3bbb8..2123b73663 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -562,6 +562,7 @@ Executable git-annex Annex.FileMatcher Annex.Fixup Annex.GitOverlay + Annex.GitShaKey Annex.HashObject Annex.Hook Annex.Import @@ -654,6 +655,7 @@ Executable git-annex CmdLine.Usage Command Command.Add + Command.AddComputed Command.AddUnused Command.AddUrl Command.Adjust @@ -727,6 +729,7 @@ Executable git-annex Command.Proxy Command.Pull Command.Push + Command.Recompute Command.ReKey Command.ReadPresentKey Command.RecvKey @@ -930,6 +933,7 @@ Executable git-annex Remote.BitTorrent Remote.Borg Remote.Bup + Remote.Compute Remote.Ddar Remote.Directory Remote.Directory.LegacyChunked
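
The todo list above mentions writing some simple compute programs, such as
a jpeg to png converter. As a rough sketch only (the program name
`git-annex-compute-jpeg2png` and the use of ImageMagick's `convert` are
assumptions for illustration, not part of this patch), such a program could
follow the stdio interface described in
`doc/design/compute_special_remote_interface.mdwn`:

	#!/bin/sh
	# Illustrative sketch of a compute program; not shipped with git-annex.
	set -e
	if [ -z "$1" ] || [ -z "$2" ]; then
		echo "Usage: input.jpeg output.png" >&2
		exit 1
	fi
	# Request the input file; git-annex replies on stdin with the path to
	# its content, or with an empty line when it is not available.
	echo "INPUT $1"
	read -r input || input=""
	# Declare the output file that will be written.
	echo "OUTPUT $2"
	if [ -n "$input" ]; then
		mkdir -p "$(dirname "$2")"
		convert "$input" "$2"
	fi
	# REPRODUCIBLE is deliberately not emitted, since image converters do
	# not always produce byte-identical output.

Such a program would be used by initializing a remote with
`git annex initremote imgconv type=compute program=git-annex-compute-jpeg2png`
and then running `git annex addcomputed --to=imgconv -- photo.jpeg photo.png`.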