diff --git a/Remote/Compute.hs b/Remote/Compute.hs index 3fd52af5c8..18ebe950f7 100644 --- a/Remote/Compute.hs +++ b/Remote/Compute.hs @@ -10,6 +10,7 @@ module Remote.Compute (remote) where import Annex.Common import Types.Remote import Types.ProposedAccepted +import Types.MetaData import Types.Creds import Config import Config.Cost @@ -17,24 +18,30 @@ import Remote.Helper.Special import Remote.Helper.ExportImport import Annex.SpecialRemote.Config import Annex.UUID -import Logs.RemoteState +import Logs.MetaData import Utility.Metered +import Utility.Hash +import Utility.TimeStamp +import Git.FilePath import qualified Git import qualified Utility.SimpleProtocol as Proto import Control.Concurrent.STM +import Data.Time.Clock +import Data.Either +import Data.Char +import Data.Ord import qualified Data.Map as M import qualified Data.Set as S +import qualified Data.Text as T +import qualified Data.Text.Encoding as T remote :: RemoteType remote = RemoteType { typename = "compute" , enumerate = const $ findSpecialRemotes "compute" , generate = gen - , configParser = mkRemoteConfigParser - [ optionalStringParser programField - (FieldDesc $ "compute program (must start with \"" ++ safetyPrefix ++ "\")") - ] + , configParser = computeConfigParser , setup = setupInstance , exportSupported = exportUnsupported , importSupported = importUnsupported @@ -55,13 +62,13 @@ gen r u rc gc rs = case getComputeProgram rc of , cost = cst , name = Git.repoDescribe r , storeKey = storeKeyUnsupported - , retrieveKeyFile = computeKey program interface + , retrieveKeyFile = computeKey rs program interface , retrieveKeyFileInOrder = pure True , retrieveKeyFileCheap = Nothing , retrievalSecurityPolicy = RetrievalAllKeysSecure , removeKey = dropKey rs , lockContent = Nothing - , checkPresent = checkKey program interface + , checkPresent = checkKey rs program interface , checkPresentCheap = False , exportActions = exportUnsupported , importActions = importUnsupported @@ -93,6 +100,33 @@ setupInstance _ mu _ c _ = do gitConfigSpecialRemote u c [("compute", "true")] return (c, u) +-- The RemoteConfig is allowed to contain fields from the program's +-- interface. That provides defaults for git-annex addcomputed. +computeConfigParser :: RemoteConfig -> Annex RemoteConfigParser +computeConfigParser rc = do + Interface interface <- case getComputeProgram rc of + Left _ -> pure $ Interface [] + Right program -> liftIO (getInterfaceUncached program) >>= return . \case + Left _ -> Interface [] + Right interface -> interface + let m = M.fromList $ mapMaybe collectfields interface + let ininterface f = case toField (fromProposedAccepted f) of + Just f' -> M.member f' m + Nothing -> False + return $ RemoteConfigParser + { remoteConfigFieldParsers = + [ optionalStringParser programField + (FieldDesc $ "compute program (must start with \"" ++ safetyPrefix ++ "\")") + ] + , remoteConfigRestPassthrough = Just (ininterface, M.toList $ M.mapKeys fromField m) + } + where + collectfields (InterfaceInput f d) = Just (f, FieldDesc d) + collectfields (InterfaceOptionalInput f d) = Just (f, FieldDesc d) + collectfields (InterfaceValue f d) = Just (f, FieldDesc d) + collectfields (InterfaceOptionalValue f d) = Just (f, FieldDesc d) + collectfields _ = Nothing + newtype ComputeProgram = ComputeProgram String deriving (Show) @@ -114,16 +148,17 @@ safetyPrefix = "git-annex-compute-" programField :: RemoteConfigField programField = Accepted "program" -type Name = String type Description = String -type Id = String + +newtype Field = Field MetaField + deriving (Show, Eq, Ord) data InterfaceItem - = InterfaceInput Id Description - | InterfaceOptionalInput Id Description - | InterfaceValue Name Description - | InterfaceOptionalValue Name Description - | InterfaceOutput Id Description + = InterfaceInput Field Description + | InterfaceOptionalInput Field Description + | InterfaceValue Field Description + | InterfaceOptionalValue Field Description + | InterfaceOutput Field Description | InterfaceReproducible deriving (Show, Eq) @@ -141,10 +176,22 @@ instance Proto.Receivable InterfaceItem where parseCommand "OUTPUT" = Proto.parse2 InterfaceOutput parseCommand "REPRODUCIBLE" = Proto.parse0 InterfaceReproducible +instance Proto.Serializable Field where + serialize = fromField + deserialize = toField + +-- While MetaField is case insensitive, environment variable names are not, +-- so make Field always lower cased. +toField :: String -> Maybe Field +toField f = Field <$> toMetaField (T.pack (map toLower f)) + +fromField :: Field -> String +fromField (Field f) = T.unpack (fromMetaField f) + getInterface :: ComputeProgram -> TMVar (Maybe Interface) -> IO (Either String Interface) getInterface program iv = atomically (takeTMVar iv) >>= \case - Nothing -> getInterface' program >>= \case + Nothing -> getInterfaceUncached program >>= \case Left err -> do atomically $ putTMVar iv Nothing return (Left err) @@ -155,8 +202,8 @@ getInterface program iv = atomically $ putTMVar iv (Just interface) return (Right interface) -getInterface' :: ComputeProgram -> IO (Either String Interface) -getInterface' (ComputeProgram program) = +getInterfaceUncached :: ComputeProgram -> IO (Either String Interface) +getInterfaceUncached (ComputeProgram program) = catchMaybeIO (readProcess program ["interface"]) >>= \case Nothing -> return $ Left $ "Failed to run " ++ program Just output -> return $ case parseInterface output of @@ -179,44 +226,168 @@ data ComputeInput = ComputeInput Key FilePath deriving (Show, Eq) data ComputeValue = ComputeValue String + deriving (Show, Eq) + +data ComputeOutput = ComputeOutput Key + deriving (Show, Eq) data ComputeState = ComputeState - { computeInputs :: M.Map Id ComputInput - , computeValues :: M.Map Id ComputeValue + { computeInputs :: M.Map Field ComputeInput + , computeValues :: M.Map Field ComputeValue + , computeOutputs :: M.Map Field ComputeOutput + , computeTimeEstimate :: NominalDiffTime } deriving (Show, Eq) --- The state is URI encoded. +-- Generates a hash of a ComputeState. -- --- A ComputeValue with Id "foo" is represented as "vfoo=value" --- A ComputeInput with Id "foo" is represented as "kfoo=key&pfoo=path" -formatComputeState :: ComputeState -> String -formatComputeState st = - map formatinput (computeInputes st) - ++ concatMap formatvalue (computeValues st) +-- This is used as a short unique identifier in the metadata fields, +-- since more than one ComputeState may be stored in the compute remote's +-- metadata for a given Key. +-- +-- A md5 is fine for this. It does not need to protect against intentional +-- collisions. And 2^64 is a sufficiently small chance of accidental +-- collision. +hashComputeState :: ComputeState -> String +hashComputeState state = show $ md5s $ + mconcat (map (go goi) (M.toAscList (computeInputs state))) + <> + mconcat (map (go gov) (M.toAscList (computeValues state))) + <> + mconcat (map (go goo) (M.toAscList (computeOutputs state))) + <> + encodeBS (show (computeTimeEstimate state)) + where + go c (Field f, v) = T.encodeUtf8 (fromMetaField f) <> c v + goi (ComputeInput k f) = serializeKey' k <> encodeBS f + gov (ComputeValue s) = encodeBS s + goo (ComputeOutput k) = serializeKey' k -parseComputeState :: String -> ComputeState -parseComputeState = +computeStateMetaData :: ComputeState -> MetaData +computeStateMetaData = undefined --- TODO -computeKey :: ComputeProgram -> TMVar (Maybe Interface) -> Key -> AssociatedFile -> OsPath -> MeterUpdate -> VerifyConfig -> Annex Verification -computeKey program iv key _af dest p vc = +-- FIXME: Need to unswizzle the mixed up metadata based on hash prefixes. +metaDataComputeStates :: MetaData -> [ComputeState] +metaDataComputeStates (MetaData m) = + go (ComputeState mempty mempty mempty 0) (M.toList m) + where + go c ((f,v):rest) = + let c' = case T.unpack (fromMetaField f) of + ('i':'n':'p':'u':'t':'-':f') -> case M.lookup m =<< toMetaField (T.pack ("key-" ++ f')) of + Nothing -> c + Just kv -> case deserializeKey' (fromMetaValue kv) of + Just k -> c + { computeInputs = + M.insert (toField f) + (ComputeInput k (decodeBS (fromMetaValue v))) + (computeOutputs c) + } + Nothing -> c + ('v':'a':'l':'u':'e':'-':f') -> c + { computeValues = + M.insert (toField f) + (ComputeValue (decodeBS (fromMetaValue v))) + (computeValues c) + } + ('o':'u':'t':'p':'u':'t':'-':f') -> + case deserializeKey' (fromMetaValue v) of + Just k -> c + { computeOutputs = + M.insert (toField f) + (ComputeOutput k) + (computeOutputs c) + } + Nothing -> c + ('t':'i':'m':'e':'-':f') -> + case parsePOSIXTime (fromMetaValue v) of + Just t -> c { computeTimeEstimate = t } + Nothing -> c + _ -> c + in go c' rest + +getComputeStates :: RemoteStateHandle -> Key -> Annex [ComputeState] +getComputeStates rs k = do + RemoteMetaData _ m <- getCurrentRemoteMetaData rs k + return (metaDataComputeStates m) + +setComputeState :: RemoteStateHandle -> Key -> ComputeState -> Annex () +setComputeState rs k st = addRemoteMetaData k rs (computeStateMetaData st) + +{- Finds the first compute state that provides everything required by the + - interface, and returns a list of what should be provided to the program + - in its environment. + -} +interfaceEnv :: [ComputeState] -> Interface -> Either String [(String, Either Key String)] +interfaceEnv states interface = go Nothing states + where + go (Just firsterr) [] = Left firsterr + go Nothing [] = interfaceEnv' (ComputeState mempty mempty mempty 0) interface + go firsterr (state:rest) = case interfaceEnv' state interface of + Right v -> Right v + Left e + | null rest -> Left (fromMaybe e firsterr) + | otherwise -> go (firsterr <|> Just e) rest + +interfaceEnv' :: ComputeState -> Interface -> Either String [(String, Either Key String)] +interfaceEnv' state (Interface interface) = + case partitionEithers (mapMaybe go interface) of + ([], env) -> Right $ + map (\(f, v) -> (fromField f, v)) env + (problems, _) -> Left $ unlines problems + where + go (InterfaceInput name desc) = + case M.lookup name (computeInputs state) of + Just (ComputeInput key _file) -> Just $ + Right (name, Left key) + Nothing -> Just $ + Left $ "Missing required input \"" ++ fromField name ++ "\" -- " ++ desc + go (InterfaceOptionalInput name desc) = + case M.lookup name (computeInputs state) of + Just (ComputeInput key _file) -> Just $ + Right (name, Left key) + Nothing -> Nothing + go (InterfaceValue name desc) = + case M.lookup name (computeValues state) of + Just (ComputeValue v) -> Just $ + Right (name, Right v) + nothing -> Just $ + Left $ "Missing required value \"" ++ fromField name ++ "\" -- " ++ desc + go (InterfaceOptionalValue name desc) = + case M.lookup name (computeValues state) of + Just (ComputeValue v) -> Just $ + Right (name, Right v) + Nothing -> Nothing + go (InterfaceOutput _ _) = Nothing + go InterfaceReproducible = Nothing + +computeKey :: RemoteStateHandle -> ComputeProgram -> TMVar (Maybe Interface) -> Key -> AssociatedFile -> OsPath -> MeterUpdate -> VerifyConfig -> Annex Verification +computeKey rs program iv k _af dest p vc = liftIO (getInterface program iv) >>= \case Left err -> giveup err - Right interface -> undefined + Right interface -> do + states <- sortBy (comparing computeTimeEstimate) + <$> getComputeStates rs k + case interfaceEnv states interface of + Left err -> giveup err + Right ienv -> undefined -- TODO --- TODO Make sure that the remote state meets the program's current --- interface. -checkKey :: ComputeProgram -> TMVar (Maybe Interface) -> Key -> Annex Bool -checkKey program iv _ = +-- Make sure that the compute state has everything needed by +-- the program's current interface. +checkKey :: RemoteStateHandle -> ComputeProgram -> TMVar (Maybe Interface) -> Key -> Annex Bool +checkKey rs program iv k = do + states <- getComputeStates rs k liftIO (getInterface program iv) >>= \case Left err -> giveup err - Right interface -> undefined + Right interface -> + case interfaceEnv states interface of + Right _ -> return True + Left _ -> return False --- Removing remote state will prevent computing the key. +-- Unsetting the compute state will prevent computing the key. dropKey :: RemoteStateHandle -> Maybe SafeDropProof -> Key -> Annex () -dropKey rs _ k = setRemoteState rs k mempty +dropKey rs _ k = do + RemoteMetaData _ old <- getCurrentRemoteMetaData rs k + addRemoteMetaData k rs (modMeta old DelAllMeta) storeKeyUnsupported :: Key -> AssociatedFile -> Maybe OsPath -> MeterUpdate -> Annex () storeKeyUnsupported _ _ _ _ = giveup "transfer to compute remote not supported; use git-annex addcomputed instead" - diff --git a/doc/design/compute_special_remote_interface.mdwn b/doc/design/compute_special_remote_interface.mdwn index 8b1a732e7a..707c65d742 100644 --- a/doc/design/compute_special_remote_interface.mdwn +++ b/doc/design/compute_special_remote_interface.mdwn @@ -79,14 +79,19 @@ outputs. The output is lines, in the form: - INPUT[?] Id Description - VALUE[?] Id Description + INPUT[?] Name Description + VALUE[?] Name Description OUTPUT Id Description Use "INPUT" when a file is an input to the computation, and "VALUE" for all other input values. Use "INPUT?" and "VALUE?" for optional inputs and values. +Note that the Name and Id both have to be legal git-annex metadata field +names. And should be lower cased. The user is allowed to use any case +for the names when providing inputs and values to `git-annex addcomputed` +though. + The interface can also optionally include a "REPRODUCIBLE" line. That indicates that the results of its computations are expected to be bit-for-bit reproducible. diff --git a/doc/todo/compute_special_remote/comment_18_021858e8032eca84488ec2324ec25a6f._comment b/doc/todo/compute_special_remote/comment_18_021858e8032eca84488ec2324ec25a6f._comment new file mode 100644 index 0000000000..4740ce806f --- /dev/null +++ b/doc/todo/compute_special_remote/comment_18_021858e8032eca84488ec2324ec25a6f._comment @@ -0,0 +1,13 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 18""" + date="2025-02-19T18:29:58Z" + content=""" +I've started a `compute` branch which so far has documentation for +the [compute special remote](http://source.git-annex.branchable.com/?p=source.git;a=blob;f=doc/special_remotes/compute.mdwn;hb=refs/heads/compute), +[git-annex addcomputed](http://source.git-annex.branchable.com/?p=source.git;a=blob;f=doc/git-annex-addcomputed.mdwn;hb=refs/heads/compute), +and +[git-annex recompute](http://source.git-annex.branchable.com/?p=source.git;a=blob;f=doc/git-annex-recompute.mdwn;hb=refs/heads/compute) + +I am pretty happy with how this design is shaping up. +"""]] diff --git a/doc/todo/compute_special_remote/comment_19_fcba8049e659d3238b9f83286777f71f._comment b/doc/todo/compute_special_remote/comment_19_fcba8049e659d3238b9f83286777f71f._comment new file mode 100644 index 0000000000..e4631c5a39 --- /dev/null +++ b/doc/todo/compute_special_remote/comment_19_fcba8049e659d3238b9f83286777f71f._comment @@ -0,0 +1,69 @@ +[[!comment format=mdwn + username="joey" + subject="""open questions""" + date="2025-02-19T18:39:41Z" + content=""" +One thing that I am unsure about is what should happen if `git-annex get foo` +needs the content of file `bar`, which is not present. Should it get `bar` from +a remote? Or should it fail to get `foo`? + +Consider that, in the case of `git-annex get foo --from computeremote`, the +user has asked it to get a file from that particular remote, not from +whatever remote contains `bar`. + +If the same compute remote can also compute `bar`, it seems quite reasonable +for `git-annex get foo --from computeremote` to also compute bar. (This is +similar to a single computation that generates two output files, in which +case getting one of them will get both of them.) + +And it seems reasonable for `git-annex get foo` with no specified remote +to also get or compute bar, from whereever. + +But, there is no way at the level of a special remote to tell the +difference between those two commands. + +Maybe the right answer is to define getting a file from a compute +special remote as including getting its inputs from other remotes. +Preferring getting them from the same compute special remote when possible, +and when not, using the lowest cost remote that works, same as `git-annx +get` does. + +Or this could be a configuration of the compute special remote. Maybe some +would want to always get source files, and others would want to never get +source files? + +---- + +A related problem is that, `foo` might be fairly small, but `bar` very +large. So getting a small object can require getting or generating other +large objects. Getting `bar` might fail because there is not enough space +to meet annex.diskreserve. Or the user might just be surprised that so much +disk space was eaten up. But dropping `bar` after computing `foo` also +doesn't seem like a good idea; the user might want to hang onto their copy +now that they have it, or perhaps move it to some faster remote. + +Maybe preferred content is the solution? After computing `foo` with `bar`, +keep the copy of `bar` if the local repository wants it, drop it otherwise. + +---- + +Progress display is also going to be complicated for this. There is no +way in the special remote interface to display the progress for `bar` +while getting `foo`. + +Probably the thing to do would be to add together the sizes of both files, +and display a combined progress meter. +It would be ok to not say when it's getting the input file. +This will need a way to set the size for a progress display to larger +than the size of the key. + +---- + +.... All 3 problems above go away if it doesn't automatically get input files +before computations and the computations instead just fail with an error +saying the input file is not present. + +But then consider the case where you just want every file in the repository. +`git-annex get .` failing to compute some files because their input files +happen to come after them in the directory listing is not good. +"""]]