This commit is contained in:
Joey Hess 2025-02-20 13:29:05 -04:00
commit e897229088
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
4 changed files with 300 additions and 42 deletions

View file

@ -10,6 +10,7 @@ module Remote.Compute (remote) where
import Annex.Common import Annex.Common
import Types.Remote import Types.Remote
import Types.ProposedAccepted import Types.ProposedAccepted
import Types.MetaData
import Types.Creds import Types.Creds
import Config import Config
import Config.Cost import Config.Cost
@ -17,24 +18,30 @@ import Remote.Helper.Special
import Remote.Helper.ExportImport import Remote.Helper.ExportImport
import Annex.SpecialRemote.Config import Annex.SpecialRemote.Config
import Annex.UUID import Annex.UUID
import Logs.RemoteState import Logs.MetaData
import Utility.Metered import Utility.Metered
import Utility.Hash
import Utility.TimeStamp
import Git.FilePath
import qualified Git import qualified Git
import qualified Utility.SimpleProtocol as Proto import qualified Utility.SimpleProtocol as Proto
import Control.Concurrent.STM import Control.Concurrent.STM
import Data.Time.Clock
import Data.Either
import Data.Char
import Data.Ord
import qualified Data.Map as M import qualified Data.Map as M
import qualified Data.Set as S import qualified Data.Set as S
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
remote :: RemoteType remote :: RemoteType
remote = RemoteType remote = RemoteType
{ typename = "compute" { typename = "compute"
, enumerate = const $ findSpecialRemotes "compute" , enumerate = const $ findSpecialRemotes "compute"
, generate = gen , generate = gen
, configParser = mkRemoteConfigParser , configParser = computeConfigParser
[ optionalStringParser programField
(FieldDesc $ "compute program (must start with \"" ++ safetyPrefix ++ "\")")
]
, setup = setupInstance , setup = setupInstance
, exportSupported = exportUnsupported , exportSupported = exportUnsupported
, importSupported = importUnsupported , importSupported = importUnsupported
@ -55,13 +62,13 @@ gen r u rc gc rs = case getComputeProgram rc of
, cost = cst , cost = cst
, name = Git.repoDescribe r , name = Git.repoDescribe r
, storeKey = storeKeyUnsupported , storeKey = storeKeyUnsupported
, retrieveKeyFile = computeKey program interface , retrieveKeyFile = computeKey rs program interface
, retrieveKeyFileInOrder = pure True , retrieveKeyFileInOrder = pure True
, retrieveKeyFileCheap = Nothing , retrieveKeyFileCheap = Nothing
, retrievalSecurityPolicy = RetrievalAllKeysSecure , retrievalSecurityPolicy = RetrievalAllKeysSecure
, removeKey = dropKey rs , removeKey = dropKey rs
, lockContent = Nothing , lockContent = Nothing
, checkPresent = checkKey program interface , checkPresent = checkKey rs program interface
, checkPresentCheap = False , checkPresentCheap = False
, exportActions = exportUnsupported , exportActions = exportUnsupported
, importActions = importUnsupported , importActions = importUnsupported
@ -93,6 +100,33 @@ setupInstance _ mu _ c _ = do
gitConfigSpecialRemote u c [("compute", "true")] gitConfigSpecialRemote u c [("compute", "true")]
return (c, u) return (c, u)
-- The RemoteConfig is allowed to contain fields from the program's
-- interface. That provides defaults for git-annex addcomputed.
computeConfigParser :: RemoteConfig -> Annex RemoteConfigParser
computeConfigParser rc = do
Interface interface <- case getComputeProgram rc of
Left _ -> pure $ Interface []
Right program -> liftIO (getInterfaceUncached program) >>= return . \case
Left _ -> Interface []
Right interface -> interface
let m = M.fromList $ mapMaybe collectfields interface
let ininterface f = case toField (fromProposedAccepted f) of
Just f' -> M.member f' m
Nothing -> False
return $ RemoteConfigParser
{ remoteConfigFieldParsers =
[ optionalStringParser programField
(FieldDesc $ "compute program (must start with \"" ++ safetyPrefix ++ "\")")
]
, remoteConfigRestPassthrough = Just (ininterface, M.toList $ M.mapKeys fromField m)
}
where
collectfields (InterfaceInput f d) = Just (f, FieldDesc d)
collectfields (InterfaceOptionalInput f d) = Just (f, FieldDesc d)
collectfields (InterfaceValue f d) = Just (f, FieldDesc d)
collectfields (InterfaceOptionalValue f d) = Just (f, FieldDesc d)
collectfields _ = Nothing
newtype ComputeProgram = ComputeProgram String newtype ComputeProgram = ComputeProgram String
deriving (Show) deriving (Show)
@ -114,16 +148,17 @@ safetyPrefix = "git-annex-compute-"
programField :: RemoteConfigField programField :: RemoteConfigField
programField = Accepted "program" programField = Accepted "program"
type Name = String
type Description = String type Description = String
type Id = String
newtype Field = Field MetaField
deriving (Show, Eq, Ord)
data InterfaceItem data InterfaceItem
= InterfaceInput Id Description = InterfaceInput Field Description
| InterfaceOptionalInput Id Description | InterfaceOptionalInput Field Description
| InterfaceValue Name Description | InterfaceValue Field Description
| InterfaceOptionalValue Name Description | InterfaceOptionalValue Field Description
| InterfaceOutput Id Description | InterfaceOutput Field Description
| InterfaceReproducible | InterfaceReproducible
deriving (Show, Eq) deriving (Show, Eq)
@ -141,10 +176,22 @@ instance Proto.Receivable InterfaceItem where
parseCommand "OUTPUT" = Proto.parse2 InterfaceOutput parseCommand "OUTPUT" = Proto.parse2 InterfaceOutput
parseCommand "REPRODUCIBLE" = Proto.parse0 InterfaceReproducible parseCommand "REPRODUCIBLE" = Proto.parse0 InterfaceReproducible
instance Proto.Serializable Field where
serialize = fromField
deserialize = toField
-- While MetaField is case insensitive, environment variable names are not,
-- so make Field always lower cased.
toField :: String -> Maybe Field
toField f = Field <$> toMetaField (T.pack (map toLower f))
fromField :: Field -> String
fromField (Field f) = T.unpack (fromMetaField f)
getInterface :: ComputeProgram -> TMVar (Maybe Interface) -> IO (Either String Interface) getInterface :: ComputeProgram -> TMVar (Maybe Interface) -> IO (Either String Interface)
getInterface program iv = getInterface program iv =
atomically (takeTMVar iv) >>= \case atomically (takeTMVar iv) >>= \case
Nothing -> getInterface' program >>= \case Nothing -> getInterfaceUncached program >>= \case
Left err -> do Left err -> do
atomically $ putTMVar iv Nothing atomically $ putTMVar iv Nothing
return (Left err) return (Left err)
@ -155,8 +202,8 @@ getInterface program iv =
atomically $ putTMVar iv (Just interface) atomically $ putTMVar iv (Just interface)
return (Right interface) return (Right interface)
getInterface' :: ComputeProgram -> IO (Either String Interface) getInterfaceUncached :: ComputeProgram -> IO (Either String Interface)
getInterface' (ComputeProgram program) = getInterfaceUncached (ComputeProgram program) =
catchMaybeIO (readProcess program ["interface"]) >>= \case catchMaybeIO (readProcess program ["interface"]) >>= \case
Nothing -> return $ Left $ "Failed to run " ++ program Nothing -> return $ Left $ "Failed to run " ++ program
Just output -> return $ case parseInterface output of Just output -> return $ case parseInterface output of
@ -179,44 +226,168 @@ data ComputeInput = ComputeInput Key FilePath
deriving (Show, Eq) deriving (Show, Eq)
data ComputeValue = ComputeValue String data ComputeValue = ComputeValue String
deriving (Show, Eq)
data ComputeOutput = ComputeOutput Key
deriving (Show, Eq)
data ComputeState = ComputeState data ComputeState = ComputeState
{ computeInputs :: M.Map Id ComputInput { computeInputs :: M.Map Field ComputeInput
, computeValues :: M.Map Id ComputeValue , computeValues :: M.Map Field ComputeValue
, computeOutputs :: M.Map Field ComputeOutput
, computeTimeEstimate :: NominalDiffTime
} }
deriving (Show, Eq) deriving (Show, Eq)
-- The state is URI encoded. -- Generates a hash of a ComputeState.
-- --
-- A ComputeValue with Id "foo" is represented as "vfoo=value" -- This is used as a short unique identifier in the metadata fields,
-- A ComputeInput with Id "foo" is represented as "kfoo=key&pfoo=path" -- since more than one ComputeState may be stored in the compute remote's
formatComputeState :: ComputeState -> String -- metadata for a given Key.
formatComputeState st = --
map formatinput (computeInputes st) -- A md5 is fine for this. It does not need to protect against intentional
++ concatMap formatvalue (computeValues st) -- collisions. And 2^64 is a sufficiently small chance of accidental
-- collision.
hashComputeState :: ComputeState -> String
hashComputeState state = show $ md5s $
mconcat (map (go goi) (M.toAscList (computeInputs state)))
<>
mconcat (map (go gov) (M.toAscList (computeValues state)))
<>
mconcat (map (go goo) (M.toAscList (computeOutputs state)))
<>
encodeBS (show (computeTimeEstimate state))
where
go c (Field f, v) = T.encodeUtf8 (fromMetaField f) <> c v
goi (ComputeInput k f) = serializeKey' k <> encodeBS f
gov (ComputeValue s) = encodeBS s
goo (ComputeOutput k) = serializeKey' k
parseComputeState :: String -> ComputeState computeStateMetaData :: ComputeState -> MetaData
parseComputeState = computeStateMetaData = undefined
-- TODO -- FIXME: Need to unswizzle the mixed up metadata based on hash prefixes.
computeKey :: ComputeProgram -> TMVar (Maybe Interface) -> Key -> AssociatedFile -> OsPath -> MeterUpdate -> VerifyConfig -> Annex Verification metaDataComputeStates :: MetaData -> [ComputeState]
computeKey program iv key _af dest p vc = metaDataComputeStates (MetaData m) =
go (ComputeState mempty mempty mempty 0) (M.toList m)
where
go c ((f,v):rest) =
let c' = case T.unpack (fromMetaField f) of
('i':'n':'p':'u':'t':'-':f') -> case M.lookup m =<< toMetaField (T.pack ("key-" ++ f')) of
Nothing -> c
Just kv -> case deserializeKey' (fromMetaValue kv) of
Just k -> c
{ computeInputs =
M.insert (toField f)
(ComputeInput k (decodeBS (fromMetaValue v)))
(computeOutputs c)
}
Nothing -> c
('v':'a':'l':'u':'e':'-':f') -> c
{ computeValues =
M.insert (toField f)
(ComputeValue (decodeBS (fromMetaValue v)))
(computeValues c)
}
('o':'u':'t':'p':'u':'t':'-':f') ->
case deserializeKey' (fromMetaValue v) of
Just k -> c
{ computeOutputs =
M.insert (toField f)
(ComputeOutput k)
(computeOutputs c)
}
Nothing -> c
('t':'i':'m':'e':'-':f') ->
case parsePOSIXTime (fromMetaValue v) of
Just t -> c { computeTimeEstimate = t }
Nothing -> c
_ -> c
in go c' rest
getComputeStates :: RemoteStateHandle -> Key -> Annex [ComputeState]
getComputeStates rs k = do
RemoteMetaData _ m <- getCurrentRemoteMetaData rs k
return (metaDataComputeStates m)
setComputeState :: RemoteStateHandle -> Key -> ComputeState -> Annex ()
setComputeState rs k st = addRemoteMetaData k rs (computeStateMetaData st)
{- Finds the first compute state that provides everything required by the
- interface, and returns a list of what should be provided to the program
- in its environment.
-}
interfaceEnv :: [ComputeState] -> Interface -> Either String [(String, Either Key String)]
interfaceEnv states interface = go Nothing states
where
go (Just firsterr) [] = Left firsterr
go Nothing [] = interfaceEnv' (ComputeState mempty mempty mempty 0) interface
go firsterr (state:rest) = case interfaceEnv' state interface of
Right v -> Right v
Left e
| null rest -> Left (fromMaybe e firsterr)
| otherwise -> go (firsterr <|> Just e) rest
interfaceEnv' :: ComputeState -> Interface -> Either String [(String, Either Key String)]
interfaceEnv' state (Interface interface) =
case partitionEithers (mapMaybe go interface) of
([], env) -> Right $
map (\(f, v) -> (fromField f, v)) env
(problems, _) -> Left $ unlines problems
where
go (InterfaceInput name desc) =
case M.lookup name (computeInputs state) of
Just (ComputeInput key _file) -> Just $
Right (name, Left key)
Nothing -> Just $
Left $ "Missing required input \"" ++ fromField name ++ "\" -- " ++ desc
go (InterfaceOptionalInput name desc) =
case M.lookup name (computeInputs state) of
Just (ComputeInput key _file) -> Just $
Right (name, Left key)
Nothing -> Nothing
go (InterfaceValue name desc) =
case M.lookup name (computeValues state) of
Just (ComputeValue v) -> Just $
Right (name, Right v)
nothing -> Just $
Left $ "Missing required value \"" ++ fromField name ++ "\" -- " ++ desc
go (InterfaceOptionalValue name desc) =
case M.lookup name (computeValues state) of
Just (ComputeValue v) -> Just $
Right (name, Right v)
Nothing -> Nothing
go (InterfaceOutput _ _) = Nothing
go InterfaceReproducible = Nothing
computeKey :: RemoteStateHandle -> ComputeProgram -> TMVar (Maybe Interface) -> Key -> AssociatedFile -> OsPath -> MeterUpdate -> VerifyConfig -> Annex Verification
computeKey rs program iv k _af dest p vc =
liftIO (getInterface program iv) >>= \case liftIO (getInterface program iv) >>= \case
Left err -> giveup err Left err -> giveup err
Right interface -> undefined Right interface -> do
states <- sortBy (comparing computeTimeEstimate)
<$> getComputeStates rs k
case interfaceEnv states interface of
Left err -> giveup err
Right ienv -> undefined -- TODO
-- TODO Make sure that the remote state meets the program's current -- Make sure that the compute state has everything needed by
-- interface. -- the program's current interface.
checkKey :: ComputeProgram -> TMVar (Maybe Interface) -> Key -> Annex Bool checkKey :: RemoteStateHandle -> ComputeProgram -> TMVar (Maybe Interface) -> Key -> Annex Bool
checkKey program iv _ = checkKey rs program iv k = do
states <- getComputeStates rs k
liftIO (getInterface program iv) >>= \case liftIO (getInterface program iv) >>= \case
Left err -> giveup err Left err -> giveup err
Right interface -> undefined Right interface ->
case interfaceEnv states interface of
Right _ -> return True
Left _ -> return False
-- Removing remote state will prevent computing the key. -- Unsetting the compute state will prevent computing the key.
dropKey :: RemoteStateHandle -> Maybe SafeDropProof -> Key -> Annex () dropKey :: RemoteStateHandle -> Maybe SafeDropProof -> Key -> Annex ()
dropKey rs _ k = setRemoteState rs k mempty dropKey rs _ k = do
RemoteMetaData _ old <- getCurrentRemoteMetaData rs k
addRemoteMetaData k rs (modMeta old DelAllMeta)
storeKeyUnsupported :: Key -> AssociatedFile -> Maybe OsPath -> MeterUpdate -> Annex () storeKeyUnsupported :: Key -> AssociatedFile -> Maybe OsPath -> MeterUpdate -> Annex ()
storeKeyUnsupported _ _ _ _ = giveup "transfer to compute remote not supported; use git-annex addcomputed instead" storeKeyUnsupported _ _ _ _ = giveup "transfer to compute remote not supported; use git-annex addcomputed instead"

View file

@ -79,14 +79,19 @@ outputs.
The output is lines, in the form: The output is lines, in the form:
INPUT[?] Id Description INPUT[?] Name Description
VALUE[?] Id Description VALUE[?] Name Description
OUTPUT Id Description OUTPUT Id Description
Use "INPUT" when a file is an input to the computation, Use "INPUT" when a file is an input to the computation,
and "VALUE" for all other input values. Use "INPUT?" and "VALUE?" and "VALUE" for all other input values. Use "INPUT?" and "VALUE?"
for optional inputs and values. for optional inputs and values.
Note that the Name and Id both have to be legal git-annex metadata field
names. And should be lower cased. The user is allowed to use any case
for the names when providing inputs and values to `git-annex addcomputed`
though.
The interface can also optionally include a "REPRODUCIBLE" line. The interface can also optionally include a "REPRODUCIBLE" line.
That indicates that the results of its computations are That indicates that the results of its computations are
expected to be bit-for-bit reproducible. expected to be bit-for-bit reproducible.

View file

@ -0,0 +1,13 @@
[[!comment format=mdwn
username="joey"
subject="""comment 18"""
date="2025-02-19T18:29:58Z"
content="""
I've started a `compute` branch which so far has documentation for
the [compute special remote](http://source.git-annex.branchable.com/?p=source.git;a=blob;f=doc/special_remotes/compute.mdwn;hb=refs/heads/compute),
[git-annex addcomputed](http://source.git-annex.branchable.com/?p=source.git;a=blob;f=doc/git-annex-addcomputed.mdwn;hb=refs/heads/compute),
and
[git-annex recompute](http://source.git-annex.branchable.com/?p=source.git;a=blob;f=doc/git-annex-recompute.mdwn;hb=refs/heads/compute)
I am pretty happy with how this design is shaping up.
"""]]

View file

@ -0,0 +1,69 @@
[[!comment format=mdwn
username="joey"
subject="""open questions"""
date="2025-02-19T18:39:41Z"
content="""
One thing that I am unsure about is what should happen if `git-annex get foo`
needs the content of file `bar`, which is not present. Should it get `bar` from
a remote? Or should it fail to get `foo`?
Consider that, in the case of `git-annex get foo --from computeremote`, the
user has asked it to get a file from that particular remote, not from
whatever remote contains `bar`.
If the same compute remote can also compute `bar`, it seems quite reasonable
for `git-annex get foo --from computeremote` to also compute bar. (This is
similar to a single computation that generates two output files, in which
case getting one of them will get both of them.)
And it seems reasonable for `git-annex get foo` with no specified remote
to also get or compute bar, from whereever.
But, there is no way at the level of a special remote to tell the
difference between those two commands.
Maybe the right answer is to define getting a file from a compute
special remote as including getting its inputs from other remotes.
Preferring getting them from the same compute special remote when possible,
and when not, using the lowest cost remote that works, same as `git-annx
get` does.
Or this could be a configuration of the compute special remote. Maybe some
would want to always get source files, and others would want to never get
source files?
----
A related problem is that, `foo` might be fairly small, but `bar` very
large. So getting a small object can require getting or generating other
large objects. Getting `bar` might fail because there is not enough space
to meet annex.diskreserve. Or the user might just be surprised that so much
disk space was eaten up. But dropping `bar` after computing `foo` also
doesn't seem like a good idea; the user might want to hang onto their copy
now that they have it, or perhaps move it to some faster remote.
Maybe preferred content is the solution? After computing `foo` with `bar`,
keep the copy of `bar` if the local repository wants it, drop it otherwise.
----
Progress display is also going to be complicated for this. There is no
way in the special remote interface to display the progress for `bar`
while getting `foo`.
Probably the thing to do would be to add together the sizes of both files,
and display a combined progress meter.
It would be ok to not say when it's getting the input file.
This will need a way to set the size for a progress display to larger
than the size of the key.
----
.... All 3 problems above go away if it doesn't automatically get input files
before computations and the computations instead just fail with an error
saying the input file is not present.
But then consider the case where you just want every file in the repository.
`git-annex get .` failing to compute some files because their input files
happen to come after them in the directory listing is not good.
"""]]