wip
This commit is contained in:
commit
e897229088
4 changed files with 300 additions and 42 deletions
|
@ -10,6 +10,7 @@ module Remote.Compute (remote) where
|
|||
import Annex.Common
|
||||
import Types.Remote
|
||||
import Types.ProposedAccepted
|
||||
import Types.MetaData
|
||||
import Types.Creds
|
||||
import Config
|
||||
import Config.Cost
|
||||
|
@ -17,24 +18,30 @@ import Remote.Helper.Special
|
|||
import Remote.Helper.ExportImport
|
||||
import Annex.SpecialRemote.Config
|
||||
import Annex.UUID
|
||||
import Logs.RemoteState
|
||||
import Logs.MetaData
|
||||
import Utility.Metered
|
||||
import Utility.Hash
|
||||
import Utility.TimeStamp
|
||||
import Git.FilePath
|
||||
import qualified Git
|
||||
import qualified Utility.SimpleProtocol as Proto
|
||||
|
||||
import Control.Concurrent.STM
|
||||
import Data.Time.Clock
|
||||
import Data.Either
|
||||
import Data.Char
|
||||
import Data.Ord
|
||||
import qualified Data.Map as M
|
||||
import qualified Data.Set as S
|
||||
import qualified Data.Text as T
|
||||
import qualified Data.Text.Encoding as T
|
||||
|
||||
remote :: RemoteType
|
||||
remote = RemoteType
|
||||
{ typename = "compute"
|
||||
, enumerate = const $ findSpecialRemotes "compute"
|
||||
, generate = gen
|
||||
, configParser = mkRemoteConfigParser
|
||||
[ optionalStringParser programField
|
||||
(FieldDesc $ "compute program (must start with \"" ++ safetyPrefix ++ "\")")
|
||||
]
|
||||
, configParser = computeConfigParser
|
||||
, setup = setupInstance
|
||||
, exportSupported = exportUnsupported
|
||||
, importSupported = importUnsupported
|
||||
|
@ -55,13 +62,13 @@ gen r u rc gc rs = case getComputeProgram rc of
|
|||
, cost = cst
|
||||
, name = Git.repoDescribe r
|
||||
, storeKey = storeKeyUnsupported
|
||||
, retrieveKeyFile = computeKey program interface
|
||||
, retrieveKeyFile = computeKey rs program interface
|
||||
, retrieveKeyFileInOrder = pure True
|
||||
, retrieveKeyFileCheap = Nothing
|
||||
, retrievalSecurityPolicy = RetrievalAllKeysSecure
|
||||
, removeKey = dropKey rs
|
||||
, lockContent = Nothing
|
||||
, checkPresent = checkKey program interface
|
||||
, checkPresent = checkKey rs program interface
|
||||
, checkPresentCheap = False
|
||||
, exportActions = exportUnsupported
|
||||
, importActions = importUnsupported
|
||||
|
@ -93,6 +100,33 @@ setupInstance _ mu _ c _ = do
|
|||
gitConfigSpecialRemote u c [("compute", "true")]
|
||||
return (c, u)
|
||||
|
||||
-- The RemoteConfig is allowed to contain fields from the program's
|
||||
-- interface. That provides defaults for git-annex addcomputed.
|
||||
computeConfigParser :: RemoteConfig -> Annex RemoteConfigParser
|
||||
computeConfigParser rc = do
|
||||
Interface interface <- case getComputeProgram rc of
|
||||
Left _ -> pure $ Interface []
|
||||
Right program -> liftIO (getInterfaceUncached program) >>= return . \case
|
||||
Left _ -> Interface []
|
||||
Right interface -> interface
|
||||
let m = M.fromList $ mapMaybe collectfields interface
|
||||
let ininterface f = case toField (fromProposedAccepted f) of
|
||||
Just f' -> M.member f' m
|
||||
Nothing -> False
|
||||
return $ RemoteConfigParser
|
||||
{ remoteConfigFieldParsers =
|
||||
[ optionalStringParser programField
|
||||
(FieldDesc $ "compute program (must start with \"" ++ safetyPrefix ++ "\")")
|
||||
]
|
||||
, remoteConfigRestPassthrough = Just (ininterface, M.toList $ M.mapKeys fromField m)
|
||||
}
|
||||
where
|
||||
collectfields (InterfaceInput f d) = Just (f, FieldDesc d)
|
||||
collectfields (InterfaceOptionalInput f d) = Just (f, FieldDesc d)
|
||||
collectfields (InterfaceValue f d) = Just (f, FieldDesc d)
|
||||
collectfields (InterfaceOptionalValue f d) = Just (f, FieldDesc d)
|
||||
collectfields _ = Nothing
|
||||
|
||||
newtype ComputeProgram = ComputeProgram String
|
||||
deriving (Show)
|
||||
|
||||
|
@ -114,16 +148,17 @@ safetyPrefix = "git-annex-compute-"
|
|||
programField :: RemoteConfigField
|
||||
programField = Accepted "program"
|
||||
|
||||
type Name = String
|
||||
type Description = String
|
||||
type Id = String
|
||||
|
||||
newtype Field = Field MetaField
|
||||
deriving (Show, Eq, Ord)
|
||||
|
||||
data InterfaceItem
|
||||
= InterfaceInput Id Description
|
||||
| InterfaceOptionalInput Id Description
|
||||
| InterfaceValue Name Description
|
||||
| InterfaceOptionalValue Name Description
|
||||
| InterfaceOutput Id Description
|
||||
= InterfaceInput Field Description
|
||||
| InterfaceOptionalInput Field Description
|
||||
| InterfaceValue Field Description
|
||||
| InterfaceOptionalValue Field Description
|
||||
| InterfaceOutput Field Description
|
||||
| InterfaceReproducible
|
||||
deriving (Show, Eq)
|
||||
|
||||
|
@ -141,10 +176,22 @@ instance Proto.Receivable InterfaceItem where
|
|||
parseCommand "OUTPUT" = Proto.parse2 InterfaceOutput
|
||||
parseCommand "REPRODUCIBLE" = Proto.parse0 InterfaceReproducible
|
||||
|
||||
instance Proto.Serializable Field where
|
||||
serialize = fromField
|
||||
deserialize = toField
|
||||
|
||||
-- While MetaField is case insensitive, environment variable names are not,
|
||||
-- so make Field always lower cased.
|
||||
toField :: String -> Maybe Field
|
||||
toField f = Field <$> toMetaField (T.pack (map toLower f))
|
||||
|
||||
fromField :: Field -> String
|
||||
fromField (Field f) = T.unpack (fromMetaField f)
|
||||
|
||||
getInterface :: ComputeProgram -> TMVar (Maybe Interface) -> IO (Either String Interface)
|
||||
getInterface program iv =
|
||||
atomically (takeTMVar iv) >>= \case
|
||||
Nothing -> getInterface' program >>= \case
|
||||
Nothing -> getInterfaceUncached program >>= \case
|
||||
Left err -> do
|
||||
atomically $ putTMVar iv Nothing
|
||||
return (Left err)
|
||||
|
@ -155,8 +202,8 @@ getInterface program iv =
|
|||
atomically $ putTMVar iv (Just interface)
|
||||
return (Right interface)
|
||||
|
||||
getInterface' :: ComputeProgram -> IO (Either String Interface)
|
||||
getInterface' (ComputeProgram program) =
|
||||
getInterfaceUncached :: ComputeProgram -> IO (Either String Interface)
|
||||
getInterfaceUncached (ComputeProgram program) =
|
||||
catchMaybeIO (readProcess program ["interface"]) >>= \case
|
||||
Nothing -> return $ Left $ "Failed to run " ++ program
|
||||
Just output -> return $ case parseInterface output of
|
||||
|
@ -179,44 +226,168 @@ data ComputeInput = ComputeInput Key FilePath
|
|||
deriving (Show, Eq)
|
||||
|
||||
data ComputeValue = ComputeValue String
|
||||
deriving (Show, Eq)
|
||||
|
||||
data ComputeOutput = ComputeOutput Key
|
||||
deriving (Show, Eq)
|
||||
|
||||
data ComputeState = ComputeState
|
||||
{ computeInputs :: M.Map Id ComputInput
|
||||
, computeValues :: M.Map Id ComputeValue
|
||||
{ computeInputs :: M.Map Field ComputeInput
|
||||
, computeValues :: M.Map Field ComputeValue
|
||||
, computeOutputs :: M.Map Field ComputeOutput
|
||||
, computeTimeEstimate :: NominalDiffTime
|
||||
}
|
||||
deriving (Show, Eq)
|
||||
|
||||
-- The state is URI encoded.
|
||||
-- Generates a hash of a ComputeState.
|
||||
--
|
||||
-- A ComputeValue with Id "foo" is represented as "vfoo=value"
|
||||
-- A ComputeInput with Id "foo" is represented as "kfoo=key&pfoo=path"
|
||||
formatComputeState :: ComputeState -> String
|
||||
formatComputeState st =
|
||||
map formatinput (computeInputes st)
|
||||
++ concatMap formatvalue (computeValues st)
|
||||
-- This is used as a short unique identifier in the metadata fields,
|
||||
-- since more than one ComputeState may be stored in the compute remote's
|
||||
-- metadata for a given Key.
|
||||
--
|
||||
-- A md5 is fine for this. It does not need to protect against intentional
|
||||
-- collisions. And 2^64 is a sufficiently small chance of accidental
|
||||
-- collision.
|
||||
hashComputeState :: ComputeState -> String
|
||||
hashComputeState state = show $ md5s $
|
||||
mconcat (map (go goi) (M.toAscList (computeInputs state)))
|
||||
<>
|
||||
mconcat (map (go gov) (M.toAscList (computeValues state)))
|
||||
<>
|
||||
mconcat (map (go goo) (M.toAscList (computeOutputs state)))
|
||||
<>
|
||||
encodeBS (show (computeTimeEstimate state))
|
||||
where
|
||||
go c (Field f, v) = T.encodeUtf8 (fromMetaField f) <> c v
|
||||
goi (ComputeInput k f) = serializeKey' k <> encodeBS f
|
||||
gov (ComputeValue s) = encodeBS s
|
||||
goo (ComputeOutput k) = serializeKey' k
|
||||
|
||||
parseComputeState :: String -> ComputeState
|
||||
parseComputeState =
|
||||
computeStateMetaData :: ComputeState -> MetaData
|
||||
computeStateMetaData = undefined
|
||||
|
||||
-- TODO
|
||||
computeKey :: ComputeProgram -> TMVar (Maybe Interface) -> Key -> AssociatedFile -> OsPath -> MeterUpdate -> VerifyConfig -> Annex Verification
|
||||
computeKey program iv key _af dest p vc =
|
||||
-- FIXME: Need to unswizzle the mixed up metadata based on hash prefixes.
|
||||
metaDataComputeStates :: MetaData -> [ComputeState]
|
||||
metaDataComputeStates (MetaData m) =
|
||||
go (ComputeState mempty mempty mempty 0) (M.toList m)
|
||||
where
|
||||
go c ((f,v):rest) =
|
||||
let c' = case T.unpack (fromMetaField f) of
|
||||
('i':'n':'p':'u':'t':'-':f') -> case M.lookup m =<< toMetaField (T.pack ("key-" ++ f')) of
|
||||
Nothing -> c
|
||||
Just kv -> case deserializeKey' (fromMetaValue kv) of
|
||||
Just k -> c
|
||||
{ computeInputs =
|
||||
M.insert (toField f)
|
||||
(ComputeInput k (decodeBS (fromMetaValue v)))
|
||||
(computeOutputs c)
|
||||
}
|
||||
Nothing -> c
|
||||
('v':'a':'l':'u':'e':'-':f') -> c
|
||||
{ computeValues =
|
||||
M.insert (toField f)
|
||||
(ComputeValue (decodeBS (fromMetaValue v)))
|
||||
(computeValues c)
|
||||
}
|
||||
('o':'u':'t':'p':'u':'t':'-':f') ->
|
||||
case deserializeKey' (fromMetaValue v) of
|
||||
Just k -> c
|
||||
{ computeOutputs =
|
||||
M.insert (toField f)
|
||||
(ComputeOutput k)
|
||||
(computeOutputs c)
|
||||
}
|
||||
Nothing -> c
|
||||
('t':'i':'m':'e':'-':f') ->
|
||||
case parsePOSIXTime (fromMetaValue v) of
|
||||
Just t -> c { computeTimeEstimate = t }
|
||||
Nothing -> c
|
||||
_ -> c
|
||||
in go c' rest
|
||||
|
||||
getComputeStates :: RemoteStateHandle -> Key -> Annex [ComputeState]
|
||||
getComputeStates rs k = do
|
||||
RemoteMetaData _ m <- getCurrentRemoteMetaData rs k
|
||||
return (metaDataComputeStates m)
|
||||
|
||||
setComputeState :: RemoteStateHandle -> Key -> ComputeState -> Annex ()
|
||||
setComputeState rs k st = addRemoteMetaData k rs (computeStateMetaData st)
|
||||
|
||||
{- Finds the first compute state that provides everything required by the
|
||||
- interface, and returns a list of what should be provided to the program
|
||||
- in its environment.
|
||||
-}
|
||||
interfaceEnv :: [ComputeState] -> Interface -> Either String [(String, Either Key String)]
|
||||
interfaceEnv states interface = go Nothing states
|
||||
where
|
||||
go (Just firsterr) [] = Left firsterr
|
||||
go Nothing [] = interfaceEnv' (ComputeState mempty mempty mempty 0) interface
|
||||
go firsterr (state:rest) = case interfaceEnv' state interface of
|
||||
Right v -> Right v
|
||||
Left e
|
||||
| null rest -> Left (fromMaybe e firsterr)
|
||||
| otherwise -> go (firsterr <|> Just e) rest
|
||||
|
||||
interfaceEnv' :: ComputeState -> Interface -> Either String [(String, Either Key String)]
|
||||
interfaceEnv' state (Interface interface) =
|
||||
case partitionEithers (mapMaybe go interface) of
|
||||
([], env) -> Right $
|
||||
map (\(f, v) -> (fromField f, v)) env
|
||||
(problems, _) -> Left $ unlines problems
|
||||
where
|
||||
go (InterfaceInput name desc) =
|
||||
case M.lookup name (computeInputs state) of
|
||||
Just (ComputeInput key _file) -> Just $
|
||||
Right (name, Left key)
|
||||
Nothing -> Just $
|
||||
Left $ "Missing required input \"" ++ fromField name ++ "\" -- " ++ desc
|
||||
go (InterfaceOptionalInput name desc) =
|
||||
case M.lookup name (computeInputs state) of
|
||||
Just (ComputeInput key _file) -> Just $
|
||||
Right (name, Left key)
|
||||
Nothing -> Nothing
|
||||
go (InterfaceValue name desc) =
|
||||
case M.lookup name (computeValues state) of
|
||||
Just (ComputeValue v) -> Just $
|
||||
Right (name, Right v)
|
||||
nothing -> Just $
|
||||
Left $ "Missing required value \"" ++ fromField name ++ "\" -- " ++ desc
|
||||
go (InterfaceOptionalValue name desc) =
|
||||
case M.lookup name (computeValues state) of
|
||||
Just (ComputeValue v) -> Just $
|
||||
Right (name, Right v)
|
||||
Nothing -> Nothing
|
||||
go (InterfaceOutput _ _) = Nothing
|
||||
go InterfaceReproducible = Nothing
|
||||
|
||||
computeKey :: RemoteStateHandle -> ComputeProgram -> TMVar (Maybe Interface) -> Key -> AssociatedFile -> OsPath -> MeterUpdate -> VerifyConfig -> Annex Verification
|
||||
computeKey rs program iv k _af dest p vc =
|
||||
liftIO (getInterface program iv) >>= \case
|
||||
Left err -> giveup err
|
||||
Right interface -> undefined
|
||||
Right interface -> do
|
||||
states <- sortBy (comparing computeTimeEstimate)
|
||||
<$> getComputeStates rs k
|
||||
case interfaceEnv states interface of
|
||||
Left err -> giveup err
|
||||
Right ienv -> undefined -- TODO
|
||||
|
||||
-- TODO Make sure that the remote state meets the program's current
|
||||
-- interface.
|
||||
checkKey :: ComputeProgram -> TMVar (Maybe Interface) -> Key -> Annex Bool
|
||||
checkKey program iv _ =
|
||||
-- Make sure that the compute state has everything needed by
|
||||
-- the program's current interface.
|
||||
checkKey :: RemoteStateHandle -> ComputeProgram -> TMVar (Maybe Interface) -> Key -> Annex Bool
|
||||
checkKey rs program iv k = do
|
||||
states <- getComputeStates rs k
|
||||
liftIO (getInterface program iv) >>= \case
|
||||
Left err -> giveup err
|
||||
Right interface -> undefined
|
||||
Right interface ->
|
||||
case interfaceEnv states interface of
|
||||
Right _ -> return True
|
||||
Left _ -> return False
|
||||
|
||||
-- Removing remote state will prevent computing the key.
|
||||
-- Unsetting the compute state will prevent computing the key.
|
||||
dropKey :: RemoteStateHandle -> Maybe SafeDropProof -> Key -> Annex ()
|
||||
dropKey rs _ k = setRemoteState rs k mempty
|
||||
dropKey rs _ k = do
|
||||
RemoteMetaData _ old <- getCurrentRemoteMetaData rs k
|
||||
addRemoteMetaData k rs (modMeta old DelAllMeta)
|
||||
|
||||
storeKeyUnsupported :: Key -> AssociatedFile -> Maybe OsPath -> MeterUpdate -> Annex ()
|
||||
storeKeyUnsupported _ _ _ _ = giveup "transfer to compute remote not supported; use git-annex addcomputed instead"
|
||||
|
||||
|
|
|
@ -79,14 +79,19 @@ outputs.
|
|||
|
||||
The output is lines, in the form:
|
||||
|
||||
INPUT[?] Id Description
|
||||
VALUE[?] Id Description
|
||||
INPUT[?] Name Description
|
||||
VALUE[?] Name Description
|
||||
OUTPUT Id Description
|
||||
|
||||
Use "INPUT" when a file is an input to the computation,
|
||||
and "VALUE" for all other input values. Use "INPUT?" and "VALUE?"
|
||||
for optional inputs and values.
|
||||
|
||||
Note that the Name and Id both have to be legal git-annex metadata field
|
||||
names. And should be lower cased. The user is allowed to use any case
|
||||
for the names when providing inputs and values to `git-annex addcomputed`
|
||||
though.
|
||||
|
||||
The interface can also optionally include a "REPRODUCIBLE" line.
|
||||
That indicates that the results of its computations are
|
||||
expected to be bit-for-bit reproducible.
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
[[!comment format=mdwn
|
||||
username="joey"
|
||||
subject="""comment 18"""
|
||||
date="2025-02-19T18:29:58Z"
|
||||
content="""
|
||||
I've started a `compute` branch which so far has documentation for
|
||||
the [compute special remote](http://source.git-annex.branchable.com/?p=source.git;a=blob;f=doc/special_remotes/compute.mdwn;hb=refs/heads/compute),
|
||||
[git-annex addcomputed](http://source.git-annex.branchable.com/?p=source.git;a=blob;f=doc/git-annex-addcomputed.mdwn;hb=refs/heads/compute),
|
||||
and
|
||||
[git-annex recompute](http://source.git-annex.branchable.com/?p=source.git;a=blob;f=doc/git-annex-recompute.mdwn;hb=refs/heads/compute)
|
||||
|
||||
I am pretty happy with how this design is shaping up.
|
||||
"""]]
|
|
@ -0,0 +1,69 @@
|
|||
[[!comment format=mdwn
|
||||
username="joey"
|
||||
subject="""open questions"""
|
||||
date="2025-02-19T18:39:41Z"
|
||||
content="""
|
||||
One thing that I am unsure about is what should happen if `git-annex get foo`
|
||||
needs the content of file `bar`, which is not present. Should it get `bar` from
|
||||
a remote? Or should it fail to get `foo`?
|
||||
|
||||
Consider that, in the case of `git-annex get foo --from computeremote`, the
|
||||
user has asked it to get a file from that particular remote, not from
|
||||
whatever remote contains `bar`.
|
||||
|
||||
If the same compute remote can also compute `bar`, it seems quite reasonable
|
||||
for `git-annex get foo --from computeremote` to also compute bar. (This is
|
||||
similar to a single computation that generates two output files, in which
|
||||
case getting one of them will get both of them.)
|
||||
|
||||
And it seems reasonable for `git-annex get foo` with no specified remote
|
||||
to also get or compute bar, from whereever.
|
||||
|
||||
But, there is no way at the level of a special remote to tell the
|
||||
difference between those two commands.
|
||||
|
||||
Maybe the right answer is to define getting a file from a compute
|
||||
special remote as including getting its inputs from other remotes.
|
||||
Preferring getting them from the same compute special remote when possible,
|
||||
and when not, using the lowest cost remote that works, same as `git-annx
|
||||
get` does.
|
||||
|
||||
Or this could be a configuration of the compute special remote. Maybe some
|
||||
would want to always get source files, and others would want to never get
|
||||
source files?
|
||||
|
||||
----
|
||||
|
||||
A related problem is that, `foo` might be fairly small, but `bar` very
|
||||
large. So getting a small object can require getting or generating other
|
||||
large objects. Getting `bar` might fail because there is not enough space
|
||||
to meet annex.diskreserve. Or the user might just be surprised that so much
|
||||
disk space was eaten up. But dropping `bar` after computing `foo` also
|
||||
doesn't seem like a good idea; the user might want to hang onto their copy
|
||||
now that they have it, or perhaps move it to some faster remote.
|
||||
|
||||
Maybe preferred content is the solution? After computing `foo` with `bar`,
|
||||
keep the copy of `bar` if the local repository wants it, drop it otherwise.
|
||||
|
||||
----
|
||||
|
||||
Progress display is also going to be complicated for this. There is no
|
||||
way in the special remote interface to display the progress for `bar`
|
||||
while getting `foo`.
|
||||
|
||||
Probably the thing to do would be to add together the sizes of both files,
|
||||
and display a combined progress meter.
|
||||
It would be ok to not say when it's getting the input file.
|
||||
This will need a way to set the size for a progress display to larger
|
||||
than the size of the key.
|
||||
|
||||
----
|
||||
|
||||
.... All 3 problems above go away if it doesn't automatically get input files
|
||||
before computations and the computations instead just fail with an error
|
||||
saying the input file is not present.
|
||||
|
||||
But then consider the case where you just want every file in the repository.
|
||||
`git-annex get .` failing to compute some files because their input files
|
||||
happen to come after them in the directory listing is not good.
|
||||
"""]]
|
Loading…
Add table
Add a link
Reference in a new issue