From 19dcff2b71252a92aad6be186a33956309ff7d87 Mon Sep 17 00:00:00 2001
From: Joey Hess
Date: Thu, 30 Aug 2018 14:47:52 -0400
Subject: [PATCH] use S3 version ID for retrieval

Have to store the S3 object along with the version ID, so retrieval
can use the same object.

This commit was supported by the NSF-funded DataLad project.
---
 Remote/S3.hs                               | 85 ++++++++++++++--------
 doc/todo/versioning_in_export_remotes.mdwn |  6 +-
 2 files changed, 60 insertions(+), 31 deletions(-)

diff --git a/Remote/S3.hs b/Remote/S3.hs
index a506520878..b8c766420b 100644
--- a/Remote/S3.hs
+++ b/Remote/S3.hs
@@ -206,7 +206,7 @@ storeHelper info h f object p = case partSize info of
 	singlepartupload = do
 		rbody <- liftIO $ httpBodyStorer f p
 		r <- sendS3Handle h $ putObject info object rbody
-		return (mkS3VersionID (S3.porVersionId r))
+		return (mkS3VersionID object (S3.porVersionId r))
 	multipartupload fsz partsz = do
 #if MIN_VERSION_aws(0,10,6)
 		let startreq = (S3.postInitiateMultipartUpload (bucket info) object)
@@ -245,7 +245,7 @@ storeHelper info h f object p = case partSize info of
 
 		r <- sendS3Handle h $ S3.postCompleteMultipartUpload
 			(bucket info) object uploadid (zip [1..] etags)
-		return (mkS3VersionID (S3.cmurVersionId r))
+		return (mkS3VersionID object (S3.cmurVersionId r))
 #else
 		warning $ "Cannot do multipart upload (partsize " ++ show partsz ++ ") of large file (" ++ show fsz ++ "); built with too old a version of the aws library."
 		singlepartupload
@@ -255,8 +255,9 @@ storeHelper info h f object p = case partSize info of
  - out to the file. Would be better to implement a byteRetriever, but
  - that is difficult. -}
 retrieve :: Remote -> S3Info -> Maybe S3Handle -> Retriever
-retrieve _ info (Just h) = fileRetriever $ \f k p ->
-	retrieveHelper info h (T.pack $ bucketObject info k) f p
+retrieve r info (Just h) = fileRetriever $ \f k p -> do
+	loc <- getS3VersionID info (uuid r) k (T.pack $ bucketObject info k)
+	retrieveHelper info h loc f p
 retrieve r info Nothing = case getpublicurl info of
 	Nothing -> \_ _ _ -> do
 		needS3Creds (uuid r)
@@ -265,9 +266,12 @@ retrieve r info Nothing = case getpublicurl info of
 		unlessM (downloadUrl k p [geturl $ bucketObject info k] f) $
 			giveup "failed to download content"
 
-retrieveHelper :: S3Info -> S3Handle -> S3.Object -> FilePath -> MeterUpdate -> Annex ()
-retrieveHelper info h object f p = liftIO $ runResourceT $ do
-	let req = S3.getObject (bucket info) object
+retrieveHelper :: S3Info -> S3Handle -> (Either S3.Object S3VersionID) -> FilePath -> MeterUpdate -> Annex ()
+retrieveHelper info h loc f p = liftIO $ runResourceT $ do
+	let req = case loc of
+		Left o -> S3.getObject (bucket info) o
+		Right (S3VersionID o vid) -> (S3.getObject (bucket info) o)
+			{ S3.goVersionId = Just (T.pack vid) }
 	S3.GetObjectResponse { S3.gorResponse = rsp } <- sendS3Handle' h req
 	Url.sinkResponseFile p zeroBytesProcessed f WriteMode rsp
 
@@ -294,10 +298,11 @@ checkKey r info Nothing k = case getpublicurl info of
 		checkBoth (geturl $ bucketObject info k) (keySize k)
 checkKey r info (Just h) k = do
 	showChecking r
-	checkKeyHelper info h (T.pack $ bucketObject info k)
+	loc <- getS3VersionID info (uuid r) k (T.pack $ bucketObject info k)
+	checkKeyHelper info h loc
 
-checkKeyHelper :: S3Info -> S3Handle -> S3.Object -> Annex Bool
-checkKeyHelper info h object = do
+checkKeyHelper :: S3Info -> S3Handle -> (Either S3.Object S3VersionID) -> Annex Bool
+checkKeyHelper info h loc = do
 #if MIN_VERSION_aws(0,10,0)
 	rsp <- go
 	return (isJust $ S3.horMetadata rsp)
@@ -307,7 +312,11 @@ checkKeyHelper info h object = do
 		return True
 #endif
   where
-	go = sendS3Handle h $ S3.headObject (bucket info) object
+	go = sendS3Handle h req
+	req = case loc of
+		Left o -> S3.headObject (bucket info) o
+		Right (S3VersionID o vid) -> (S3.headObject (bucket info) o)
+			{ S3.hoVersionId = Just (T.pack vid) }
 
 #if ! MIN_VERSION_aws(0,10,0)
 {- Catch exception headObject returns when an object is not present
@@ -327,10 +336,9 @@ storeExportS3 u info (Just h) f k loc p = catchNonAsync go
 	(\e -> warning (show e) >> return False)
   where
 	go = do
-		storeHelper info h f (T.pack $ bucketExportLocation info loc) p
-			>>= if versioning info
-				then setS3VersionID u k
-				else const noop
+		let o = T.pack $ bucketExportLocation info loc
+		storeHelper info h f o p
+			>>= setS3VersionID info u k
 		return True
 storeExportS3 u _ Nothing _ _ _ _ = do
 	needS3Creds u
@@ -342,7 +350,7 @@ retrieveExportS3 u info mh _k loc f p =
   where
 	go = case mh of
 		Just h -> do
-			retrieveHelper info h (T.pack exporturl) f p
+			retrieveHelper info h (Left (T.pack exporturl)) f p
 			return True
 		Nothing -> case getpublicurl info of
 			Nothing -> do
@@ -366,7 +374,7 @@ removeExportS3 u _ Nothing _ _ = do
 
 checkPresentExportS3 :: UUID -> S3Info -> Maybe S3Handle -> Key -> ExportLocation -> Annex Bool
 checkPresentExportS3 _u info (Just h) _k loc =
-	checkKeyHelper info h (T.pack $ bucketExportLocation info loc)
+	checkKeyHelper info h (Left (T.pack $ bucketExportLocation info loc))
 checkPresentExportS3 u info Nothing k loc = case getpublicurl info of
 	Nothing -> do
 		needS3Creds u
@@ -732,26 +740,45 @@ getWebUrls info c k
 		(True, Just geturl) -> return [geturl $ bucketObject info k]
 		_ -> return []
 
-newtype S3VersionID = S3VersionID String
+data S3VersionID = S3VersionID S3.Object String
 	deriving (Show)
 
 -- smart constructor
-mkS3VersionID :: Maybe T.Text -> Maybe S3VersionID
-mkS3VersionID = mkS3VersionID' . fmap T.unpack
+mkS3VersionID :: S3.Object -> Maybe T.Text -> Maybe S3VersionID
+mkS3VersionID o = mkS3VersionID' o . fmap T.unpack
 
-mkS3VersionID' :: Maybe String -> Maybe S3VersionID
-mkS3VersionID' Nothing = Nothing
-mkS3VersionID' (Just s)
+mkS3VersionID' :: S3.Object -> Maybe String -> Maybe S3VersionID
+mkS3VersionID' o (Just s)
 	| null s = Nothing
 	-- AWS documentation says a version ID is at most 1024 bytes long.
 	-- Since they are stored in the git-annex branch, prevent them from
 	-- being very much larger than that.
-	| length s < 2048 = Just (S3VersionID s)
+	| length s < 2048 = Just (S3VersionID o s)
 	| otherwise = Nothing
+mkS3VersionID' _ Nothing = Nothing
 
-setS3VersionID :: UUID -> Key -> Maybe S3VersionID -> Annex ()
-setS3VersionID u k (Just (S3VersionID v)) = setRemoteState u k v
-setS3VersionID _ _ Nothing = noop
+-- A S3 version ID is "url ready" so does not contain spaces,
+-- but an Object may contain spaces, so put it last.
+formatS3VersionID :: S3VersionID -> String
+formatS3VersionID (S3VersionID o v) = v ++ ' ' : T.unpack o
 
-getS3VersionID :: UUID -> Key -> Annex (Maybe S3VersionID)
-getS3VersionID u k = mkS3VersionID' <$> getRemoteState u k
+parseS3VersionID :: String -> Maybe S3VersionID
+parseS3VersionID s =
+	let (v, o) = separate (== ' ') s
+	in mkS3VersionID' (T.pack o) (Just v)
+
+setS3VersionID :: S3Info -> UUID -> Key -> Maybe S3VersionID -> Annex ()
+setS3VersionID info u k vid
+	| versioning info = maybe noop (setS3VersionID' u k) vid
+	| otherwise = noop
+
+setS3VersionID' :: UUID -> Key -> S3VersionID -> Annex ()
+setS3VersionID' u k vid = setRemoteState u k (formatS3VersionID vid)
+
+getS3VersionID :: S3Info -> UUID -> Key -> S3.Object -> Annex (Either S3.Object S3VersionID)
+getS3VersionID info u k fallback
+	| versioning info = maybe (Left fallback) Right <$> getS3VersionID' u k
+	| otherwise = return (Left fallback)
+
+getS3VersionID' :: UUID -> Key -> Annex (Maybe S3VersionID)
+getS3VersionID' u k = maybe Nothing parseS3VersionID <$> getRemoteState u k
diff --git a/doc/todo/versioning_in_export_remotes.mdwn b/doc/todo/versioning_in_export_remotes.mdwn
index 0d157a3258..63c217e507 100644
--- a/doc/todo/versioning_in_export_remotes.mdwn
+++ b/doc/todo/versioning_in_export_remotes.mdwn
@@ -63,9 +63,11 @@ done
 Make S3 store version IDs for exported files in the per-remote log when
 so configured. done
 
-Use version IDs when retrieving keys and for checkpresent.
+Use version IDs when retrieving keys and for checkpresent. done
 
-Can public urls be generated using version IDs?
+Can public urls be generated using version IDs? The url has
+"?versionId=" appended, but all the examples I've seen include S3
+authorization headers.
 
 When a file was deleted from an exported tree, and then put back
 in a later exported tree, it might get re-uploaded even though the content
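
The per-remote state written by setS3VersionID' above is a single string: the version ID, a space, then the object key, because the version ID is "url ready" and never contains a space while the object key can. A minimal standalone sketch of that round-trip follows; it is not part of the patch, and it stands Data.Text in for the aws library's S3.Object alias, a plain pair in for the S3VersionID constructor, break in for git-annex's separate helper, and uses a made-up object key and version ID.

{-# LANGUAGE OverloadedStrings #-}
import qualified Data.Text as T

-- Stored form: version ID first, a space, then the object key. The
-- version ID never contains a space, so the first space is an
-- unambiguous separator even when the object key has spaces in it.
formatVersionID :: T.Text -> String -> String
formatVersionID object vid = vid ++ ' ' : T.unpack object

-- Split at the first space only; reject an empty version ID, as the
-- patch's mkS3VersionID' does.
parseVersionID :: String -> Maybe (T.Text, String)
parseVersionID s = case break (== ' ') s of
    (vid, ' ' : object) | not (null vid) -> Just (T.pack object, vid)
    _ -> Nothing

main :: IO ()
main = do
    -- Hypothetical object key and version ID, for illustration only.
    let stored = formatVersionID "foo/my file.dat" "3z9XjLrfAVsr4JxMpAK0iu0Mokbpv3Lk"
    putStrLn stored
    print (parseVersionID stored)

Retrieval and checkpresent then prefer the versioned (Right) location and fall back to the bare object key (Left) when versioning is not enabled or no version ID was recorded, which is what getS3VersionID's Either result expresses in the patch.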