S3 export (untested)
It opens an HTTP connection per file exported, but then so does git annex copy --to s3.

Decided not to munge exported filenames for IA; there is too large a chance of the munging having confusing results. Instead, export of files not supported by IA, e.g. with spaces in their name, will fail.

This commit was supported by the NSF-funded DataLad project.
parent a1b195d84c
commit 44cd5ae313

5 changed files with 121 additions and 64 deletions
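As an aside, here is a minimal sketch (plain Haskell with hypothetical names, not code from this commit) of the filename decision described in the message: exported paths are used verbatim as object names, and a name the Internet Archive cannot accept is rejected rather than munged.

    -- Hypothetical stand-ins for the export naming and IA name rules.
    import Data.Char (isSpace)

    -- An export location is just the configured file prefix plus the path, unmunged.
    exportObjectName :: String -> FilePath -> String
    exportObjectName prefix loc = prefix ++ loc

    -- Rough stand-in for "IA accepts this name"; the real IA rules are stricter.
    iaAcceptable :: String -> Bool
    iaAcceptable = not . any isSpace

    main :: IO ()
    main = do
        let name = exportObjectName "" "photos/my photo.jpg"
        putStrLn name
        putStrLn $ if iaAcceptable name
            then "would export to IA"
            else "export to IA would fail; rename the file and re-export"

Ordinary S3 buckets do accept such names, so the restriction only bites when the special remote points at the Internet Archive.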
@@ -4,7 +4,7 @@ git-annex (6.20170819) UNRELEASED; urgency=medium
     exports of trees to special remotes.
   * Use git-annex initremote with exporttree=yes to set up a special remote
     for use by git-annex export.
-  * Implemented export to directory special remotes.
+  * Implemented export to directory and S3 special remotes.
   * External special remote protocol extended to support export.
   * Support building with feed-1.0, while still supporting older versions.
   * init: Display an additional message when it detects a filesystem that

Remote/S3.hs: 134 lines changed

@@ -59,7 +59,7 @@ remote = RemoteType
     , enumerate = const (findSpecialRemotes "s3")
     , generate = gen
     , setup = s3Setup
-    , exportSupported = exportUnsupported
+    , exportSupported = exportIsSupported
     }
 
 gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> Annex (Maybe Remote)
@@ -86,7 +86,13 @@ gen r u c gc = do
         , lockContent = Nothing
         , checkPresent = checkPresentDummy
         , checkPresentCheap = False
-        , exportActions = exportUnsupported
+        , exportActions = ExportActions
+            { storeExport = storeExportS3 this info
+            , retrieveExport = retrieveExportS3 this info
+            , removeExport = removeExportS3 this info
+            , checkPresentExport = checkPresentExportS3 this info
+            , renameExport = renameExportS3 this info
+            }
         , whereisKey = Just (getWebUrls info)
         , remoteFsck = Nothing
         , repairRepo = Nothing
@@ -107,6 +113,7 @@ s3Setup :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteG
 s3Setup ss mu mcreds c gc = do
     u <- maybe (liftIO genUUID) return mu
     s3Setup' ss u mcreds c gc
 
 s3Setup' :: SetupStage -> UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> Annex (RemoteConfig, UUID)
 s3Setup' ss u mcreds c gc
     | configIA c = archiveorg
@@ -170,25 +177,26 @@ prepareS3HandleMaybe r = resourcePrepare $ const $
 
 store :: Remote -> S3Info -> S3Handle -> Storer
 store _r info h = fileStorer $ \k f p -> do
-    case partSize info of
-        Just partsz | partsz > 0 -> do
-            fsz <- liftIO $ getFileSize f
-            if fsz > partsz
-                then multipartupload fsz partsz k f p
-                else singlepartupload k f p
-        _ -> singlepartupload k f p
+    storeHelper info h f (T.pack $ bucketObject info k) p
     -- Store public URL to item in Internet Archive.
     when (isIA info && not (isChunkKey k)) $
         setUrlPresent webUUID k (iaPublicKeyUrl info k)
     return True
-  where
-    singlepartupload k f p = do
-        rbody <- liftIO $ httpBodyStorer f p
-        void $ sendS3Handle h $ putObject info (T.pack $ bucketObject info k) rbody
-    multipartupload fsz partsz k f p = do
-#if MIN_VERSION_aws(0,10,6)
-        let object = T.pack (bucketObject info k)
 
+storeHelper :: S3Info -> S3Handle -> FilePath -> S3.Object -> MeterUpdate -> Annex ()
+storeHelper info h f object p = case partSize info of
+    Just partsz | partsz > 0 -> do
+        fsz <- liftIO $ getFileSize f
+        if fsz > partsz
+            then multipartupload fsz partsz
+            else singlepartupload
+    _ -> singlepartupload
+  where
+    singlepartupload = do
+        rbody <- liftIO $ httpBodyStorer f p
+        void $ sendS3Handle h $ putObject info object rbody
+    multipartupload fsz partsz = do
+#if MIN_VERSION_aws(0,10,6)
     let startreq = (S3.postInitiateMultipartUpload (bucket info) object)
         { S3.imuStorageClass = Just (storageClass info)
         , S3.imuMetadata = metaHeaders info
@@ -227,16 +235,27 @@ store _r info h = fileStorer $ \k f p -> do
         (bucket info) object uploadid (zip [1..] etags)
 #else
     warning $ "Cannot do multipart upload (partsize " ++ show partsz ++ ") of large file (" ++ show fsz ++ "); built with too old a version of the aws library."
-    singlepartupload k f p
+    singlepartupload
 #endif
 
 {- Implemented as a fileRetriever, that uses conduit to stream the chunks
 - out to the file. Would be better to implement a byteRetriever, but
 - that is difficult. -}
 retrieve :: Remote -> S3Info -> Maybe S3Handle -> Retriever
-retrieve _ info (Just h) = fileRetriever $ \f k p -> liftIO $ runResourceT $ do
+retrieve _ info (Just h) = fileRetriever $ \f k p ->
+    retrieveHelper info h (T.pack $ bucketObject info k) f p
+retrieve r info Nothing = case getpublicurl info of
+    Nothing -> \_ _ _ -> do
+        warnMissingCredPairFor "S3" (AWS.creds $ uuid r)
+        return False
+    Just geturl -> fileRetriever $ \f k p ->
+        unlessM (downloadUrl k p [geturl k] f) $
+            giveup "failed to download content"
+
+retrieveHelper :: S3Info -> S3Handle -> S3.Object -> FilePath -> MeterUpdate -> Annex ()
+retrieveHelper info h object f p = liftIO $ runResourceT $ do
     (fr, fh) <- allocate (openFile f WriteMode) hClose
-    let req = S3.getObject (bucket info) (T.pack $ bucketObject info k)
+    let req = S3.getObject (bucket info) object
     S3.GetObjectResponse { S3.gorResponse = rsp } <- sendS3Handle' h req
     responseBody rsp $$+- sinkprogressfile fh p zeroBytesProcessed
     release fr
@@ -251,13 +270,6 @@ retrieve _ info (Just h) = fileRetriever $ \f k p -> liftIO $ runResourceT $ do
             void $ meterupdate sofar'
             S.hPut fh bs
             sinkprogressfile fh meterupdate sofar'
-retrieve r info Nothing = case getpublicurl info of
-    Nothing -> \_ _ _ -> do
-        warnMissingCredPairFor "S3" (AWS.creds $ uuid r)
-        return False
-    Just geturl -> fileRetriever $ \f k p ->
-        unlessM (downloadUrl k p [geturl k] f) $
-            giveup "failed to download content"
 
 retrieveCheap :: Key -> AssociatedFile -> FilePath -> Annex Bool
 retrieveCheap _ _ _ = return False
@@ -276,8 +288,19 @@ remove info h k
     return $ either (const False) (const True) res
 
 checkKey :: Remote -> S3Info -> Maybe S3Handle -> CheckPresent
+checkKey r info Nothing k = case getpublicurl info of
+    Nothing -> do
+        warnMissingCredPairFor "S3" (AWS.creds $ uuid r)
+        giveup "No S3 credentials configured"
+    Just geturl -> do
+        showChecking r
+        withUrlOptions $ checkBoth (geturl k) (keySize k)
 checkKey r info (Just h) k = do
     showChecking r
+    checkKeyHelper info h (T.pack $ bucketObject info k)
+
+checkKeyHelper :: S3Info -> S3Handle -> S3.Object -> Annex Bool
+checkKeyHelper info h object = do
 #if MIN_VERSION_aws(0,10,0)
     rsp <- go
     return (isJust $ S3.horMetadata rsp)
@@ -287,8 +310,7 @@ checkKey r info (Just h) k = do
     return True
 #endif
   where
-    go = sendS3Handle h $
-        S3.headObject (bucket info) (T.pack $ bucketObject info k)
+    go = sendS3Handle h $ S3.headObject (bucket info) object
 
 #if ! MIN_VERSION_aws(0,10,0)
 {- Catch exception headObject returns when an object is not present
@@ -303,13 +325,50 @@ checkKey r info (Just h) k = do
     | otherwise = Nothing
 #endif
 
-checkKey r info Nothing k = case getpublicurl info of
-    Nothing -> do
-        warnMissingCredPairFor "S3" (AWS.creds $ uuid r)
-        giveup "No S3 credentials configured"
-    Just geturl -> do
-        showChecking r
-        withUrlOptions $ checkBoth (geturl k) (keySize k)
+storeExportS3 :: Remote -> S3Info -> FilePath -> Key -> ExportLocation -> MeterUpdate -> Annex Bool
+storeExportS3 r info f _k loc p =
+    catchNonAsync go (\e -> warning (show e) >> return False)
+  where
+    go = withS3Handle (config r) (gitconfig r) (uuid r) $ \h -> do
+        storeHelper info h f (T.pack $ bucketExportLocation info loc) p
+        return True
+
+retrieveExportS3 :: Remote -> S3Info -> Key -> ExportLocation -> FilePath -> MeterUpdate -> Annex Bool
+retrieveExportS3 r info _k loc f p =
+    catchNonAsync go (\e -> warning (show e) >> return False)
+  where
+    go = withS3Handle (config r) (gitconfig r) (uuid r) $ \h -> do
+        retrieveHelper info h (T.pack $ bucketExportLocation info loc) f p
+        return True
+
+removeExportS3 :: Remote -> S3Info -> Key -> ExportLocation -> Annex Bool
+removeExportS3 r info _k loc =
+    catchNonAsync go (\e -> warning (show e) >> return False)
+  where
+    go = withS3Handle (config r) (gitconfig r) (uuid r) $ \h -> do
+        res <- tryNonAsync $ sendS3Handle h $
+            S3.DeleteObject (T.pack $ bucketExportLocation info loc) (bucket info)
+        return $ either (const False) (const True) res
+
+checkPresentExportS3 :: Remote -> S3Info -> Key -> ExportLocation -> Annex Bool
+checkPresentExportS3 r info _k loc =
+    catchNonAsync go (\e -> warning (show e) >> return False)
+  where
+    go = withS3Handle (config r) (gitconfig r) (uuid r) $ \h -> do
+        checkKeyHelper info h (T.pack $ bucketExportLocation info loc)
+
+renameExportS3 :: Remote -> S3Info -> Key -> ExportLocation -> ExportLocation -> Annex Bool
+renameExportS3 r info _k src dest = catchNonAsync go (\e -> warning (show e) >> return False)
+  where
+    go = withS3Handle (config r) (gitconfig r) (uuid r) $ \h -> do
+        -- S3 has no move primitive; copy and delete.
+        void $ sendS3Handle h $ S3.copyObject (bucket info) dstobject
+            (S3.ObjectId (bucket info) srcobject Nothing)
+            S3.CopyMetadata
+        void $ sendS3Handle h $ S3.DeleteObject srcobject (bucket info)
+        return True
+    srcobject = T.pack $ bucketExportLocation info src
+    dstobject = T.pack $ bucketExportLocation info dest
 
 {- Generate the bucket if it does not already exist, including creating the
 - UUID file within the bucket.
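The renameExportS3 action added above notes that S3 has no move primitive, so a rename is a copy followed by a delete. A rough standalone sketch of that pattern (hypothetical names, with an in-memory Map standing in for the bucket):

    import qualified Data.Map.Strict as M

    -- Object name -> contents; a toy stand-in for a bucket.
    type Bucket = M.Map String String

    -- Rename by copying the object to the new name, then deleting the old one.
    renameObject :: String -> String -> Bucket -> Maybe Bucket
    renameObject src dest b = do
        v <- M.lookup src b
        return (M.delete src (M.insert dest v b))

    main :: IO ()
    main = print (renameObject "old/path.jpg" "new/path.jpg"
        (M.fromList [("old/path.jpg", "contents")]))

In the real code the copy and delete are two separate requests, so an interrupted rename can briefly leave the object present under both names.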
@@ -474,6 +533,7 @@ data S3Info = S3Info
     { bucket :: S3.Bucket
     , storageClass :: S3.StorageClass
     , bucketObject :: Key -> String
+    , bucketExportLocation :: ExportLocation -> String
     , metaHeaders :: [(T.Text, T.Text)]
     , partSize :: Maybe Integer
     , isIA :: Bool
@@ -491,6 +551,7 @@ extractS3Info c = do
         { bucket = b
         , storageClass = getStorageClass c
         , bucketObject = getBucketObject c
+        , bucketExportLocation = getBucketExportLocation c
        , metaHeaders = getMetaHeaders c
         , partSize = getPartSize c
         , isIA = configIA c
@@ -554,6 +615,9 @@ getBucketObject c = munge . key2file
     Just "ia" -> iaMunge $ getFilePrefix c ++ s
     _ -> getFilePrefix c ++ s
 
+getBucketExportLocation :: RemoteConfig -> ExportLocation -> FilePath
+getBucketExportLocation c (ExportLocation loc) = getFilePrefix c ++ loc
+
 {- Internet Archive limits filenames to a subset of ascii,
 - with no whitespace. Other characters are xml entity
 - encoded. -}
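Taken together, the Remote/S3.hs changes route both the key-based store/retrieve paths and the new export actions through storeHelper, retrieveHelper and checkKeyHelper, which care only about the final S3 object name. A simplified standalone sketch of that shape (stub types and IO actions, hypothetical names, not the real module):

    -- Stub configuration; partSize mirrors the remote's configured part size.
    data Info = Info { partSize :: Maybe Integer, prefix :: String }

    putSingle :: String -> FilePath -> IO ()
    putSingle object f = putStrLn ("single-part PUT " ++ object ++ " <- " ++ f)

    putMultipart :: Integer -> Integer -> String -> FilePath -> IO ()
    putMultipart fsz partsz object f = putStrLn
        ("multipart PUT " ++ object ++ " <- " ++ f
            ++ " (" ++ show fsz ++ " bytes in parts of " ++ show partsz ++ ")")

    -- One helper decides single- vs multipart upload from the object name alone.
    storeHelper :: Info -> FilePath -> String -> Integer -> IO ()
    storeHelper info f object fsz = case partSize info of
        Just partsz | partsz > 0, fsz > partsz -> putMultipart fsz partsz object f
        _ -> putSingle object f

    -- Key-based storage names the object after the key...
    storeKey :: Info -> String -> FilePath -> Integer -> IO ()
    storeKey info key f fsz = storeHelper info f (prefix info ++ key) fsz

    -- ...while export names it after the exported path, unmunged.
    storeExport :: Info -> FilePath -> FilePath -> Integer -> IO ()
    storeExport info f loc fsz = storeHelper info f (prefix info ++ loc) fsz

    main :: IO ()
    main = do
        let info = Info (Just (100 * 1024 * 1024)) ""
        storeKey info "SHA256E-s9--0123abcd" "/tmp/object" 9
        storeExport info "/tmp/bigfile" "videos/talk.webm" (500 * 1024 * 1024)

The design choice is that export differs from key storage only in how the object name is derived (bucketExportLocation versus bucketObject); everything below that point is shared.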
@@ -66,6 +66,10 @@ the S3 remote.
   so by default, a bucket name is chosen based on the remote name
   and UUID. This can be specified to pick a bucket name.
 
+* exporttree - Set to "yes" to make this special remote usable
+  by [[git-annex-export]]. It will not be usable as a general-purpose
+  special remote.
+
 * public - Set to "yes" to allow public read access to files sent
   to the S3 remote. This is accomplished by setting an ACL when each
   file is uploaded to the remote. So, changes to this setting will
@@ -55,31 +55,14 @@ from it. Also, git-annex whereis will tell you a public url for the file
 on archive.org. (It may take a while for archive.org to make the file
 publically visibile.)
 
-Note the use of the SHA256E [[backend|backends]] when adding files. That is
-the default backend used by git-annex, but even if you don't normally use
-it, it makes most sense to use the WORM or SHA256E backend for files that
-will be stored in the Internet Archive, since the key name will be exposed
-as the filename there, and since the Archive does special processing of
-files based on their extension.
-
-## publishing only one subdirectory
-
-Perhaps you have a repository with lots of files in it, and only want
-to publish some of them to a particular Internet Archive item. Of course
-you can specify which files to send manually, but it's useful to
-configure [[preferred_content]] settings so git-annex knows what content
-you want to store in the Internet Archive.
-
-One way to do this is using the "public" repository type.
-
-    git annex enableremote archive-panama preferreddir=panama
-    git annex wanted archive-panama standard
-    git annex group archive-panama public
-
-Now anything in a "panama" directory will be sent to that remote,
-and anything else won't. You can use git annex copy --auto or the
-assistant and it'll do the right thing.
-
-When setting up an Internet Archive item using the webapp, this
-configuration is automatically done, using an item name that the user
-enters as the name of the subdirectory.
+## exporting trees
+
+By default, files stored in the Internet Archive will show up there named
+by their git-annex key, not the original filename. If the filenames
+are important, you can run git annex initremote with an additional
+parameter "exporttree=yes", and then use [[git-annex-export]] to publish
+a tree of files to the Internet Archive.
+
+Note that the Internet Archive does not support filenames containing
+whitespace and some other characters. Exporting such problem filenames will
+fail; you can rename the file and re-export.
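Since exported files keep their tree paths as filenames, the public URL for each exported file becomes predictable. A small illustration (hypothetical helper; the URL layout is assumed from archive.org's usual download URLs, not taken from this commit):

    -- Hypothetical helper; assumes https://archive.org/download/<item>/<file>.
    iaExportUrl :: String -> FilePath -> String
    iaExportUrl item loc = "https://archive.org/download/" ++ item ++ "/" ++ loc

    main :: IO ()
    main = putStrLn (iaExportUrl "panama-photos" "photos/cat.jpg")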
@@ -24,8 +24,14 @@ Work is in progress. Todo list:
   export from another repository also doesn't work right, because the
   export database is not populated. So, seems that the export database needs
   to get populated based on the export log in these cases.
-* Support export to aditional special remotes (S3 etc)
-* Support export to external special remotes.
+* Support export to aditional special remotes (webdav etc)
+* Support export in the assistant (when eg setting up a S3 special remote).
+  Would need git-annex sync to export to the master tree?
+  This is similar to the little-used preferreddir= preferred content
+  setting and the "public" repository group.
+* Test S3 export.
+* Test export to IA via S3. In particualar, does removing an exported file
+  work?
 
 Low priority: