S3 export (untested)

It opens an HTTP connection per file exported, but then so does git
annex copy --to s3.

Decided not to munge exported filenames for IA; there is too large a
chance of the munging having confusing results. Instead, exporting a file
whose name IA does not support, e.g. one containing spaces, will fail.

This commit was supported by the NSF-funded DataLad project.
Joey Hess 2017-09-08 15:41:31 -04:00
parent a1b195d84c
commit 44cd5ae313
5 changed files with 121 additions and 64 deletions


@@ -4,7 +4,7 @@ git-annex (6.20170819) UNRELEASED; urgency=medium
exports of trees to special remotes.
* Use git-annex initremote with exporttree=yes to set up a special remote
for use by git-annex export.
* Implemented export to directory special remotes.
* Implemented export to directory and S3 special remotes.
* External special remote protocol extended to support export.
* Support building with feed-1.0, while still supporting older versions.
* init: Display an additional message when it detects a filesystem that


@@ -59,7 +59,7 @@ remote = RemoteType
, enumerate = const (findSpecialRemotes "s3")
, generate = gen
, setup = s3Setup
, exportSupported = exportUnsupported
, exportSupported = exportIsSupported
}
gen :: Git.Repo -> UUID -> RemoteConfig -> RemoteGitConfig -> Annex (Maybe Remote)
@@ -86,7 +86,13 @@ gen r u c gc = do
, lockContent = Nothing
, checkPresent = checkPresentDummy
, checkPresentCheap = False
, exportActions = exportUnsupported
, exportActions = ExportActions
{ storeExport = storeExportS3 this info
, retrieveExport = retrieveExportS3 this info
, removeExport = removeExportS3 this info
, checkPresentExport = checkPresentExportS3 this info
, renameExport = renameExportS3 this info
}
, whereisKey = Just (getWebUrls info)
, remoteFsck = Nothing
, repairRepo = Nothing
@@ -107,6 +113,7 @@ s3Setup :: SetupStage -> Maybe UUID -> Maybe CredPair -> RemoteConfig -> RemoteG
s3Setup ss mu mcreds c gc = do
u <- maybe (liftIO genUUID) return mu
s3Setup' ss u mcreds c gc
s3Setup' :: SetupStage -> UUID -> Maybe CredPair -> RemoteConfig -> RemoteGitConfig -> Annex (RemoteConfig, UUID)
s3Setup' ss u mcreds c gc
| configIA c = archiveorg
@@ -170,25 +177,26 @@ prepareS3HandleMaybe r = resourcePrepare $ const $
store :: Remote -> S3Info -> S3Handle -> Storer
store _r info h = fileStorer $ \k f p -> do
case partSize info of
Just partsz | partsz > 0 -> do
fsz <- liftIO $ getFileSize f
if fsz > partsz
then multipartupload fsz partsz k f p
else singlepartupload k f p
_ -> singlepartupload k f p
storeHelper info h f (T.pack $ bucketObject info k) p
-- Store public URL to item in Internet Archive.
when (isIA info && not (isChunkKey k)) $
setUrlPresent webUUID k (iaPublicKeyUrl info k)
return True
where
singlepartupload k f p = do
rbody <- liftIO $ httpBodyStorer f p
void $ sendS3Handle h $ putObject info (T.pack $ bucketObject info k) rbody
multipartupload fsz partsz k f p = do
#if MIN_VERSION_aws(0,10,6)
let object = T.pack (bucketObject info k)
storeHelper :: S3Info -> S3Handle -> FilePath -> S3.Object -> MeterUpdate -> Annex ()
storeHelper info h f object p = case partSize info of
Just partsz | partsz > 0 -> do
fsz <- liftIO $ getFileSize f
if fsz > partsz
then multipartupload fsz partsz
else singlepartupload
_ -> singlepartupload
where
singlepartupload = do
rbody <- liftIO $ httpBodyStorer f p
void $ sendS3Handle h $ putObject info object rbody
multipartupload fsz partsz = do
#if MIN_VERSION_aws(0,10,6)
let startreq = (S3.postInitiateMultipartUpload (bucket info) object)
{ S3.imuStorageClass = Just (storageClass info)
, S3.imuMetadata = metaHeaders info
@@ -227,16 +235,27 @@ store _r info h = fileStorer $ \k f p -> do
(bucket info) object uploadid (zip [1..] etags)
#else
warning $ "Cannot do multipart upload (partsize " ++ show partsz ++ ") of large file (" ++ show fsz ++ "); built with too old a version of the aws library."
singlepartupload k f p
singlepartupload
#endif
{- Implemented as a fileRetriever, that uses conduit to stream the chunks
- out to the file. Would be better to implement a byteRetriever, but
- that is difficult. -}
retrieve :: Remote -> S3Info -> Maybe S3Handle -> Retriever
retrieve _ info (Just h) = fileRetriever $ \f k p -> liftIO $ runResourceT $ do
retrieve _ info (Just h) = fileRetriever $ \f k p ->
retrieveHelper info h (T.pack $ bucketObject info k) f p
retrieve r info Nothing = case getpublicurl info of
Nothing -> \_ _ _ -> do
warnMissingCredPairFor "S3" (AWS.creds $ uuid r)
return False
Just geturl -> fileRetriever $ \f k p ->
unlessM (downloadUrl k p [geturl k] f) $
giveup "failed to download content"
retrieveHelper :: S3Info -> S3Handle -> S3.Object -> FilePath -> MeterUpdate -> Annex ()
retrieveHelper info h object f p = liftIO $ runResourceT $ do
(fr, fh) <- allocate (openFile f WriteMode) hClose
let req = S3.getObject (bucket info) (T.pack $ bucketObject info k)
let req = S3.getObject (bucket info) object
S3.GetObjectResponse { S3.gorResponse = rsp } <- sendS3Handle' h req
responseBody rsp $$+- sinkprogressfile fh p zeroBytesProcessed
release fr
@@ -251,13 +270,6 @@ retrieve _ info (Just h) = fileRetriever $ \f k p -> liftIO $ runResourceT $ do
void $ meterupdate sofar'
S.hPut fh bs
sinkprogressfile fh meterupdate sofar'
retrieve r info Nothing = case getpublicurl info of
Nothing -> \_ _ _ -> do
warnMissingCredPairFor "S3" (AWS.creds $ uuid r)
return False
Just geturl -> fileRetriever $ \f k p ->
unlessM (downloadUrl k p [geturl k] f) $
giveup "failed to download content"
retrieveCheap :: Key -> AssociatedFile -> FilePath -> Annex Bool
retrieveCheap _ _ _ = return False
@@ -276,8 +288,19 @@ remove info h k
return $ either (const False) (const True) res
checkKey :: Remote -> S3Info -> Maybe S3Handle -> CheckPresent
checkKey r info Nothing k = case getpublicurl info of
Nothing -> do
warnMissingCredPairFor "S3" (AWS.creds $ uuid r)
giveup "No S3 credentials configured"
Just geturl -> do
showChecking r
withUrlOptions $ checkBoth (geturl k) (keySize k)
checkKey r info (Just h) k = do
showChecking r
checkKeyHelper info h (T.pack $ bucketObject info k)
checkKeyHelper :: S3Info -> S3Handle -> S3.Object -> Annex Bool
checkKeyHelper info h object = do
#if MIN_VERSION_aws(0,10,0)
rsp <- go
return (isJust $ S3.horMetadata rsp)
@@ -287,8 +310,7 @@ checkKey r info (Just h) k = do
return True
#endif
where
go = sendS3Handle h $
S3.headObject (bucket info) (T.pack $ bucketObject info k)
go = sendS3Handle h $ S3.headObject (bucket info) object
#if ! MIN_VERSION_aws(0,10,0)
{- Catch exception headObject returns when an object is not present
@@ -303,13 +325,50 @@ checkKey r info (Just h) k = do
| otherwise = Nothing
#endif
checkKey r info Nothing k = case getpublicurl info of
Nothing -> do
warnMissingCredPairFor "S3" (AWS.creds $ uuid r)
giveup "No S3 credentials configured"
Just geturl -> do
showChecking r
withUrlOptions $ checkBoth (geturl k) (keySize k)
storeExportS3 :: Remote -> S3Info -> FilePath -> Key -> ExportLocation -> MeterUpdate -> Annex Bool
storeExportS3 r info f _k loc p =
catchNonAsync go (\e -> warning (show e) >> return False)
where
go = withS3Handle (config r) (gitconfig r) (uuid r) $ \h -> do
storeHelper info h f (T.pack $ bucketExportLocation info loc) p
return True
retrieveExportS3 :: Remote -> S3Info -> Key -> ExportLocation -> FilePath -> MeterUpdate -> Annex Bool
retrieveExportS3 r info _k loc f p =
catchNonAsync go (\e -> warning (show e) >> return False)
where
go = withS3Handle (config r) (gitconfig r) (uuid r) $ \h -> do
retrieveHelper info h (T.pack $ bucketExportLocation info loc) f p
return True
removeExportS3 :: Remote -> S3Info -> Key -> ExportLocation -> Annex Bool
removeExportS3 r info _k loc =
catchNonAsync go (\e -> warning (show e) >> return False)
where
go = withS3Handle (config r) (gitconfig r) (uuid r) $ \h -> do
res <- tryNonAsync $ sendS3Handle h $
S3.DeleteObject (T.pack $ bucketExportLocation info loc) (bucket info)
return $ either (const False) (const True) res
checkPresentExportS3 :: Remote -> S3Info -> Key -> ExportLocation -> Annex Bool
checkPresentExportS3 r info _k loc =
catchNonAsync go (\e -> warning (show e) >> return False)
where
go = withS3Handle (config r) (gitconfig r) (uuid r) $ \h -> do
checkKeyHelper info h (T.pack $ bucketExportLocation info loc)
renameExportS3 :: Remote -> S3Info -> Key -> ExportLocation -> ExportLocation -> Annex Bool
renameExportS3 r info _k src dest = catchNonAsync go (\e -> warning (show e) >> return False)
where
go = withS3Handle (config r) (gitconfig r) (uuid r) $ \h -> do
-- S3 has no move primitive; copy and delete.
void $ sendS3Handle h $ S3.copyObject (bucket info) dstobject
(S3.ObjectId (bucket info) srcobject Nothing)
S3.CopyMetadata
void $ sendS3Handle h $ S3.DeleteObject srcobject (bucket info)
return True
srcobject = T.pack $ bucketExportLocation info src
dstobject = T.pack $ bucketExportLocation info dest
{- Generate the bucket if it does not already exist, including creating the
- UUID file within the bucket.
@@ -474,6 +533,7 @@ data S3Info = S3Info
{ bucket :: S3.Bucket
, storageClass :: S3.StorageClass
, bucketObject :: Key -> String
, bucketExportLocation :: ExportLocation -> String
, metaHeaders :: [(T.Text, T.Text)]
, partSize :: Maybe Integer
, isIA :: Bool
@@ -491,6 +551,7 @@ extractS3Info c = do
{ bucket = b
, storageClass = getStorageClass c
, bucketObject = getBucketObject c
, bucketExportLocation = getBucketExportLocation c
, metaHeaders = getMetaHeaders c
, partSize = getPartSize c
, isIA = configIA c
@@ -554,6 +615,9 @@ getBucketObject c = munge . key2file
Just "ia" -> iaMunge $ getFilePrefix c ++ s
_ -> getFilePrefix c ++ s
getBucketExportLocation :: RemoteConfig -> ExportLocation -> FilePath
getBucketExportLocation c (ExportLocation loc) = getFilePrefix c ++ loc
{- Internet Archive limits filenames to a subset of ascii,
- with no whitespace. Other characters are xml entity
- encoded. -}


@@ -66,6 +66,10 @@ the S3 remote.
so by default, a bucket name is chosen based on the remote name
and UUID. This can be specified to pick a bucket name.
* `exporttree` - Set to "yes" to make this special remote usable
by [[git-annex-export]]. It will not be usable as a general-purpose
special remote.
* `public` - Set to "yes" to allow public read access to files sent
to the S3 remote. This is accomplished by setting an ACL when each
file is uploaded to the remote. So, changes to this setting will
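
To illustrate the `exporttree` option described above, a hedged example; "mys3" is a placeholder remote name, the bucket is chosen automatically as noted earlier, and AWS credentials are assumed to be set in the environment as usual:

    # placeholder remote name; AWS creds taken from the environment
    git annex initremote mys3 type=S3 encryption=none exporttree=yes
    git annex export master --to mys3

The export publishes the files of the given tree under their original filenames rather than under git-annex keys.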


@@ -55,31 +55,14 @@ from it. Also, git-annex whereis will tell you a public url for the file
on archive.org. (It may take a while for archive.org to make the file
publicly visible.)
Note the use of the SHA256E [[backend|backends]] when adding files. That is
the default backend used by git-annex, but even if you don't normally use
it, it makes most sense to use the WORM or SHA256E backend for files that
will be stored in the Internet Archive, since the key name will be exposed
as the filename there, and since the Archive does special processing of
files based on their extension.
## exporting trees
## publishing only one subdirectory
By default, files stored in the Internet Archive will show up there named
by their git-annex key, not the original filename. If the filenames
are important, you can run `git annex initremote` with an additional
parameter "exporttree=yes", and then use [[git-annex-export]] to publish
a tree of files to the Internet Archive.
Perhaps you have a repository with lots of files in it, and only want
to publish some of them to a particular Internet Archive item. Of course
you can specify which files to send manually, but it's useful to
configure [[preferred_content]] settings so git-annex knows what content
you want to store in the Internet Archive.
One way to do this is using the "public" repository type.
git annex enableremote archive-panama preferreddir=panama
git annex wanted archive-panama standard
git annex group archive-panama public
Now anything in a "panama" directory will be sent to that remote,
and anything else won't. You can use `git annex copy --auto` or the
assistant and it'll do the right thing.
When setting up an Internet Archive item using the webapp, this
configuration is automatically done, using an item name that the user
enters as the name of the subdirectory.
Note that the Internet Archive does not support filenames containing
whitespace and some other characters. Exporting such problem filenames will
fail; you can rename the file and re-export.
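
A sketch of the workflow described above for the Internet Archive, assuming the Archive's S3 endpoint (s3.us.archive.org); the remote, item, and file names are placeholders:

    # "ia-videos", "example-item" and "video.mpeg" are placeholders
    git annex initremote ia-videos type=S3 host=s3.us.archive.org bucket=example-item encryption=none exporttree=yes
    git annex export master --to ia-videos
    git annex whereis video.mpeg

Once the Archive has processed the item, git-annex whereis reports the public archive.org url for each exported file. A file with whitespace in its name will make the export fail until it is renamed and the tree re-exported.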


@@ -24,8 +24,14 @@ Work is in progress. Todo list:
export from another repository also doesn't work right, because the
export database is not populated. So, seems that the export database needs
to get populated based on the export log in these cases.
* Support export to additional special remotes (S3 etc)
* Support export to external special remotes.
* Support export to additional special remotes (webdav etc)
* Support export in the assistant (when e.g. setting up an S3 special remote).
Would need git-annex sync to export to the master tree?
This is similar to the little-used preferreddir= preferred content
setting and the "public" repository group.
* Test S3 export.
* Test export to IA via S3. In particular, does removing an exported file
work?
Low priority: