S3: pass fileprefix into getBucket calls
S3: Speed up importing from a large bucket when fileprefix= is set by only asking for files under the prefix.

getBucket still returns the files with the prefix included, so the rest of the fileprefix stripping still works unchanged.

Sponsored-by: Dartmouth College's DANDI project
This commit is contained in:
parent
90f9671e00
commit
b4305315b2
3 changed files with 38 additions and 4 deletions
|
@ -6,6 +6,8 @@ git-annex (10.20221004) UNRELEASED; urgency=medium
|
|||
do not operate on a repository that has an empty name.
|
||||
* move: Fix openFile crash with -J
|
||||
(Fixes a regression introduced in 8.20201103)
|
||||
* S3: Speed up importing from a large bucket when fileprefix= is set
|
||||
by only asking for files under the prefix.
|
||||
|
||||
-- Joey Hess <id@joeyh.name> Mon, 03 Oct 2022 13:36:42 -0400
|
||||
|
||||
|
|
13
Remote/S3.hs
13
Remote/S3.hs
|
@ -216,7 +216,7 @@ gen r u rc gc rs = do
|
|||
, renameExport = renameExportS3 hdl this rs info
|
||||
}
|
||||
, importActions = ImportActions
|
||||
{ listImportableContents = listImportableContentsS3 hdl this info
|
||||
{ listImportableContents = listImportableContentsS3 hdl this info c
|
||||
, importKey = Nothing
|
||||
, retrieveExportWithContentIdentifier = retrieveExportWithContentIdentifierS3 hdl this rs info
|
||||
, storeExportWithContentIdentifier = storeExportWithContentIdentifierS3 hdl this rs info magic
|
||||
|
@ -548,8 +548,8 @@ renameExportS3 hv r rs info k src dest = Just <$> go
|
|||
srcobject = T.pack $ bucketExportLocation info src
|
||||
dstobject = T.pack $ bucketExportLocation info dest
|
||||
|
||||
listImportableContentsS3 :: S3HandleVar -> Remote -> S3Info -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
|
||||
listImportableContentsS3 hv r info =
|
||||
listImportableContentsS3 :: S3HandleVar -> Remote -> S3Info -> ParsedRemoteConfig -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
|
||||
listImportableContentsS3 hv r info c =
|
||||
withS3Handle hv $ \case
|
||||
Nothing -> giveup $ needS3Creds (uuid r)
|
||||
Just h -> Just <$> go h
|
||||
|
@ -558,6 +558,8 @@ listImportableContentsS3 hv r info =
|
|||
ic <- liftIO $ runResourceT $ extractFromResourceT =<< startlist h
|
||||
return (ImportableContentsComplete ic)
|
||||
|
||||
fileprefix = T.pack <$> getRemoteConfigValue fileprefixField c
|
||||
|
||||
startlist h
|
||||
| versioning info = do
|
||||
rsp <- sendS3Handle h $
|
||||
|
@ -565,7 +567,8 @@ listImportableContentsS3 hv r info =
|
|||
continuelistversioned h [] rsp
|
||||
| otherwise = do
|
||||
rsp <- sendS3Handle h $
|
||||
S3.getBucket (bucket info)
|
||||
(S3.getBucket (bucket info))
|
||||
{ S3.gbPrefix = fileprefix }
|
||||
continuelistunversioned h [] rsp
|
||||
|
||||
continuelistunversioned h l rsp
|
||||
|
@ -573,6 +576,7 @@ listImportableContentsS3 hv r info =
|
|||
rsp' <- sendS3Handle h $
|
||||
(S3.getBucket (bucket info))
|
||||
{ S3.gbMarker = S3.gbrNextMarker rsp
|
||||
, S3.gbPrefix = fileprefix
|
||||
}
|
||||
continuelistunversioned h (rsp:l) rsp'
|
||||
| otherwise = return $
|
||||
|
@ -584,6 +588,7 @@ listImportableContentsS3 hv r info =
|
|||
(S3.getBucketObjectVersions (bucket info))
|
||||
{ S3.gbovKeyMarker = S3.gbovrNextKeyMarker rsp
|
||||
, S3.gbovVersionIdMarker = S3.gbovrNextVersionIdMarker rsp
|
||||
, S3.gbovPrefix = fileprefix
|
||||
}
|
||||
continuelistversioned h (rsp:l) rsp'
|
||||
| otherwise = return $
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
[[!comment format=mdwn
|
||||
username="joey"
|
||||
subject="""comment 8"""
|
||||
date="2022-10-10T21:04:49Z"
|
||||
content="""
|
||||
I've finished the work on aws, which is in
|
||||
<https://github.com/aristidb/aws/pull/281> and I hope will be merged soon.
|
||||
|
||||
git-annex now has a branch `anons3` that implements this, when
|
||||
the S3 remote is configured with signature=anonymous.
|
||||
|
||||
$ git-annex initremote s3-origin type=S3 importtree=yes encryption=none bucket=dandiarchive fileprefix=zarr-checksums/2ac71edb-738c-40ac-bd8c-8ca985adaa12/ signature=anonymous
|
||||
initremote s3-origin (checking bucket...) ok
|
||||
(recording state in git...)
|
||||
$ git-annex import master --from s3-origin
|
||||
list s3-origin ok
|
||||
import s3-origin .checksum
|
||||
ok
|
||||
import s3-origin 0/.checksum
|
||||
ok
|
||||
import s3-origin 0/0/.checksum
|
||||
ok
|
||||
^C
|
||||
|
||||
Also, I've fixed it to only list files in the fileprefix, which
|
||||
sped up the listing a *lot* in this bucket with many other files.
|
||||
"""]]
|
Loading…
Reference in a new issue