S3: pass fileprefix into getBucket calls

S3: Speed up importing from a large bucket when fileprefix= is set by only
asking for files under the prefix.

getBucket still returns the files with the prefix included, so the rest of
the fileprefix stripping still works unchanged.

Sponsored-by: Dartmouth College's DANDI project
This commit is contained in:
Joey Hess 2022-10-10 17:37:26 -04:00
parent 90f9671e00
commit b4305315b2
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
3 changed files with 38 additions and 4 deletions

View file

@ -6,6 +6,8 @@ git-annex (10.20221004) UNRELEASED; urgency=medium
do not operate on a repository that has an empty name.
* move: Fix openFile crash with -J
(Fixes a reversion in 8.20201103)
* S3: Speed up importing from a large bucket when fileprefix= is set
by only asking for files under the prefix.
-- Joey Hess <id@joeyh.name> Mon, 03 Oct 2022 13:36:42 -0400

View file

@ -216,7 +216,7 @@ gen r u rc gc rs = do
, renameExport = renameExportS3 hdl this rs info
}
, importActions = ImportActions
{ listImportableContents = listImportableContentsS3 hdl this info
{ listImportableContents = listImportableContentsS3 hdl this info c
, importKey = Nothing
, retrieveExportWithContentIdentifier = retrieveExportWithContentIdentifierS3 hdl this rs info
, storeExportWithContentIdentifier = storeExportWithContentIdentifierS3 hdl this rs info magic
@ -548,8 +548,8 @@ renameExportS3 hv r rs info k src dest = Just <$> go
srcobject = T.pack $ bucketExportLocation info src
dstobject = T.pack $ bucketExportLocation info dest
listImportableContentsS3 :: S3HandleVar -> Remote -> S3Info -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
listImportableContentsS3 hv r info =
listImportableContentsS3 :: S3HandleVar -> Remote -> S3Info -> ParsedRemoteConfig -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
listImportableContentsS3 hv r info c =
withS3Handle hv $ \case
Nothing -> giveup $ needS3Creds (uuid r)
Just h -> Just <$> go h
@ -558,6 +558,8 @@ listImportableContentsS3 hv r info =
ic <- liftIO $ runResourceT $ extractFromResourceT =<< startlist h
return (ImportableContentsComplete ic)
fileprefix = T.pack <$> getRemoteConfigValue fileprefixField c
startlist h
| versioning info = do
rsp <- sendS3Handle h $
@ -565,7 +567,8 @@ listImportableContentsS3 hv r info =
continuelistversioned h [] rsp
| otherwise = do
rsp <- sendS3Handle h $
S3.getBucket (bucket info)
(S3.getBucket (bucket info))
{ S3.gbPrefix = fileprefix }
continuelistunversioned h [] rsp
continuelistunversioned h l rsp
@ -573,6 +576,7 @@ listImportableContentsS3 hv r info =
rsp' <- sendS3Handle h $
(S3.getBucket (bucket info))
{ S3.gbMarker = S3.gbrNextMarker rsp
, S3.gbPrefix = fileprefix
}
continuelistunversioned h (rsp:l) rsp'
| otherwise = return $
@ -584,6 +588,7 @@ listImportableContentsS3 hv r info =
(S3.getBucketObjectVersions (bucket info))
{ S3.gbovKeyMarker = S3.gbovrNextKeyMarker rsp
, S3.gbovVersionIdMarker = S3.gbovrNextVersionIdMarker rsp
, S3.gbovPrefix = fileprefix
}
continuelistversioned h (rsp:l) rsp'
| otherwise = return $

View file

@ -0,0 +1,27 @@
[[!comment format=mdwn
username="joey"
subject="""comment 8"""
date="2022-10-10T21:04:49Z"
content="""
I've finished the work on aws, which is in
<https://github.com/aristidb/aws/pull/281> and I hope will be merged soon.
git-annex now has a branch `anons3` that implements this, when
the S3 remote is configured with signature=anonymous.
$ git-annex initremote s3-origin type=S3 importtree=yes encryption=none bucket=dandiarchive fileprefix=zarr-checksums/2ac71edb-738c-40ac-bd8c-8ca985adaa12/ signature=anonymous
initremote s3-origin (checking bucket...) ok
(recording state in git...)
$ git-annex import master --from s3-origin
list s3-origin ok
import s3-origin .checksum
ok
import s3-origin 0/.checksum
ok
import s3-origin 0/0/.checksum
ok
^C
Also, I've fixed it to only list files under the fileprefix, which
sped up the listing a *lot* in this bucket, which contains many other files.
"""]]