From b4305315b26460bc5f694e50cbab4f2d713ad90e Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 10 Oct 2022 17:37:26 -0400 Subject: [PATCH] S3: pass fileprefix into getBucket calls S3: Speed up importing from a large bucket when fileprefix= is set by only asking for files under the prefix. getBucket still returns the files with the prefix included, so the rest of the fileprefix stripping still works unchanged. Sponsored-by: Dartmouth College's DANDI project --- CHANGELOG | 2 ++ Remote/S3.hs | 13 ++++++--- ..._b0d9dbe81f01e80809381a9e5f6a883d._comment | 27 +++++++++++++++++++ 3 files changed, 38 insertions(+), 4 deletions(-) create mode 100644 doc/todo/allow_for_annonymous_AWS_S3_access/comment_8_b0d9dbe81f01e80809381a9e5f6a883d._comment diff --git a/CHANGELOG b/CHANGELOG index 8e8a0afb2b..c393dad101 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -6,6 +6,8 @@ git-annex (10.20221004) UNRELEASED; urgency=medium do not operate on a repository that has an empty name. * move: Fix openFile crash with -J (Fixes a reversion in 8.20201103) + * S3: Speed up importing from a large bucket when fileprefix= is set + by only asking for files under the prefix. 
-- Joey Hess Mon, 03 Oct 2022 13:36:42 -0400 diff --git a/Remote/S3.hs b/Remote/S3.hs index 1f0ebd3d5a..46a9bc49ce 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -216,7 +216,7 @@ gen r u rc gc rs = do , renameExport = renameExportS3 hdl this rs info } , importActions = ImportActions - { listImportableContents = listImportableContentsS3 hdl this info + { listImportableContents = listImportableContentsS3 hdl this info c , importKey = Nothing , retrieveExportWithContentIdentifier = retrieveExportWithContentIdentifierS3 hdl this rs info , storeExportWithContentIdentifier = storeExportWithContentIdentifierS3 hdl this rs info magic @@ -548,8 +548,8 @@ renameExportS3 hv r rs info k src dest = Just <$> go srcobject = T.pack $ bucketExportLocation info src dstobject = T.pack $ bucketExportLocation info dest -listImportableContentsS3 :: S3HandleVar -> Remote -> S3Info -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize))) -listImportableContentsS3 hv r info = +listImportableContentsS3 :: S3HandleVar -> Remote -> S3Info -> ParsedRemoteConfig -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize))) +listImportableContentsS3 hv r info c = withS3Handle hv $ \case Nothing -> giveup $ needS3Creds (uuid r) Just h -> Just <$> go h @@ -558,6 +558,8 @@ listImportableContentsS3 hv r info = ic <- liftIO $ runResourceT $ extractFromResourceT =<< startlist h return (ImportableContentsComplete ic) + fileprefix = T.pack <$> getRemoteConfigValue fileprefixField c + startlist h | versioning info = do rsp <- sendS3Handle h $ @@ -565,7 +567,8 @@ listImportableContentsS3 hv r info = continuelistversioned h [] rsp | otherwise = do rsp <- sendS3Handle h $ - S3.getBucket (bucket info) + (S3.getBucket (bucket info)) + { S3.gbPrefix = fileprefix } continuelistunversioned h [] rsp continuelistunversioned h l rsp @@ -573,6 +576,7 @@ listImportableContentsS3 hv r info = rsp' <- sendS3Handle h $ (S3.getBucket (bucket info)) { S3.gbMarker = 
S3.gbrNextMarker rsp + , S3.gbPrefix = fileprefix } continuelistunversioned h (rsp:l) rsp' | otherwise = return $ @@ -584,6 +588,7 @@ listImportableContentsS3 hv r info = (S3.getBucketObjectVersions (bucket info)) { S3.gbovKeyMarker = S3.gbovrNextKeyMarker rsp , S3.gbovVersionIdMarker = S3.gbovrNextVersionIdMarker rsp + , S3.gbovPrefix = fileprefix } continuelistversioned h (rsp:l) rsp' | otherwise = return $ diff --git a/doc/todo/allow_for_annonymous_AWS_S3_access/comment_8_b0d9dbe81f01e80809381a9e5f6a883d._comment b/doc/todo/allow_for_annonymous_AWS_S3_access/comment_8_b0d9dbe81f01e80809381a9e5f6a883d._comment new file mode 100644 index 0000000000..7aa88f7cb6 --- /dev/null +++ b/doc/todo/allow_for_annonymous_AWS_S3_access/comment_8_b0d9dbe81f01e80809381a9e5f6a883d._comment @@ -0,0 +1,27 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 8""" + date="2022-10-10T21:04:49Z" + content=""" +I've finished the work on aws, which is in + and I hope will be merged soon. + +git-annex now has a branch `anons3` that implements this, when +the S3 remote is configured with signature=anonymous. + + $ git-annex initremote s3-origin type=S3 importtree=yes encryption=none bucket=dandiarchive fileprefix=zarr-checksums/2ac71edb-738c-40ac-bd8c-8ca985adaa12/ signature=anonymous + initremote s3-origin (checking bucket...) ok + (recording state in git...) + $ git-annex import master --from s3-origin + list s3-origin ok + import s3-origin .checksum + ok + import s3-origin 0/.checksum + ok + import s3-origin 0/0/.checksum + ok + ^C + +Also, I've fixed it to only list files in the fileprefix, which +sped up the listing a *lot* in this bucket with many other files. +"""]]