avoid storing the same filename repeatedly in versioned import from S3

Logically, this should make it need a lot less memory when files have
been changed many times. In my tests, though, it didn't seem to change
memory use at all. I'm unsure why, since the filename reuse itself is
working. It's possible the Response is not getting garbage collected due
to pinning. But as far as I can see, all parts of it that are retained
get copied in a way that won't keep the whole thing pinned in memory.
This commit is contained in:
Joey Hess 2024-11-15 15:11:56 -04:00
parent dc5bf24823
commit 51b2d6d8c5
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38

View file

@ -582,7 +582,7 @@ listImportableContentsS3 hv r info c =
| versioning info = do | versioning info = do
rsp <- sendS3Handle h $ rsp <- sendS3Handle h $
S3.getBucketObjectVersions (bucket info) S3.getBucketObjectVersions (bucket info)
continuelistversioned 0 h [] rsp continuelistversioned Nothing h [] rsp
| otherwise = do | otherwise = do
rsp <- sendS3Handle h $ rsp <- sendS3Handle h $
(S3.getBucket (bucket info)) (S3.getBucket (bucket info))
@ -608,11 +608,13 @@ listImportableContentsS3 hv r info c =
Nothing -> nomore Nothing -> nomore
| otherwise = nomore | otherwise = nomore
where where
nomore = return $ nomore = do
mkImportableContentsUnversioned l' <- extractFromResourceT $
(reverse (extractunversioned rsp:l)) extractunversioned rsp
return $ mkImportableContentsUnversioned
(reverse (l':l))
continuelistversioned n h l rsp continuelistversioned reuse h l rsp
| S3.gbovrIsTruncated rsp = do | S3.gbovrIsTruncated rsp = do
rsp' <- sendS3Handle h $ rsp' <- sendS3Handle h $
(S3.getBucketObjectVersions (bucket info)) (S3.getBucketObjectVersions (bucket info))
@ -620,12 +622,14 @@ listImportableContentsS3 hv r info c =
, S3.gbovVersionIdMarker = S3.gbovrNextVersionIdMarker rsp , S3.gbovVersionIdMarker = S3.gbovrNextVersionIdMarker rsp
, S3.gbovPrefix = fileprefix , S3.gbovPrefix = fileprefix
} }
l' <- extractFromResourceT $ (l', reuse') <- extractFromResourceT $
extractversioned rsp extractversioned reuse rsp
continuelistversioned (length l' + n) h (l':l) rsp' continuelistversioned reuse' h (l':l) rsp'
| otherwise = return $ | otherwise = do
mkImportableContentsVersioned (l', _) <- extractFromResourceT $
(reverse (extractversioned rsp:l)) extractversioned reuse rsp
return $ mkImportableContentsVersioned
(reverse (l':l))
extractunversioned = mapMaybe extractunversioned' . S3.gbrContents extractunversioned = mapMaybe extractunversioned' . S3.gbrContents
extractunversioned' oi = do extractunversioned' oi = do
@ -635,14 +639,24 @@ listImportableContentsS3 hv r info c =
let cid = mkS3UnversionedContentIdentifier $ S3.objectETag oi let cid = mkS3UnversionedContentIdentifier $ S3.objectETag oi
return (loc, (cid, sz)) return (loc, (cid, sz))
extractversioned = mapMaybe extractversioned' . S3.gbovrContents extractversioned reuse = extractversioned' reuse . S3.gbovrContents
extractversioned' ovi@(S3.ObjectVersion {}) = do extractversioned' reuse [] = ([], reuse)
extractversioned' reuse (x:xs) = case extractversioned'' reuse x of
Just (v, reuse') ->
let (l, reuse'') = extractversioned' reuse' xs
in (v:l, reuse'')
Nothing -> extractversioned' reuse xs
extractversioned'' reuse ovi@(S3.ObjectVersion {}) = do
loc <- bucketImportLocation info $ loc <- bucketImportLocation info $
T.unpack $ S3.oviKey ovi T.unpack $ S3.oviKey ovi
-- Avoid storing the same filename in memory repeatedly.
let loc' = case reuse of
Just reuseloc | reuseloc == loc -> reuseloc
_ -> loc
let sz = S3.oviSize ovi let sz = S3.oviSize ovi
let cid = mkS3VersionedContentIdentifier' ovi let cid = mkS3VersionedContentIdentifier' ovi
return ((loc, (cid, sz)), S3.oviLastModified ovi) return (((loc', (cid, sz)), S3.oviLastModified ovi), Just loc')
extractversioned' (S3.DeleteMarker {}) = Nothing extractversioned'' _ (S3.DeleteMarker {}) = Nothing
mkImportableContentsUnversioned :: [[(ImportLocation, (ContentIdentifier, ByteSize))]] -> ImportableContents (ContentIdentifier, ByteSize) mkImportableContentsUnversioned :: [[(ImportLocation, (ContentIdentifier, ByteSize))]] -> ImportableContents (ContentIdentifier, ByteSize)
mkImportableContentsUnversioned l = ImportableContents mkImportableContentsUnversioned l = ImportableContents