From 51b2d6d8c5c6494bdb8d72d227e95ec351a010ac Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 15 Nov 2024 15:11:56 -0400 Subject: [PATCH] avoid storing same filename repeatedly in versioned import from S3 Logically, this should make it need a lot less memory when files have been changed many times. In my tests, it didn't seem to change memory use at all. Unsure why, it is working. It's possible the Response is not getting garbage collected due to pinning. But as far as I can see, all parts of it that are retained get copied in a way that won't keep the whole thing pinned in memory. --- Remote/S3.hs | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/Remote/S3.hs b/Remote/S3.hs index 423a254ce8..b79b4778be 100644 --- a/Remote/S3.hs +++ b/Remote/S3.hs @@ -582,7 +582,7 @@ listImportableContentsS3 hv r info c = | versioning info = do rsp <- sendS3Handle h $ S3.getBucketObjectVersions (bucket info) - continuelistversioned 0 h [] rsp + continuelistversioned Nothing h [] rsp | otherwise = do rsp <- sendS3Handle h $ (S3.getBucket (bucket info)) @@ -608,11 +608,13 @@ listImportableContentsS3 hv r info c = Nothing -> nomore | otherwise = nomore where - nomore = return $ - mkImportableContentsUnversioned - (reverse (extractunversioned rsp:l)) + nomore = do + l' <- extractFromResourceT $ + extractunversioned rsp + return $ mkImportableContentsUnversioned + (reverse (l':l)) - continuelistversioned n h l rsp + continuelistversioned reuse h l rsp | S3.gbovrIsTruncated rsp = do rsp' <- sendS3Handle h $ (S3.getBucketObjectVersions (bucket info)) @@ -620,12 +622,14 @@ listImportableContentsS3 hv r info c = , S3.gbovVersionIdMarker = S3.gbovrNextVersionIdMarker rsp , S3.gbovPrefix = fileprefix } - l' <- extractFromResourceT $ - extractversioned rsp - continuelistversioned (length l' + n) h (l':l) rsp' - | otherwise = return $ - mkImportableContentsVersioned - (reverse (extractversioned rsp:l)) + (l', reuse') <- extractFromResourceT $ + extractversioned reuse rsp + continuelistversioned reuse' h (l':l) rsp' + | otherwise = do + (l', _) <- extractFromResourceT $ + extractversioned reuse rsp + return $ mkImportableContentsVersioned + (reverse (l':l)) extractunversioned = mapMaybe extractunversioned' . S3.gbrContents extractunversioned' oi = do @@ -635,14 +639,24 @@ listImportableContentsS3 hv r info c = let cid = mkS3UnversionedContentIdentifier $ S3.objectETag oi return (loc, (cid, sz)) - extractversioned = mapMaybe extractversioned' . S3.gbovrContents - extractversioned' ovi@(S3.ObjectVersion {}) = do + extractversioned reuse = extractversioned' reuse . S3.gbovrContents + extractversioned' reuse [] = ([], reuse) + extractversioned' reuse (x:xs) = case extractversioned'' reuse x of + Just (v, reuse') -> + let (l, reuse'') = extractversioned' reuse' xs + in (v:l, reuse'') + Nothing -> extractversioned' reuse xs + extractversioned'' reuse ovi@(S3.ObjectVersion {}) = do loc <- bucketImportLocation info $ T.unpack $ S3.oviKey ovi + -- Avoid storing the same filename in memory repeatedly. + let loc' = case reuse of + Just reuseloc | reuseloc == loc -> reuseloc + _ -> loc let sz = S3.oviSize ovi let cid = mkS3VersionedContentIdentifier' ovi - return ((loc, (cid, sz)), S3.oviLastModified ovi) - extractversioned' (S3.DeleteMarker {}) = Nothing + return (((loc', (cid, sz)), S3.oviLastModified ovi), Just loc') + extractversioned'' _ (S3.DeleteMarker {}) = Nothing mkImportableContentsUnversioned :: [[(ImportLocation, (ContentIdentifier, ByteSize))]] -> ImportableContents (ContentIdentifier, ByteSize) mkImportableContentsUnversioned l = ImportableContents