listImportableContents filtering to wanted files

This could in theory allow importing subsets of files with less memory
use. Rather than building up a big import list and then filtering it to
a smaller list of wanted files, support optionally filtering wanted
files first.

So far, the directory special remote implements it and will probably use
less memory. (Since dirContentsRecursiveSkipping does lazy streaming.)

Implementation in Remote.S3 is incomplete and fails to compile. Bit of a
mess with ResourceT needing to use Annex.

Also, in Remote.S3, filtering is not done for old versions.
And mkImportableContentsUnversioned is doing now redundant work
to filterwanted.
This commit is contained in:
Joey Hess 2023-12-20 15:48:26 -04:00
parent d7ca716759
commit 41edf73789
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
7 changed files with 112 additions and 50 deletions

View file

@ -293,7 +293,7 @@ buildImportTrees basetree msubdir (ImportedDiff (LastImportedTree oldtree) impor
let (removed, new) = partition isremoved
(importableContents imported)
newtreeitems <- catMaybes <$> mapM mktreeitem new
let removedfiles = map (mkloc . fst) removed
let removedfiles = map (\(_, loc, _) -> mkloc loc) removed
inRepo $ adjustTree
(pure . Just)
-- ^ keep files that are not added/removed the same
@ -303,14 +303,14 @@ buildImportTrees basetree msubdir (ImportedDiff (LastImportedTree oldtree) impor
removedfiles
oldtree
mktreeitem (loc, DiffChanged v) =
mktreeitem (_, loc, DiffChanged v) =
Just <$> mkImportTreeItem msubdir loc v
mktreeitem (_, DiffRemoved) =
mktreeitem (_, _, DiffRemoved) =
pure Nothing
mkloc = asTopFilePath . fromImportLocation
isremoved (_, v) = v == DiffRemoved
isremoved (_, _, v) = v == DiffRemoved
convertImportTree :: Maybe TopFilePath -> [(ImportLocation, Either Sha Key)] -> Annex Tree
convertImportTree msubdir ls =
@ -429,11 +429,12 @@ buildImportTree
:: (Maybe TopFilePath -> [(ImportLocation, v)] -> Annex Tree)
-> Ref
-> Maybe TopFilePath
-> [(ImportLocation, v)]
-> [(ImportWantedChecked, ImportLocation, v)]
-> MkTreeHandle
-> Annex Sha
buildImportTree converttree basetree msubdir ls hdl = do
importtree <- liftIO . recordTree' hdl =<< converttree msubdir ls
importtree <- liftIO . recordTree' hdl
=<< converttree msubdir (map (\(_, loc, v) -> (loc, v)) ls)
graftImportTree basetree msubdir importtree hdl
graftImportTree
@ -545,7 +546,7 @@ importChanges remote importtreeconfig importcontent thirdpartypopulated importab
let (removed, changed) = partition isremoval diff
let mkicchanged ti = do
v <- M.lookup (Git.DiffTree.dstsha ti) cidtreemap
return (mkloc ti, v)
return (ImportWantedChecked False, mkloc ti, v)
let ic = ImportableContentsComplete $ ImportableContents
{ importableContents = mapMaybe mkicchanged changed
, importableHistory = []
@ -576,10 +577,10 @@ importChanges remote importtreeconfig importcontent thirdpartypopulated importab
}
where
diffchanged = map
(\(loc, v) -> (loc, DiffChanged v))
(\(wantedchecked, loc, v) -> (wantedchecked, loc, DiffChanged v))
(importableContents ic)
diffremoved = map
(\ti -> (mkloc ti, DiffRemoved))
(\ti -> (ImportWantedChecked False, mkloc ti, DiffRemoved))
removed
{- Gets the tree that was last imported from the remote
@ -671,7 +672,7 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
then return Nothing
else gohistory largematcher cidmap importing db h >>= return . \case
Nothing -> Nothing
Just h' -> Just $ ImportableContents (catMaybes l') h'
Just h' -> Just $ ImportableContents (map (\(loc, k) -> (ImportWantedChecked False, loc, k)) $ catMaybes l') h'
gohistory largematcher cidmap importing db h = do
h' <- mapM (go True largematcher cidmap importing db) h
@ -689,7 +690,7 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
| otherwise = do
l <- forM (importableContentsSubTree c) $ \(loc, i) -> do
let loc' = importableContentsChunkFullLocation (importableContentsSubDir c) loc
thirdpartypopulatedimport db (loc', i) >>= return . \case
thirdpartypopulatedimport db (ImportWantedChecked False, loc', i) >>= return . \case
Just (_loc, k) -> Just (loc, k)
Nothing -> Nothing
return $ ImportableContentsChunk
@ -713,7 +714,7 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
s <- readTVar importing
writeTVar importing $ S.delete cid s
startimport cidmap importing db i@(loc, (cid, _sz)) oldversion largematcher = getcidkey cidmap db cid >>= \case
startimport cidmap importing db i@(_, loc, (cid, _sz)) oldversion largematcher = getcidkey cidmap db cid >>= \case
(k:ks) ->
-- If the same content was imported before
-- yielding multiple different keys, it's not clear
@ -748,7 +749,7 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
importaction
return (Right job)
thirdpartypopulatedimport db (loc, (cid, sz)) =
thirdpartypopulatedimport db (_, loc, (cid, sz)) =
case Remote.importKey ia of
Nothing -> return Nothing
Just importkey ->
@ -762,7 +763,7 @@ importKeys remote importtreeconfig importcontent thirdpartypopulated importablec
warning (UnquotedString (show e))
return Nothing
importordownload cidmap (loc, (cid, sz)) largematcher = do
importordownload cidmap (_, loc, (cid, sz)) largematcher = do
f <- locworktreefile loc
matcher <- largematcher f
-- When importing a key is supported, always use it rather
@ -1024,18 +1025,17 @@ pruneImportMatcher = Utility.Matcher.pruneMatcher matchNeedsKey
-}
getImportableContents :: Remote -> ImportTreeConfig -> CheckGitIgnore -> FileMatcher Annex -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
getImportableContents r importtreeconfig ci matcher = do
Remote.listImportableContents (Remote.importActions r) >>= \case
Just (ImportableContentsComplete ic) -> do
dbhandle <- opendbhandle
dbhandle <- opendbhandle
Remote.listImportableContents (Remote.importActions r) (wanted dbhandle) >>= \case
Just (ImportableContentsComplete ic) ->
Just . ImportableContentsComplete
<$> filterunwanted dbhandle ic
Just (c@(ImportableContentsChunked {})) -> do
dbhandle <- opendbhandle
Just (c@(ImportableContentsChunked {})) ->
Just <$> filterunwantedchunked dbhandle c
Nothing -> return Nothing
where
filterunwanted dbhandle ic = ImportableContents
<$> filterM (wanted dbhandle) (importableContents ic)
<$> filterM (wanted' dbhandle) (importableContents ic)
<*> mapM (filterunwanted dbhandle) (importableHistory ic)
filterunwantedchunked dbhandle c = ImportableContentsChunked
@ -1057,7 +1057,11 @@ getImportableContents r importtreeconfig ci matcher = do
void $ Export.updateExportTreeFromLog h
return h
wanted dbhandle (loc, (_cid, sz))
wanted dbhandle loc sz = ImportWantedChecked
<$> wanted' dbhandle (ImportWantedChecked False, loc, ((), sz))
wanted' dbhandle (ImportWantedChecked alreadychecked, loc, (_cid, sz))
| alreadychecked = pure True
| ingitdir = pure False
| otherwise =
isknown <||> (matches <&&> notignored)
@ -1069,7 +1073,7 @@ getImportableContents r importtreeconfig ci matcher = do
notignored = notIgnoredImportLocation importtreeconfig ci loc
wantedunder dbhandle root (loc, v) =
wanted dbhandle (importableContentsChunkFullLocation root loc, v)
wanted' dbhandle (ImportWantedChecked False, importableContentsChunkFullLocation root loc, v)
isKnownImportLocation :: Export.ExportHandle -> ImportLocation -> Annex Bool
isKnownImportLocation dbhandle loc = liftIO $

View file

@ -294,8 +294,13 @@ renameExportM serial adir _k old new = do
, File newloc
]
listImportableContentsM :: AndroidSerial -> AndroidPath -> ParsedRemoteConfig -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
listImportableContentsM serial adir c = adbfind >>= \case
listImportableContentsM
:: AndroidSerial
-> AndroidPath
-> ParsedRemoteConfig
-> ImportWantedChecker Annex
-> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
listImportableContentsM serial adir c _wanted = adbfind >>= \case
Just ls -> return $ Just $ ImportableContentsComplete $
ImportableContents (mapMaybe mk ls) []
Nothing -> giveup "adb find failed"
@ -344,7 +349,7 @@ listImportableContentsM serial adir c = adbfind >>= \case
cid = ContentIdentifier (encodeBS stat)
loc = mkImportLocation $ toRawFilePath $
Posix.makeRelative (fromAndroidPath adir) fn
in Just (loc, (cid, sz))
in Just (ImportWantedChecked False, loc, (cid, sz))
mk _ = Nothing
-- This does not guard against every possible race. As long as the adb

View file

@ -168,8 +168,13 @@ checkAvailability :: BorgRepo -> Annex Availability
checkAvailability borgrepo@(BorgRepo r) =
checkPathAvailability (borgLocal borgrepo) r
listImportableContentsM :: UUID -> BorgRepo -> ParsedRemoteConfig -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
listImportableContentsM u borgrepo c = prompt $ do
listImportableContentsM
:: UUID
-> BorgRepo
-> ParsedRemoteConfig
-> ImportWantedChecker Annex
-> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
listImportableContentsM u borgrepo c _wanted = prompt $ do
imported <- getImported u
ls <- withborglist (locBorgRepo borgrepo) Nothing formatarchivelist $ \as ->
forM (filter (not . S.null) as) $ \archivename ->

View file

@ -382,21 +382,28 @@ removeExportLocation topdir loc =
mkExportLocation loc'
in go (upFrom loc') =<< tryIO (removeDirectory p)
listImportableContentsM :: IgnoreInodes -> RawFilePath -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
listImportableContentsM ii dir = liftIO $ do
l <- dirContentsRecursiveSkipping (const False) False (fromRawFilePath dir)
listImportableContentsM
:: IgnoreInodes
-> RawFilePath
-> ImportWantedChecker Annex
-> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
listImportableContentsM ii dir wanted = do
l <- liftIO $ dirContentsRecursiveSkipping (const False) False (fromRawFilePath dir)
l' <- mapM (go . toRawFilePath) l
return $ Just $ ImportableContentsComplete $
ImportableContents (catMaybes l') []
where
go f = do
st <- R.getSymbolicLinkStatus f
mkContentIdentifier ii f st >>= \case
Nothing -> return Nothing
Just cid -> do
relf <- relPathDirToFile dir f
sz <- getFileSize' f st
return $ Just (mkImportLocation relf, (cid, sz))
relf <- liftIO $ relPathDirToFile dir f
st <- liftIO $ R.getSymbolicLinkStatus f
sz <- liftIO $ getFileSize' f st
let loc = mkImportLocation relf
wanted loc sz >>= \case
ImportWantedChecked False -> return Nothing
checked@(ImportWantedChecked True) ->
liftIO (mkContentIdentifier ii f st) >>= return . \case
Nothing -> Nothing
Just cid -> Just (checked, loc, (cid, sz))
newtype IgnoreInodes = IgnoreInodes Bool

View file

@ -34,6 +34,7 @@ import Network.HTTP.Types
import Network.URI
import Control.Monad.Trans.Resource
import Control.Monad.Catch
import Control.Monad.Trans
import Control.Concurrent.STM (atomically)
import Control.Concurrent.STM.TVar
import Data.Maybe
@ -568,43 +569,68 @@ renameExportS3 hv r rs info k src dest = Just <$> go
srcobject = T.pack $ bucketExportLocation info src
dstobject = T.pack $ bucketExportLocation info dest
listImportableContentsS3 :: S3HandleVar -> Remote -> S3Info -> ParsedRemoteConfig -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
listImportableContentsS3 hv r info c =
listImportableContentsS3
:: S3HandleVar
-> Remote
-> S3Info
-> ParsedRemoteConfig
-> ImportWantedChecker Annex
-> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
listImportableContentsS3 hv r info c wanted =
withS3Handle hv $ \case
Right h -> Just <$> go h
Left p -> giveupS3HandleProblem p (uuid r)
where
go :: S3Handle -> Annex (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize))
go h = do
ic <- liftIO $ runResourceT $ extractFromResourceT =<< startlist h
ic <- runResourceT $ extractFromResourceT =<< startlist h
return (ImportableContentsComplete ic)
fileprefix = T.pack <$> getRemoteConfigValue fileprefixField c
startlist :: S3Handle -> ResourceT Annex (ImportableContents (ContentIdentifier, ByteSize))
startlist h
| versioning info = do
rsp <- sendS3Handle h $
rsp <- lift $ sendS3Handle h $
S3.getBucketObjectVersions (bucket info)
continuelistversioned h [] rsp
| otherwise = do
rsp <- sendS3Handle h $
rsp <- lift $ sendS3Handle h $
(S3.getBucket (bucket info))
{ S3.gbPrefix = fileprefix }
continuelistunversioned h [] rsp
continuelistunversioned :: S3Handle -> [S3.GetBucketResponse] -> S3.GetBucketResponse -> ResourceT Annex (ImportableContents (ContentIdentifier, ByteSize))
continuelistunversioned h l rsp
| S3.gbrIsTruncated rsp = do
rsp' <- sendS3Handle h $
rsp' <- lift $ sendS3Handle h $
(S3.getBucket (bucket info))
{ S3.gbMarker = S3.gbrNextMarker rsp
, S3.gbPrefix = fileprefix
}
-- wantedrsp <- filterwanted rsp
continuelistunversioned h (rsp:l) rsp'
| otherwise = return $
mkImportableContentsUnversioned info (reverse (rsp:l))
| otherwise = do
--wantedrsp <- filterwanted rsp
return $
mkImportableContentsUnversioned info (reverse (rsp:l))
filterwanted rsp = filterwanted' [] (S3.gbrContents rsp)
filterwanted' c [] = pure c
filterwanted' c (oi:ois) =
case bucketImportLocation info $ T.unpack $ S3.objectKey oi of
Nothing -> filterwanted' c ois
Just loc -> do
let sz = S3.objectSize oi
let cid = mkS3UnversionedContentIdentifier $ S3.objectETag oi
wanted loc sz >>= \case
ImportWantedChecked False -> filterwanted' c ois
checked@(ImportWantedChecked True) ->
filterwanted' ((checked, loc, (cid, sz)):c) ois
continuelistversioned h l rsp
| S3.gbovrIsTruncated rsp = do
rsp' <- sendS3Handle h $
rsp' <- lift $ sendS3Handle h $
(S3.getBucketObjectVersions (bucket info))
{ S3.gbovKeyMarker = S3.gbovrNextKeyMarker rsp
, S3.gbovVersionIdMarker = S3.gbovrNextVersionIdMarker rsp
@ -625,7 +651,7 @@ mkImportableContentsUnversioned info l = ImportableContents
T.unpack $ S3.objectKey oi
let sz = S3.objectSize oi
let cid = mkS3UnversionedContentIdentifier $ S3.objectETag oi
return (loc, (cid, sz))
return (ImportWantedChecked False, loc, (cid, sz))
mkImportableContentsVersioned :: S3Info -> [S3.GetBucketObjectVersionsResponse] -> ImportableContents (ContentIdentifier, ByteSize)
mkImportableContentsVersioned info = build . groupfiles
@ -645,7 +671,7 @@ mkImportableContentsVersioned info = build . groupfiles
T.unpack $ S3.oviKey ovi
let sz = S3.oviSize ovi
let cid = mkS3VersionedContentIdentifier' ovi
return (loc, (cid, sz))
return (ImportWantedChecked False, loc, (cid, sz))
extract (S3.DeleteMarker {}) = Nothing
-- group files so all versions of a file are in a sublist,

View file

@ -18,6 +18,7 @@ import qualified System.FilePath.Posix.ByteString as Posix
import Types.Export
import Utility.QuickCheck
import Utility.FileSystemEncoding
import Utility.DataUnits
{- Location of content on a remote that can be imported.
- This is just an alias to ExportLocation, because both are referring to a
@ -55,7 +56,7 @@ instance Arbitrary ContentIdentifier where
{- List of files that can be imported from a remote, each with some added
- information. -}
data ImportableContents info = ImportableContents
{ importableContents :: [(ImportLocation, info)]
{ importableContents :: [(ImportWantedChecked, ImportLocation, info)]
, importableHistory :: [ImportableContents info]
-- ^ Used by remotes that support importing historical versions of
-- files that are stored in them. This is equivalent to a git
@ -103,3 +104,17 @@ importableContentsChunkFullLocation
-> ImportLocation
importableContentsChunkFullLocation (ImportChunkSubDir root) loc =
mkImportLocation $ Posix.combine root loc
newtype ImportWantedChecked = ImportWantedChecked Bool
deriving (Show, Generic)
instance NFData ImportWantedChecked
{- This action may optinally be used to check if a file will be wanted in
- the import when constructing the ImportableContents.
- Filtering out unwanted files at that point makes the import use less
- memory, but is optional. If ImportWantedChecked False is used instead,
- it will be checked in a later pass.
-}
type ImportWantedChecker a = ImportLocation -> ByteSize -> a ImportWantedChecked

View file

@ -314,7 +314,7 @@ data ImportActions a = ImportActions
--
-- Throws exception on failure to access the remote.
-- May return Nothing when the remote is unchanged since last time.
{ listImportableContents :: a (Maybe (ImportableContentsChunkable a (ContentIdentifier, ByteSize)))
{ listImportableContents :: ImportWantedChecker a -> a (Maybe (ImportableContentsChunkable a (ContentIdentifier, ByteSize)))
-- Generates a Key (of any type) for the file stored on the
-- remote at the ImportLocation. Does not download the file
-- from the remote.