check pointer file size

This is all good, except for one small problem... When a pointer file has to be fed into the metadata cat-file, it's possible for a non-pointer file that comes after it to get fed into the main cat-file first, so the two files will be processed in a different order than the user specified. So, while this is the fast way, I guess I'll have to change it to be slower, but sequential..
2020-07-10 15:11:14 -04:00 · 2020-07-10 15:11:14 -04:00 · 0f6b1ee048
commit 0f6b1ee048
parent 5387b95dcd
1 changed files with 41 additions and 22 deletions
--- a/CmdLine/Seek.hs
+++ b/CmdLine/Seek.hs
@ -19,6 +19,7 @@ import qualified Git
 import qualified Git.Command
 import qualified Git.LsFiles as LsFiles
 import qualified Git.LsTree as LsTree
+import qualified Git.Types as Git
 import Git.FilePath
 import qualified Limit
 import CmdLine.GitAnnex.Options
@ -229,7 +230,7 @@ withKeyOptions' ko auto mkkeyaction fallbackaction params = do
 						keyaction (k, mkActionItem k)
 					Nothing -> return ()
 				go reader
-		catObjectStream l (isJust . getk . LsTree.file) g go
+		catObjectStreamLsTree l (isJust . getk . LsTree.file) g go
 		liftIO $ void cleanup

 	runkeyaction getks = do
@ -264,37 +265,55 @@ seekFiltered a fs = do
 	process matcher f =
 		whenM (matcher $ MatchingFile $ FileInfo f f) $ a f

-- This is siginificantly faster than using lookupKey after seekFiltered.
+-- This is significantly faster than using lookupKey after seekFiltered.
 seekFilteredKeys :: (RawFilePath -> Key -> CommandSeek) -> Annex [(RawFilePath, Git.Sha, FileMode)] -> Annex ()
 seekFilteredKeys a fs = do
 	g <- Annex.gitRepo
-	catObjectStream' g $ \feeder closer reader -> do
-		tid <- liftIO . async =<< forkState (gofeed feeder closer)
-		goread reader
-		join (liftIO (wait tid))
+	matcher <- Limit.getMatcher
+	catObjectMetaDataStream g $ \mdfeeder mdcloser mdreader ->
+		catObjectStream g $ \feeder closer reader -> do
+			processertid <- liftIO . async =<< forkState
+				(gofeed matcher feeder closer mdfeeder mdcloser)
+			mdprocessertid <- liftIO . async =<< forkState
+				(mdprocess matcher mdreader feeder)
+			goread reader
+			join (liftIO (wait mdprocessertid))
+			join (liftIO (wait processertid))
  where
-	gofeed feeder closer = do
-		matcher <- Limit.getMatcher
+	gofeed matcher feeder closer mdfeeder mdcloser = do
 		l <- fs
-		forM_ l $ process matcher feeder
-		liftIO closer
-	
-	process matcher feeder (f, sha, mode) =
-		-- TODO handle non-symlink separately to avoid
-		-- catting large files
-		-- If the matcher needs to look up a key, it should be run
-		-- in goread, not here, and the key passed in. OTOH, if
-		-- the matcher does not need to look up a key, it's more
-		-- efficient to put it here, to avoid catting files that
-		-- will not be matched.
-		whenM (matcher $ MatchingFile $ FileInfo f f) $
-			liftIO $ feeder (f, sha)
+		forM_ l $ process matcher feeder mdfeeder
+		liftIO $ void closer
+		liftIO $ void mdcloser

 	goread reader = liftIO reader >>= \case
 		Just (f, content) -> do
 			maybe noop (a f) (parseLinkTargetOrPointerLazy =<< content)
 			goread reader
-		_ -> return ()
+		Nothing -> return ()
+	
+	feedmatches matcher feeder f sha = 
+		whenM (matcher $ MatchingFile $ FileInfo f f) $
+			liftIO $ feeder (f, sha)
+
+	process matcher feeder mdfeeder (f, sha, mode) = case
+		Git.toTreeItemType mode of
+			Just Git.TreeSymlink -> 
+				feedmatches matcher feeder f sha
+			Just Git.TreeSubmodule -> return ()
+			-- Might be a pointer file, might be other
+			-- file in git, possibly large. Avoid catting
+			-- large files by first looking up the size.
+			Just _ -> liftIO $ mdfeeder (f, sha)
+			Nothing -> return ()
+
+	mdprocess matcher mdreader feeder = liftIO mdreader >>= \case
+		Just (f, Just (sha, size, _type))
+			| size < maxPointerSz -> do
+				feedmatches matcher feeder f sha
+				mdprocess matcher mdreader feeder
+		Just _ -> mdprocess matcher mdreader feeder
+		Nothing -> return ()

 seekHelper :: (a -> RawFilePath) -> WarnUnmatchWhen -> ([LsFiles.Options] -> [RawFilePath] -> Git.Repo -> IO ([a], IO Bool)) -> [WorkTreeItem] -> Annex [a]
 seekHelper c ww a l = do