From 574514545cb4e577bf6b7e2c347efda002442df1 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 10 Nov 2023 16:17:15 -0400 Subject: [PATCH] git-annex log --sizesof This can take a lot of memory. I decided to violate the usual rule in git-annex that it operate in constant memory no matter how many annexed objects. In this case, it would be hard to be fast without using a big map of the location logs. The main difficulty here is that there can be many git-annex branches and it needs to display a consistent view at a point in time, which means merging information from multiple git-annex branches. I have not checked if there are any laziness leaks in this code. It takes 1 gb to run in my big repo, which is around what I estimated before writing it. 2 options that are documented are not yet implemented. Small bug: With eg --when=1h, it will display at 12:00 then 1:10 if the next change after 12:59 is then. Then it waits until after 2:10 to display the next change. It ought to wait until after 2:00. Sponsored-by: Brock Spratlen on Patreon --- CHANGELOG | 2 + Command/Info.hs | 2 - Command/Log.hs | 177 ++++++++++++++++++++++++++++++++++++++-- Logs/Location.hs | 1 + doc/git-annex-info.mdwn | 4 +- doc/git-annex-log.mdwn | 90 +++++++++++++++----- 6 files changed, 244 insertions(+), 32 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 852f18688d..df257afc45 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -13,6 +13,8 @@ git-annex (10.20230927) UNRELEASED; urgency=medium avoid ending lines with CR for portability. Existing hook scripts that do have CR line endings will not be changed. * info: Added calculation of combined annex size of all repositories. + * log: Added options --sizesof, --sizes and --totalsizes that + display how the size of repositories changed over time. -- Joey Hess Tue, 10 Oct 2023 13:17:31 -0400 diff --git a/Command/Info.hs b/Command/Info.hs index d2dc50c776..3027572945 100644 --- a/Command/Info.hs +++ b/Command/Info.hs @@ -809,5 +809,3 @@ matchOnKey matcher k = matcher $ MatchingInfo $ ProvidedInfo , providedMimeEncoding = Nothing , providedLinkType = Nothing } - - diff --git a/Command/Log.hs b/Command/Log.hs index c737f1066c..7124019e31 100644 --- a/Command/Log.hs +++ b/Command/Log.hs @@ -5,7 +5,7 @@ - Licensed under the GNU AGPL version 3 or higher. -} -{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE OverloadedStrings, BangPatterns #-} module Command.Log where @@ -16,15 +16,21 @@ import Data.Time.Clock.POSIX import Data.Time import qualified Data.ByteString.Char8 as B8 import qualified System.FilePath.ByteString as P +import Control.Concurrent.Async import Command import Logs import Logs.Location +import Logs.UUID +import qualified Logs.Presence.Pure as PLog import qualified Annex import qualified Annex.Branch import qualified Remote import qualified Git import Git.Log +import Git.CatFile +import Utility.DataUnits +import Utility.HumanTime data LogChange = Added | Removed @@ -38,7 +44,10 @@ cmd = withAnnexOptions [jsonOptions, annexedMatchingOptions] $ data LogOptions = LogOptions { logFiles :: CmdParams , allOption :: Bool + , sizesOfOption :: Maybe (DeferredParse UUID) + , whenOption :: Maybe Duration , rawDateOption :: Bool + , bytesOption :: Bool , gourceOption :: Bool , passthruOptions :: [CommandParam] } @@ -51,10 +60,24 @@ optParser desc = LogOptions <> short 'A' <> help "display location log changes to all files" ) + <*> optional ((parseUUIDOption <$> strOption + ( long "sizesof" + <> metavar (paramRemote `paramOr` paramDesc `paramOr` paramUUID) + <> help "display history of sizes of this repository" + <> completeRemotes + ))) + <*> optional (option (eitherReader parseDuration) + ( long "when" <> metavar paramTime + <> help "when to display changed size" + )) <*> switch ( long "raw-date" <> help "display seconds from unix epoch" ) + <*> switch + ( long "bytes" + <> help "display sizes in bytes" + ) <*> switch ( long "gource" <> help "format output for gource" @@ -78,7 +101,14 @@ optParser desc = LogOptions seek :: LogOptions -> CommandSeek seek o = ifM (null <$> Annex.Branch.getUnmergedRefs) - ( do + ( maybe (pure Nothing) (Just <$$> getParsed) (sizesOfOption o) >>= \case + Just u -> sizeHistoryInfo (Just u) o + Nothing -> go + , giveup "This repository is read-only, and there are unmerged git-annex branches, which prevents displaying location log changes. (Set annex.merge-annex-branches to false to ignore the unmerged git-annex branches.)" + ) + where + ww = WarnUnmatchLsFiles "log" + go = do m <- Remote.uuidDescriptions zone <- liftIO getCurrentTimeZone outputter <- mkOutputter m zone o <$> jsonOutputEnabled @@ -94,10 +124,6 @@ seek o = ifM (null <$> Annex.Branch.getUnmergedRefs) =<< workTreeItems ww fs ([], True) -> commandAction (startAll o outputter) (_, True) -> giveup "Cannot specify both files and --all" - , giveup "This repository is read-only, and there are unmerged git-annex branches, which prevents displaying location log changes. (Set annex.merge-annex-branches to false to ignore the unmerged git-annex branches.)" - ) - where - ww = WarnUnmatchLsFiles "log" start :: LogOptions -> (ActionItem -> SeekInput -> Outputter) -> SeekInput -> RawFilePath -> Key -> CommandStart start o outputter si file key = do @@ -158,7 +184,7 @@ mkOutputter m zone o jsonenabled ai si | jsonenabled = jsonOutput m ai si | rawDateOption o = normalOutput lookupdescription ai rawTimeStamp | gourceOption o = gourceOutput lookupdescription ai - | otherwise = normalOutput lookupdescription ai (showTimeStamp zone) + | otherwise = normalOutput lookupdescription ai (showTimeStamp zone rfc822DateFormat) where lookupdescription u = maybe (fromUUID u) (fromUUIDDesc) (M.lookup u m) @@ -242,9 +268,142 @@ getGitLogAnnex fs os = do let fileselector = locationLogFileKey config . toRawFilePath inRepo $ getGitLog Annex.Branch.fullname fs os fileselector -showTimeStamp :: TimeZone -> POSIXTime -> String -showTimeStamp zone = formatTime defaultTimeLocale rfc822DateFormat +showTimeStamp :: TimeZone -> String -> POSIXTime -> String +showTimeStamp zone format = formatTime defaultTimeLocale format . utcToZonedTime zone . posixSecondsToUTCTime rawTimeStamp :: POSIXTime -> String rawTimeStamp t = filter (/= 's') (show t) + +sizeHistoryInfo :: (Maybe UUID) -> LogOptions -> Annex () +sizeHistoryInfo mu o = do + zone <- liftIO getCurrentTimeZone + let dispst = (zone, False, epoch, Nothing) + uuidmap <- getuuidmap + (l, cleanup) <- getlog + g <- Annex.gitRepo + liftIO $ catObjectStream g $ \feeder closer reader -> do + tid <- async $ do + forM_ l $ \c -> + feeder ((changed c, changetime c), newref c) + closer + go reader M.empty M.empty M.empty uuidmap dispst + wait tid + void $ liftIO cleanup + where + -- Go through the log of the git-annex branch in reverse, + -- and in date order, and pick out changes to location log files + -- and to the trust log. + getlog = do + config <- Annex.getGitConfig + let fileselector = \f -> let f' = toRawFilePath f in + case locationLogFileKey config f' of + Just k -> Just (Right k) + Nothing + | f' == trustLog -> Just (Left ()) + | otherwise -> Nothing + inRepo $ getGitLog Annex.Branch.fullname [] + [ Param "--date-order" + , Param "--reverse" + ] + fileselector + + go reader sizemap locmap deadmap uuidmap dispst = reader >>= \case + Just ((Right k, t), Just logcontent) -> do + let !newlog = parselocationlog logcontent uuidmap + let !(sizemap', locmap') = case M.lookup k locmap of + Nothing -> addnew k sizemap locmap newlog + Just v -> update k sizemap locmap v newlog + dispst' <- displaysizes dispst uuidmap sizemap' t + go reader sizemap' locmap' deadmap uuidmap dispst' + Just ((Left (), t), Just logcontent) -> do + -- XXX todo update deadmap + go reader sizemap locmap deadmap uuidmap dispst + Just (_, Nothing) -> + go reader sizemap locmap deadmap uuidmap dispst + Nothing -> + displayendsizes dispst + + -- Known uuids are stored in this map, and when uuids are stored in the + -- state, it's a value from this map. This avoids storing multiple + -- copies of the same uuid in memory. + getuuidmap = do + us <- M.keys <$> uuidDescMap + return $ M.fromList (zip us us) + + -- Parses a location log file, and replaces the logged uuid + -- with one from the uuidmap. + parselocationlog logcontent uuidmap = + map replaceuuid $ PLog.parseLog logcontent + where + replaceuuid ll = + let !u = toUUID $ PLog.fromLogInfo $ PLog.info ll + !ushared = fromMaybe u $ M.lookup u uuidmap + in ll { PLog.info = PLog.LogInfo (fromUUID ushared) } + + presentlocs = map (toUUID . PLog.fromLogInfo . PLog.info) + . PLog.filterPresent + + -- Since the git log is being traversed in date order, commits + -- from different branches can appear one after the other, and so + -- the newlog is not necessarily the complete state known at that + -- time across all git-annex repositories. + -- + -- This combines the new location log with what has been + -- accumulated so far, which is equivilant to merging together + -- all git-annex branches at that point in time. + update k sizemap locmap (oldlog, oldlocs) newlog = + ( updatesize (updatesize sizemap sz (S.toList addedlocs)) + (negate sz) (S.toList removedlocs) + , M.insert k (combinedlog, combinedlocs) locmap + ) + where + sz = ksz k + combinedlog = PLog.compactLog (oldlog ++ newlog) + combinedlocs = S.fromList (presentlocs combinedlog) + addedlocs = S.difference combinedlocs oldlocs + removedlocs = S.difference oldlocs combinedlocs + + addnew k sizemap locmap newlog = + ( updatesize sizemap (ksz k) locs + , M.insert k (newlog, S.fromList locs) locmap + ) + where + locs = presentlocs newlog + + ksz k = fromMaybe 0 (fromKey keySize k) + + updatesize sizemap _ [] = sizemap + updatesize sizemap sz (l:ls) = + updatesize (M.insertWith (+) l sz sizemap) sz ls + + epoch = toEnum 0 + + displaysizes (zone, displayedyet, prevt, prevoutput) uuidmap sizemap t + | t - prevt > dt + && (displayedyet || any (/= 0) sizes) + && (prevoutput /= Just output) = do + displayts zone t output + return (zone, True, t, Just output) + | otherwise = return (zone, displayedyet, prevt, Just output) + where + output = intercalate ", " (map showsize sizes) + us = case mu of + Just u -> [u] + Nothing -> M.keys uuidmap + sizes = map (\u -> fromMaybe 0 (M.lookup u sizemap)) us + dt = maybe 1 durationToPOSIXTime (whenOption o) + + displayts zone t output = putStrLn $ ts ++ ", " ++ output + where + ts = if rawDateOption o + then rawTimeStamp t + else showTimeStamp zone "%Y-%m-%dT%H:%M:%S" t + + displayendsizes (zone , _, t, Just output) = + displayts zone t output + displayendsizes _ = return () + + showsize n + | bytesOption o = show n + | otherwise = roughSize storageUnits True n diff --git a/Logs/Location.hs b/Logs/Location.hs index e9ae0213a3..880bfb2b2e 100644 --- a/Logs/Location.hs +++ b/Logs/Location.hs @@ -23,6 +23,7 @@ module Logs.Location ( loggedLocations, loggedLocationsHistorical, loggedLocationsRef, + parseLoggedLocations, isKnownKey, checkDead, setDead, diff --git a/doc/git-annex-info.mdwn b/doc/git-annex-info.mdwn index 2b024f31a7..bdbcf1415e 100644 --- a/doc/git-annex-info.mdwn +++ b/doc/git-annex-info.mdwn @@ -12,8 +12,8 @@ Displays statistics and other information for the specified item. When no item is specified, displays overall information. This includes a list of all known repositories, how much annexed data is present in the -local repository, the total size of all annexed data in the working -tree, and the combined size of all annexed data in all known repositories. +local repository, and the total size of all annexed data in the working +tree. When a directory is specified, displays information about the annexed files in that directory (and subdirectories). diff --git a/doc/git-annex-log.mdwn b/doc/git-annex-log.mdwn index 0361dac306..966eb6aa8e 100644 --- a/doc/git-annex-log.mdwn +++ b/doc/git-annex-log.mdwn @@ -1,6 +1,6 @@ # NAME -git-annex log - shows location log +git-annex log - shows location log information # SYNOPSIS @@ -8,21 +8,68 @@ git annex log `[path ...]` # DESCRIPTION -Displays the location log for the specified file or files, showing each -repository they were added to ("+") and removed from ("-"). Note that the -location log is for the particular file contents currently at these paths, -not for any different content that was there in earlier commits. +This command displays information from the history of the git-annex branch. -This displays information from the history of the git-annex branch. Several -things can prevent that information being available to display. When -[[git-annex-dead]] and [[git-annex-forget]] are used, old historical -data gets cleared from the branch. When annex.private or +Several things can prevent that information being available to display. +When [[git-annex-dead]] and [[git-annex-forget]] are used, old historical +data gets cleared from the branch. When annex.private or remote.name.annex-private is configured, git-annex does not write information to the branch at all. And when annex.alwayscommit is set to false, information may not have been committed to the branch yet. # OPTIONS +* `[path ...]` + + Displays the location log for the specified file or files, showing each + repository they were added to ("+") and removed from ("-"). Note that + it displays information about the file content currently at these paths, + not for any different content that was there in earlier commits. + +* matching options + + The [[git-annex-matching-options]](1) + can be used to control what to act on when displaying the location log + for specified files. + +* `--all` `-A` + + Shows location log changes to all content, with the most recent changes first. + In this mode, the names of files are not available and keys are displayed + instead. + +* `--sizesof=repository` + + Displays a history of the size of the annexed files in a repository as it + changed over time from the creation of the repository to the present. + + The repository can be "here" for the current repository, or the name of a + remote, or a repository description or uuid. + + Note that keys that do not have a known size are skipped. + +* `--sizes` + + This is like --sizesof, but rather than display the size of a single + repository, it displays the sizes of all known repositories in a table. + +* `--totalsizes` + + This is like `--sizesof`, but it displays the total size of all + known repositories. + + Note that dead repositories have their size included in the total + for times before the point they were marked dead. Once marked dead, + their size will no longer be included in the total. + +* `--when=time` + + When using `--sizesof`, `--sizes`, and `--totalsizes`, this + controls how often to display the size. The default is to + display each change to the size. + + The time is of the form "30d" or "1y". + * `--since=date`, `--after=date`, `--until=date`, `--before=date`, `--max-count=N` These options are passed through to `git log`, and can be used to limit @@ -30,6 +77,13 @@ false, information may not have been committed to the branch yet. For example: `--since "1 month ago"` + These options do not have an affect when using `--sizesof`, `--sizes`, + and `--totalsizes`. + +* `--bytes` + + Show sizes in bytes, disabling the default nicer units. + * `--raw-date` Rather than the normal display of a date in the local time zone, @@ -38,27 +92,25 @@ false, information may not have been committed to the branch yet. * `--gource` Generates output suitable for the `gource` visualization program. + + This option does not have an affect when using `--sizesof`, `--sizes`, + and `--totalsizes`. * `--json` Enable JSON output. This is intended to be parsed by programs that use git-annex. Each line of output is a JSON object. + + This option does not have an affect when using `--sizesof`, `--sizes`, + and `--totalsizes`. * `--json-error-messages` Messages that would normally be output to standard error are included in the JSON instead. - -* matching options - The [[git-annex-matching-options]](1) - can be used to control what to act on. - -* `--all` `-A` - - Shows location log changes to all content, with the most recent changes first. - In this mode, the names of files are not available and keys are displayed - instead. + This option does not have an affect when using `--sizesof`, `--sizes`, + and `--totalsizes`. * Also the [[git-annex-common-options]](1) can be used.