From 9445556c97462946c8ca6aeeca98644a78de4f7c Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Sun, 5 Apr 2015 12:50:02 -0400 Subject: [PATCH] rethought distributed fsck; instead add activity.log and expire command This is much more space efficient! --- CmdLine/GitAnnex.hs | 2 + Command/Expire.hs | 100 +++++++++++++++++++++++++++++++++ Command/Fsck.hs | 114 ++++++-------------------------------- Logs.hs | 7 ++- Logs/Activity.hs | 37 +++++++++++++ Logs/Location.hs | 1 - Logs/Presence.hs | 4 -- debian/changelog | 6 +- doc/git-annex-expire.mdwn | 61 ++++++++++++++++++++ doc/git-annex-fsck.mdwn | 38 ------------- doc/git-annex.mdwn | 5 ++ doc/internals.mdwn | 8 +++ 12 files changed, 239 insertions(+), 144 deletions(-) create mode 100644 Command/Expire.hs create mode 100644 Logs/Activity.hs create mode 100644 doc/git-annex-expire.mdwn diff --git a/CmdLine/GitAnnex.hs b/CmdLine/GitAnnex.hs index 6dedd94f5c..6aeefda054 100644 --- a/CmdLine/GitAnnex.hs +++ b/CmdLine/GitAnnex.hs @@ -45,6 +45,7 @@ import qualified Command.Describe import qualified Command.InitRemote import qualified Command.EnableRemote import qualified Command.Fsck +import qualified Command.Expire import qualified Command.Repair import qualified Command.Unused import qualified Command.DropUnused @@ -169,6 +170,7 @@ cmds = concat , Command.VCycle.cmd , Command.Fix.cmd , Command.Fsck.cmd + , Command.Expire.cmd , Command.Repair.cmd , Command.Unused.cmd , Command.DropUnused.cmd diff --git a/Command/Expire.hs b/Command/Expire.hs new file mode 100644 index 0000000000..d01d3a9650 --- /dev/null +++ b/Command/Expire.hs @@ -0,0 +1,100 @@ +{- git-annex command + - + - Copyright 2015 Joey Hess + - + - Licensed under the GNU GPL version 3 or higher. + -} + +module Command.Expire where + +import Common.Annex +import Command +import Logs.Activity +import Logs.UUID +import Logs.MapLog +import Logs.Trust +import Annex.UUID +import qualified Remote +import Utility.HumanTime + +import Data.Time.Clock.POSIX +import qualified Data.Map as M + +cmd :: [Command] +cmd = [withOptions [activityOption] $ command "expire" paramExpire seek + SectionMaintenance "expire inactive repositories"] + +paramExpire :: String +paramExpire = (paramRepeating $ paramOptional paramRemote ++ ":" ++ paramTime) + +activityOption :: Option +activityOption = fieldOption [] "activity" "Name" "specify activity" + +seek :: CommandSeek +seek ps = do + expire <- parseExpire ps + wantact <- getOptionField activityOption (pure . parseActivity) + actlog <- lastActivities wantact + u <- getUUID + us <- filter (/= u) . M.keys <$> uuidMap + descs <- uuidMap + seekActions $ pure $ map (start expire actlog descs) us + +start :: Expire -> Log Activity -> M.Map UUID String -> UUID -> CommandStart +start (Expire expire) actlog descs u = + case lastact of + Just ent | notexpired ent -> checktrust (== DeadTrusted) $ do + showStart "unexpire" desc + showNote =<< whenactive + trustSet u SemiTrusted + _ -> checktrust (/= DeadTrusted) $ do + showStart "expire" desc + showNote =<< whenactive + trustSet u DeadTrusted + where + lastact = changed <$> M.lookup u actlog + whenactive = case lastact of + Just (Date t) -> do + d <- liftIO $ durationSince $ posixSecondsToUTCTime t + return $ "last active: " ++ fromDuration d ++ " ago" + _ -> return "no activity" + desc = fromUUID u ++ " " ++ fromMaybe "" (M.lookup u descs) + notexpired ent = case ent of + Unknown -> False + Date t -> case lookupexpire of + Just (Just expiretime) -> t >= expiretime + _ -> True + lookupexpire = headMaybe $ catMaybes $ + map (`M.lookup` expire) [Just u, Nothing] + checktrust want a = ifM (want <$> lookupTrust u) + ( do + void a + next $ next $ return True + , stop + ) + +data Expire = Expire (M.Map (Maybe UUID) (Maybe POSIXTime)) + +parseExpire :: [String] -> Annex Expire +parseExpire [] = error "Specify an expire time." +parseExpire ps = do + now <- liftIO getPOSIXTime + Expire . M.fromList <$> mapM (parse now) ps + where + parse now s = case separate (== ':') s of + (t, []) -> return (Nothing, parsetime now t) + (n, t) -> do + r <- Remote.nameToUUID n + return (Just r, parsetime now t) + parsetime _ "never" = Nothing + parsetime now s = case parseDuration s of + Nothing -> error $ "bad expire time: " ++ s + Just d -> Just (now - durationToPOSIXTime d) + +parseActivity :: Maybe String -> Maybe Activity +parseActivity Nothing = Nothing +parseActivity (Just s) = case readish s of + Nothing -> error $ "Unknown activity. Choose from: " ++ + unwords (map show [minBound..maxBound :: Activity]) + Just v -> Just v + diff --git a/Command/Fsck.hs b/Command/Fsck.hs index ec89a4351e..08753b6123 100644 --- a/Command/Fsck.hs +++ b/Command/Fsck.hs @@ -22,8 +22,8 @@ import Annex.Direct import Annex.Perms import Annex.Link import Logs.Location -import Logs.Presence import Logs.Trust +import Logs.Activity import Config.NumCopies import Annex.UUID import Utility.DataUnits @@ -39,7 +39,6 @@ import Data.Time.Clock.POSIX import Data.Time import System.Posix.Types (EpochTime) import System.Locale -import qualified Data.Map as M cmd :: [Command] cmd = [withOptions fsckOptions $ command "fsck" paramPaths seek @@ -58,22 +57,12 @@ incrementalScheduleOption :: Option incrementalScheduleOption = fieldOption [] "incremental-schedule" paramTime "schedule incremental fscking" -distributedOption :: Option -distributedOption = flagOption [] "distributed" "distributed fsck mode" - -expireOption :: Option -expireOption = fieldOption [] "expire" - (paramRepeating $ paramOptional paramRemote ++ ":" ++ paramTime) - "distributed expire mode" - fsckOptions :: [Option] fsckOptions = [ fsckFromOption , startIncrementalOption , moreIncrementalOption , incrementalScheduleOption - , distributedOption - , expireOption ] ++ keyOptions ++ annexedMatchingOptions seek :: CommandSeek @@ -81,28 +70,28 @@ seek ps = do from <- getOptionField fsckFromOption Remote.byNameWithUUID u <- maybe getUUID (pure . Remote.uuid) from i <- getIncremental u - d <- getDistributed withKeyOptions False - (\k -> startKey i d k =<< getNumCopies) - (withFilesInGit $ whenAnnexed $ start from i d) + (\k -> startKey i k =<< getNumCopies) + (withFilesInGit $ whenAnnexed $ start from i) ps withFsckDb i FsckDb.closeDb + recordActivity Fsck u -start :: Maybe Remote -> Incremental -> Distributed -> FilePath -> Key -> CommandStart -start from inc dist file key = do +start :: Maybe Remote -> Incremental -> FilePath -> Key -> CommandStart +start from inc file key = do v <- Backend.getBackend file key case v of Nothing -> stop Just backend -> do numcopies <- getFileNumCopies file case from of - Nothing -> go $ perform dist key file backend numcopies - Just r -> go $ performRemote dist key file backend numcopies r + Nothing -> go $ perform key file backend numcopies + Just r -> go $ performRemote key file backend numcopies r where go = runFsck inc file key -perform :: Distributed -> Key -> FilePath -> Backend -> NumCopies -> Annex Bool -perform dist key file backend numcopies = check +perform :: Key -> FilePath -> Backend -> NumCopies -> Annex Bool +perform key file backend numcopies = check -- order matters [ fixLink key file , verifyLocationLog key file @@ -110,14 +99,13 @@ perform dist key file backend numcopies = check , verifyDirectMode key file , checkKeySize key , checkBackend backend key (Just file) - , checkDistributed dist key Nothing , checkKeyNumCopies key file numcopies ] {- To fsck a remote, the content is retrieved to a tmp file, - and checked locally. -} -performRemote :: Distributed -> Key -> FilePath -> Backend -> NumCopies -> Remote -> Annex Bool -performRemote dist key file backend numcopies remote = +performRemote :: Key -> FilePath -> Backend -> NumCopies -> Remote -> Annex Bool +performRemote key file backend numcopies remote = dispatch =<< Remote.hasKey remote key where dispatch (Left err) = do @@ -136,7 +124,6 @@ performRemote dist key file backend numcopies remote = [ verifyLocationLogRemote key file remote present , checkKeySizeRemote key remote localcopy , checkBackendRemote backend key remote localcopy - , checkDistributed dist key (Just $ Remote.uuid remote) , checkKeyNumCopies key file numcopies ] withtmp a = do @@ -157,19 +144,18 @@ performRemote dist key file backend numcopies remote = ) dummymeter _ = noop -startKey :: Incremental -> Distributed -> Key -> NumCopies -> CommandStart -startKey inc dist key numcopies = +startKey :: Incremental -> Key -> NumCopies -> CommandStart +startKey inc key numcopies = case Backend.maybeLookupBackendName (Types.Key.keyBackendName key) of Nothing -> stop Just backend -> runFsck inc (key2file key) key $ - performKey dist key backend numcopies + performKey key backend numcopies -performKey :: Distributed -> Key -> Backend -> NumCopies -> Annex Bool -performKey dist key backend numcopies = check +performKey :: Key -> Backend -> NumCopies -> Annex Bool +performKey key backend numcopies = check [ verifyLocationLog key (key2file key) , checkKeySize key , checkBackend backend key Nothing - , checkDistributed dist key Nothing , checkKeyNumCopies key (key2file key) numcopies ] @@ -513,69 +499,3 @@ getIncremental u = do when (now - realToFrac started >= durationToPOSIXTime delta) $ resetStartTime u return True - -data Distributed - = NonDistributed - | Distributed POSIXTime - | DistributedExpire POSIXTime (M.Map (Maybe UUID) (Maybe POSIXTime)) - deriving (Show) - -getDistributed :: Annex Distributed -getDistributed = go =<< getOptionField expireOption parseexpire - where - go (Just m) = DistributedExpire <$> liftIO getPOSIXTime <*> pure m - go Nothing = ifM (getOptionFlag distributedOption) - ( Distributed <$> liftIO getPOSIXTime - , return NonDistributed - ) - - parseexpire Nothing = return Nothing - parseexpire (Just s) = do - now <- liftIO getPOSIXTime - Just . M.fromList <$> mapM (parseexpire' now) (words s) - parseexpire' now s = case separate (== ':') s of - (t, []) -> return (Nothing, parsetime now t) - (n, t) -> do - r <- Remote.nameToUUID n - return (Just r, parsetime now t) - parsetime _ "never" = Nothing - parsetime now s = case parseDuration s of - Nothing -> error $ "bad expire time: " ++ s - Just d -> Just (now - durationToPOSIXTime d) - -checkDistributed :: Distributed -> Key -> Maybe UUID -> Annex Bool -checkDistributed d k mu = do - go d - return True - where - go NonDistributed = noop - - -- This is called after fsck has checked the key's content, so - -- if the key is present in the annex now, we just need to update - -- the location log with the timestamp of the start of the fsck. - -- - -- Note that reusing this timestamp means that the same log line - -- is generated for each key, which keeps the size increase - -- of the git-annex branch down. - go (Distributed ts) = whenM (inAnnex k) $ do - u <- maybe getUUID return mu - logChange' (logThen ts) k u InfoPresent - - -- Get the location log for the key, and expire all entries - -- that are older than their uuid's listed expiration date. - -- (Except for the local repository.) - go (DistributedExpire ts m) = do - ls <- locationLog k - hereu <- getUUID - forM_ ls $ \l -> do - let u = toUUID (info l) - unless (u == hereu) $ - case lookupexpire u of - Just (Just expiretime) - | date l < expiretime -> - logChange' (logThen ts) k u InfoMissing - _ -> noop - where - lookupexpire u = headMaybe $ catMaybes $ - map (`M.lookup` m) [Just u, Nothing] - diff --git a/Logs.hs b/Logs.hs index 1f8cf9f9c0..252b5814d9 100644 --- a/Logs.hs +++ b/Logs.hs @@ -1,6 +1,6 @@ {- git-annex log file names - - - Copyright 2013-2014 Joey Hess + - Copyright 2013-2015 Joey Hess - - Licensed under the GNU GPL version 3 or higher. -} @@ -40,6 +40,7 @@ topLevelUUIDBasedLogs = , preferredContentLog , requiredContentLog , scheduleLog + , activityLog , differenceLog ] @@ -84,9 +85,13 @@ groupPreferredContentLog = "group-preferred-content.log" scheduleLog :: FilePath scheduleLog = "schedule.log" +activityLog :: FilePath +activityLog = "activity.log" + differenceLog :: FilePath differenceLog = "difference.log" + {- The pathname of the location log file for a given key. -} locationLogFile :: GitConfig -> Key -> String locationLogFile config key = branchHashDir config key keyFile key ++ ".log" diff --git a/Logs/Activity.hs b/Logs/Activity.hs new file mode 100644 index 0000000000..45262a6337 --- /dev/null +++ b/Logs/Activity.hs @@ -0,0 +1,37 @@ +{- git-annex activity log + - + - Copyright 2015 Joey Hess + - + - Licensed under the GNU GPL version 3 or higher. + -} + +module Logs.Activity ( + Log, + Activity(..), + recordActivity, + lastActivities, +) where + +import Data.Time.Clock.POSIX + +import Common.Annex +import qualified Annex.Branch +import Logs +import Logs.UUIDBased + +data Activity = Fsck + deriving (Eq, Read, Show, Enum, Bounded) + +recordActivity :: Activity -> UUID -> Annex () +recordActivity act uuid = do + ts <- liftIO getPOSIXTime + Annex.Branch.change activityLog $ + showLog id . changeLog ts uuid (show act) . parseLog readish + +lastActivities :: Maybe Activity -> Annex (Log Activity) +lastActivities wantact = parseLog onlywanted <$> Annex.Branch.get activityLog + where + onlywanted s = case readish s of + Just a | wanted a -> Just a + _ -> Nothing + wanted a = maybe True (a ==) wantact diff --git a/Logs/Location.hs b/Logs/Location.hs index 59375a5121..391edb6801 100644 --- a/Logs/Location.hs +++ b/Logs/Location.hs @@ -17,7 +17,6 @@ module Logs.Location ( LogStatus(..), logStatus, logChange, - logChange', loggedLocations, loggedLocationsHistorical, locationLog, diff --git a/Logs/Presence.hs b/Logs/Presence.hs index 469ed8de9f..60e0c542a7 100644 --- a/Logs/Presence.hs +++ b/Logs/Presence.hs @@ -16,7 +16,6 @@ module Logs.Presence ( addLog, readLog, logNow, - logThen, currentLog, currentLogInfo, historicalLogInfo, @@ -44,9 +43,6 @@ logNow s i = do now <- liftIO getPOSIXTime return $ LogLine now s i -logThen :: POSIXTime -> LogStatus -> String -> Annex LogLine -logThen t s i = return $ LogLine t s i - {- Reads a log and returns only the info that is still in effect. -} currentLogInfo :: FilePath -> Annex [String] currentLogInfo file = map info <$> currentLog file diff --git a/debian/changelog b/debian/changelog index f0aaf9b8e5..eea9bab0bb 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,4 +1,4 @@ -git-annex (5.20150328) UNRELEASED; urgency=medium +git-annex (5.20150405) UNRELEASED; urgency=medium * Prevent git-ls-files from double-expanding wildcards when an unexpanded wildcard is passed to a git-annex command like add or find. @@ -20,8 +20,8 @@ git-annex (5.20150328) UNRELEASED; urgency=medium multiple files. * import: --deduplicate and --cleanduplicates now output the keys corresponding to duplicated files they process. - * fsck: Added --distributed and --expire options, - for distributed fsck. + * expire: New command, for expiring inactive repositories. + * fsck: Record fsck activity for use by expire command. * Fix truncation of parameters that could occur when using xargs git-annex. * Significantly sped up processing of large numbers of directories passed to a single git-annex command. diff --git a/doc/git-annex-expire.mdwn b/doc/git-annex-expire.mdwn new file mode 100644 index 0000000000..ff167804a7 --- /dev/null +++ b/doc/git-annex-expire.mdwn @@ -0,0 +1,61 @@ +# NAME + +git-annex expire - expire inactive repositories + +# SYNOPSIS + +git annex expire `[repository:]time ...` + +# DESCRIPTION + +This command expires repositories that have not performed some activity +within a specified time period. A repository is expired by marking it as +dead. De-expiration is also done; if a dead repository performed some +activity recently, it is marked as semitrusted again. + +This can be useful when it's not possible to keep track of the state +of repositories manually. For example, a distributed network of +repositories where nobody can directly access all the repositories to +check their status. + +The repository can be specified using the name of a remote, +or the description or uuid of the repository. + +The time is in the form "60d" or "1y". A time of "never" will disable +expiration. + +If a time is specified without a repository, it is used as the default +value for all repositories. Note that the current repository is never +expired. + +# OPTIONS + +* `--activity=Name` + + Specify the activity that a repository must have performed to avoid being + expired. The default is any activity. + + Currently, the only activity that can be performed to avoid expiration + is `git annex fsck`. Note that fscking a remote updates the + expiration of the remote repository, not the local repository. + + The first version of git-annex that recorded fsck activity was + 5.20150405. + +# SEE ALSO + +[[git-annex]](1) + +[[git-annex-fsck]](1) + +[[git-annex-schedule]](1) + +[[git-annex-dead]](1) + +[[git-annex-semitrust]](1) + +# AUTHOR + +Joey Hess + +Warning: Automatically converted into a man page by mdwn2man. Edit with care. diff --git a/doc/git-annex-fsck.mdwn b/doc/git-annex-fsck.mdwn index 1f5d75f3ee..cb27fe4526 100644 --- a/doc/git-annex-fsck.mdwn +++ b/doc/git-annex-fsck.mdwn @@ -53,44 +53,6 @@ With parameters, only the specified files are checked. git annex fsck --incremental-schedule 30d --time-limit 5h -* `--distributed` - - Normally, fsck only fixes the git-annex location logs when an inconsistecy - is detected. In distributed mode, each file that is checked will result - in a location log update noting the time that it was present. - - This is useful in situations where repositories cannot be trusted to - continue to exist. By running a periodic distributed fsck, those - repositories can verify that they still exist and that the information - about their contents is still accurate. - - This is not the default mode, because each distributed fsck increases - the size of the git-annex branch. While it takes care to log identical - location tracking lines for all keys, which will delta-compress well, - there is still overhead in committing the changes. If this causes - the git-annex branch to grow too big, it can be pruned using - [[git-annex-forget]](1) - -* `--expire="[repository:]time`..." - - This option makes the fsck check for location logs of the specified - repository that have not been updated by a distributed fsck within the - specified time period. Such stale location logs are then thrown out, so - git-annex will no longer think that a repository contains data, if it is - not participating in distributed fscking. - - The repository can be specified using the name of a remote, - or the description or uuid of the repository. If a time is specified - without a repository, it is used as the default value for all - repositories. Note that location logs for the current repository are - never expired, since they can be verified directly. - - The time is in the form "60d" or "1y". A time of "never" will disable - expiration. - - Note that a remote can always run `fsck` later on to re-update the - location log if it was expired in error. - * `--numcopies=N` Override the normally configured number of copies. diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index 27439cd3ad..11fbc6e017 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -302,6 +302,11 @@ subdirectories). See [[git-annex-fsck]](1) for details. +* `expire [repository:]time ...` + + Expires repositories that have not recently performed an activity + (such as a fsck). + * `unused` Checks the annex for data that does not correspond to any files present diff --git a/doc/internals.mdwn b/doc/internals.mdwn index 824655a92a..7e8f4b19a0 100644 --- a/doc/internals.mdwn +++ b/doc/internals.mdwn @@ -247,6 +247,14 @@ Example: 42bf2035-0636-461d-a367-49e9dfd361dd fsck self 30m every day at any time; fsck 4b3ebc86-0faf-4892-83c5-ce00cbe30f0a 1h every year at any time timestamp=1385646997.053162s +## `activity.log` + +Used to record the times of activities, such as fscks. + +Example: + + 42bf2035-0636-461d-a367-49e9dfd361dd Fsck timestamp=1422387398.30395s + ## `transitions.log` Used to record transitions, eg by `git annex forget`