forget --drop-dead: Completely removes mentions of repositories that have been marked as dead from the git-annex branch.

Wrote nice pure transition calculator, and ugly code to stage its results
into the git-annex branch. Also had to split up several Log modules
that Annex.Branch needed to use, but that themselves used Annex.Branch.

The transition calculator is limited to looking at and changing one file at
a time. While this made the implementation relatively easy, it precludes
transitions that do stuff like deleting old url log files for keys that are
being removed because they are no longer present anywhere.
This commit is contained in:
Joey Hess 2013-08-31 17:38:33 -04:00
parent 2f57d74534
commit 0831e18372
10 changed files with 279 additions and 155 deletions

View file

@ -46,8 +46,12 @@ import Annex.CatFile
import Annex.Perms
import qualified Annex
import Utility.Env
import Logs
import Logs.Transitions
import Logs.Trust.Pure
import Annex.ReplaceFile
import qualified Annex.Queue
import Annex.Branch.Transitions
{- Name of the branch that is used to store git-annex's information. -}
name :: Git.Ref
@ -194,7 +198,10 @@ get' :: FilePath -> Annex String
get' file = go =<< getJournalFile file
where
go (Just journalcontent) = return journalcontent
go Nothing = withIndex $ L.unpack <$> catFile fullname file
go Nothing = getRaw file
getRaw :: FilePath -> Annex String
getRaw file = withIndex $ L.unpack <$> catFile fullname file
{- Applies a function to modifiy the content of a file.
-
@ -272,13 +279,17 @@ commitBranch' branchref message parents = do
files :: Annex [FilePath]
files = do
update
withIndex $ do
bfiles <- inRepo $ Git.Command.pipeNullSplitZombie
[ Params "ls-tree --name-only -r -z"
, Param $ show fullname
]
jfiles <- getJournalledFiles
return $ jfiles ++ bfiles
(++)
<$> branchFiles
<*> getJournalledFiles
{- Files in the branch, not including any from journalled changes,
- and without updating the branch. -}
branchFiles :: Annex [FilePath]
branchFiles = withIndex $ inRepo $ Git.Command.pipeNullSplitZombie
[ Params "ls-tree --name-only -r -z"
, Param $ show fullname
]
{- Populates the branch's index file with the current branch contents.
-
@ -436,20 +447,60 @@ getIgnoredRefs = S.fromList . mapMaybe Git.Sha.extractSha . lines <$> content
- commits it to the branch, or creates a new branch, and returns
- the branch's ref. -}
performTransitions :: Transitions -> Bool -> Annex Git.Ref
performTransitions ts neednewbranch = withIndex $ do
when (inTransitions ForgetDeadRemotes ts) $
error "TODO ForgetDeadRemotes transition"
if neednewbranch
then do
committedref <- inRepo $ Git.Branch.commit message fullname []
setIndexSha committedref
return committedref
else do
ref <- getBranch
commitBranch ref message [fullname]
getBranch
performTransitions ts neednewbranch = do
-- For simplicity & speed, we're going to use the Annex.Queue to
-- update the git-annex branch, while it usually holds changes
-- for the head branch. Flush any such changes.
Annex.Queue.flush
withIndex $ do
run $ mapMaybe getTransitionCalculator $ transitionList ts
Annex.Queue.flush
if neednewbranch
then do
committedref <- inRepo $ Git.Branch.commit message fullname []
setIndexSha committedref
return committedref
else do
ref <- getBranch
commitBranch ref message [fullname]
getBranch
where
message
| neednewbranch = "new branch for transition " ++ tdesc
| otherwise = "continuing transition " ++ tdesc
tdesc = show $ map describeTransition $ transitionList ts
{- The changes to make to the branch are calculated and applied to
- the branch directly, rather than going through the journal,
- which would be innefficient. (And the journal is not designed
- to hold changes to every file in the branch at once.)
-
- When a file in the branch is changed by transition code,
- that value is remembered and fed into the code for subsequent
- transitions.
-}
run [] = noop
run changers = do
trustmap <- calcTrustMap <$> getRaw trustLog
fs <- branchFiles
hasher <- inRepo hashObjectStart
forM_ fs $ \f -> do
content <- getRaw f
apply changers hasher f content trustmap
liftIO $ hashObjectStop hasher
apply [] _ _ _ _ = return ()
apply (changer:rest) hasher file content trustmap =
case changer file content trustmap of
RemoveFile -> do
Annex.Queue.addUpdateIndex
=<< inRepo (Git.UpdateIndex.unstageFile file)
-- File is deleted; can't run any other
-- transitions on it.
return ()
ChangeFile content' -> do
sha <- inRepo $ hashObject BlobObject content'
Annex.Queue.addUpdateIndex $ Git.UpdateIndex.pureStreamer $
Git.UpdateIndex.updateIndexLine sha FileBlob (asTopFilePath file)
apply rest hasher file content' trustmap
PreserveFile ->
apply rest hasher file content trustmap

View file

@ -0,0 +1,53 @@
{- git-annex branch transitions
-
- Copyright 2013 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Annex.Branch.Transitions (
FileTransition(..),
getTransitionCalculator
) where
import Logs
import Logs.Transitions
import Logs.UUIDBased as UUIDBased
import Logs.Presence.Pure as Presence
import Types.TrustLevel
import Types.UUID
import qualified Data.Map as M
data FileTransition
= ChangeFile String
| RemoveFile
| PreserveFile
type TransitionCalculator = FilePath -> String -> TrustMap -> FileTransition
getTransitionCalculator :: Transition -> Maybe TransitionCalculator
getTransitionCalculator ForgetGitHistory = Nothing
getTransitionCalculator ForgetDeadRemotes = Just dropDead
dropDead :: FilePath -> String -> TrustMap -> FileTransition
dropDead f content trustmap = case getLogVariety f of
Just UUIDBasedLog -> ChangeFile $
UUIDBased.showLog id $ dropDeadFromUUIDBasedLog trustmap $ UUIDBased.parseLog Just content
Just (PresenceLog _) ->
let newlog = Presence.compactLog $ dropDeadFromPresenceLog trustmap $ Presence.parseLog content
in if null newlog
then RemoveFile
else ChangeFile $ Presence.showLog newlog
Nothing -> PreserveFile
dropDeadFromUUIDBasedLog :: TrustMap -> UUIDBased.Log String -> UUIDBased.Log String
dropDeadFromUUIDBasedLog trustmap = M.filterWithKey $ notDead trustmap . const
{- Presence logs can contain UUIDs or other values. Any line that matches
- a dead uuid is dropped; any other values are passed through. -}
dropDeadFromPresenceLog :: TrustMap -> [Presence.LogLine] -> [Presence.LogLine]
dropDeadFromPresenceLog trustmap = filter $ notDead trustmap (toUUID . Presence.info)
notDead :: TrustMap -> (v -> UUID) -> v -> Bool
notDead trustmap a v = M.findWithDefault SemiTrusted (a v) trustmap /= DeadTrusted

View file

@ -12,30 +12,41 @@ import Command
import qualified Annex.Branch as Branch
import Logs.Transitions
import qualified Annex
import qualified Option
import Data.Time.Clock.POSIX
def :: [Command]
def = [command "forget" paramNothing seek
def = [withOptions forgetOptions $ command "forget" paramNothing seek
SectionMaintenance "prune git-annex branch history"]
forgetOptions :: [Option]
forgetOptions = [dropDeadOption]
dropDeadOption :: Option
dropDeadOption = Option.flag [] "drop-dead" "drop references to dead repositories"
seek :: [CommandSeek]
seek = [withNothing start]
seek = [withFlag dropDeadOption $ \dropdead ->
withNothing $ start dropdead]
start :: CommandStart
start = do
start :: Bool -> CommandStart
start dropdead = do
showStart "forget" "git-annex"
next $ perform =<< Annex.getState Annex.force
perform :: Bool -> CommandPerform
perform True = do
now <- liftIO getPOSIXTime
let ts = addTransition now ForgetGitHistory noTransitions
let basets = addTransition now ForgetGitHistory noTransitions
let ts = if dropdead
then addTransition now ForgetDeadRemotes basets
else basets
next $ perform ts =<< Annex.getState Annex.force
perform :: Transitions -> Bool -> CommandPerform
perform ts True = do
recordTransitions Branch.change ts
-- get branch committed before contining with the transition
Branch.update
void $ Branch.performTransitions ts True
next $ return True
perform False = do
perform _ False = do
showLongNote "To forget git-annex branch history, you must specify --force. This deletes metadata!"
stop

View file

@ -12,36 +12,18 @@
-}
module Logs.Presence (
LogStatus(..),
LogLine(LogLine),
module X,
addLog,
readLog,
getLog,
parseLog,
showLog,
logNow,
compactLog,
currentLog,
prop_parse_show_log,
currentLog
) where
import Data.Time.Clock.POSIX
import Data.Time
import System.Locale
import qualified Data.Map as M
import Logs.Presence.Pure as X
import Common.Annex
import qualified Annex.Branch
import Utility.QuickCheck
data LogLine = LogLine {
date :: POSIXTime,
status :: LogStatus,
info :: String
} deriving (Eq, Show)
data LogStatus = InfoPresent | InfoMissing
deriving (Eq, Show, Bounded, Enum)
addLog :: FilePath -> LogLine -> Annex ()
addLog file line = Annex.Branch.change file $ \s ->
@ -52,29 +34,6 @@ addLog file line = Annex.Branch.change file $ \s ->
readLog :: FilePath -> Annex [LogLine]
readLog = parseLog <$$> Annex.Branch.get
{- Parses a log file. Unparseable lines are ignored. -}
parseLog :: String -> [LogLine]
parseLog = mapMaybe parseline . lines
where
parseline l = LogLine
<$> (utcTimeToPOSIXSeconds <$> parseTime defaultTimeLocale "%s%Qs" d)
<*> parsestatus s
<*> pure rest
where
(d, pastd) = separate (== ' ') l
(s, rest) = separate (== ' ') pastd
parsestatus "1" = Just InfoPresent
parsestatus "0" = Just InfoMissing
parsestatus _ = Nothing
{- Generates a log file. -}
showLog :: [LogLine] -> String
showLog = unlines . map genline
where
genline (LogLine d s i) = unwords [show d, genstatus s, i]
genstatus InfoPresent = "1"
genstatus InfoMissing = "0"
{- Generates a new LogLine with the current date. -}
logNow :: LogStatus -> String -> Annex LogLine
logNow s i = do
@ -84,39 +43,3 @@ logNow s i = do
{- Reads a log and returns only the info that is still in effect. -}
currentLog :: FilePath -> Annex [String]
currentLog file = map info . filterPresent <$> readLog file
{- Given a log, returns only the info that is are still in effect. -}
getLog :: String -> [String]
getLog = map info . filterPresent . parseLog
{- Returns the info from LogLines that are in effect. -}
filterPresent :: [LogLine] -> [LogLine]
filterPresent = filter (\l -> InfoPresent == status l) . compactLog
{- Compacts a set of logs, returning a subset that contains the current
- status. -}
compactLog :: [LogLine] -> [LogLine]
compactLog = M.elems . foldr mapLog M.empty
type LogMap = M.Map String LogLine
{- Inserts a log into a map of logs, if the log has better (ie, newer)
- information than the other logs in the map -}
mapLog :: LogLine -> LogMap -> LogMap
mapLog l m
| better = M.insert i l m
| otherwise = m
where
better = maybe True newer $ M.lookup i m
newer l' = date l' <= date l
i = info l
instance Arbitrary LogLine where
arbitrary = LogLine
<$> arbitrary
<*> elements [minBound..maxBound]
<*> arbitrary `suchThat` ('\n' `notElem`)
prop_parse_show_log :: [LogLine] -> Bool
prop_parse_show_log l = parseLog (showLog l) == l

84
Logs/Presence/Pure.hs Normal file
View file

@ -0,0 +1,84 @@
{- git-annex presence log, pure operations
-
- Copyright 2010-2013 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Logs.Presence.Pure where
import Data.Time.Clock.POSIX
import Data.Time
import System.Locale
import qualified Data.Map as M
import Common.Annex
import Utility.QuickCheck
data LogLine = LogLine {
date :: POSIXTime,
status :: LogStatus,
info :: String
} deriving (Eq, Show)
data LogStatus = InfoPresent | InfoMissing
deriving (Eq, Show, Bounded, Enum)
{- Parses a log file. Unparseable lines are ignored. -}
parseLog :: String -> [LogLine]
parseLog = mapMaybe parseline . lines
where
parseline l = LogLine
<$> (utcTimeToPOSIXSeconds <$> parseTime defaultTimeLocale "%s%Qs" d)
<*> parsestatus s
<*> pure rest
where
(d, pastd) = separate (== ' ') l
(s, rest) = separate (== ' ') pastd
parsestatus "1" = Just InfoPresent
parsestatus "0" = Just InfoMissing
parsestatus _ = Nothing
{- Generates a log file. -}
showLog :: [LogLine] -> String
showLog = unlines . map genline
where
genline (LogLine d s i) = unwords [show d, genstatus s, i]
genstatus InfoPresent = "1"
genstatus InfoMissing = "0"
{- Given a log, returns only the info that is are still in effect. -}
getLog :: String -> [String]
getLog = map info . filterPresent . parseLog
{- Returns the info from LogLines that are in effect. -}
filterPresent :: [LogLine] -> [LogLine]
filterPresent = filter (\l -> InfoPresent == status l) . compactLog
{- Compacts a set of logs, returning a subset that contains the current
- status. -}
compactLog :: [LogLine] -> [LogLine]
compactLog = M.elems . foldr mapLog M.empty
type LogMap = M.Map String LogLine
{- Inserts a log into a map of logs, if the log has better (ie, newer)
- information than the other logs in the map -}
mapLog :: LogLine -> LogMap -> LogMap
mapLog l m
| better = M.insert i l m
| otherwise = m
where
better = maybe True newer $ M.lookup i m
newer l' = date l' <= date l
i = info l
instance Arbitrary LogLine where
arbitrary = LogLine
<$> arbitrary
<*> elements [minBound..maxBound]
<*> arbitrary `suchThat` ('\n' `notElem`)
prop_parse_show_log :: [LogLine] -> Bool
prop_parse_show_log l = parseLog (showLog l) == l

View file

@ -7,16 +7,6 @@
- done that is listed in the remote branch by checking that the local
- branch contains the same transition, with the same or newer start time.
-
- When a remote branch that has had an transition performed on it
- becomes available for merging into the local git-annex branch,
- the transition is first performed on the local branch.
-
- When merging a remote branch into the local git-annex branch,
- all transitions that have been performed on the local branch must also
- have been performed on the remote branch already. (Or it would be
- possible to perform the transitions on a fixup branch and merge it,
- but that would be expensive.)
-
- Copyright 2013 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
@ -86,9 +76,6 @@ parseTransitionLine s = TransitionLine <$> pdate ds <*> readish ts
combineTransitions :: [Transitions] -> Transitions
combineTransitions = S.unions
inTransitions :: Transition -> Transitions -> Bool
inTransitions t = not . S.null . S.filter (\l -> transition l == t)
transitionList :: Transitions -> [Transition]
transitionList = map transition . S.elems

View file

@ -6,6 +6,7 @@
-}
module Logs.Trust (
module X,
trustLog,
TrustLevel(..),
trustGet,
@ -16,8 +17,6 @@ module Logs.Trust (
lookupTrust,
trustMapLoad,
trustMapRaw,
prop_parse_show_TrustLog,
) where
import qualified Data.Map as M
@ -31,6 +30,7 @@ import Logs
import Logs.UUIDBased
import Remote.List
import qualified Types.Remote
import Logs.Trust.Pure as X
{- Returns a list of UUIDs that the trustLog indicates have the
- specified trust level.
@ -94,26 +94,4 @@ trustMapLoad = do
{- Does not include forcetrust or git config values, just those from the
- log file. -}
trustMapRaw :: Annex TrustMap
trustMapRaw = simpleMap . parseLog (Just . parseTrustLog)
<$> Annex.Branch.get trustLog
{- The trust.log used to only list trusted repos, without a field for the
- trust status, which is why this defaults to Trusted. -}
parseTrustLog :: String -> TrustLevel
parseTrustLog s = maybe Trusted parse $ headMaybe $ words s
where
parse "1" = Trusted
parse "0" = UnTrusted
parse "X" = DeadTrusted
parse _ = SemiTrusted
showTrustLog :: TrustLevel -> String
showTrustLog Trusted = "1"
showTrustLog UnTrusted = "0"
showTrustLog DeadTrusted = "X"
showTrustLog SemiTrusted = "?"
prop_parse_show_TrustLog :: Bool
prop_parse_show_TrustLog = all check [minBound .. maxBound]
where
check l = parseTrustLog (showTrustLog l) == l
trustMapRaw = calcTrustMap <$> Annex.Branch.get trustLog

36
Logs/Trust/Pure.hs Normal file
View file

@ -0,0 +1,36 @@
{- git-annex trust log, pure operations
-
- Copyright 2010-2013 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Logs.Trust.Pure where
import Common.Annex
import Types.TrustLevel
import Logs.UUIDBased
calcTrustMap :: String -> TrustMap
calcTrustMap = simpleMap . parseLog (Just . parseTrustLog)
{- The trust.log used to only list trusted repos, without a field for the
- trust status, which is why this defaults to Trusted. -}
parseTrustLog :: String -> TrustLevel
parseTrustLog s = maybe Trusted parse $ headMaybe $ words s
where
parse "1" = Trusted
parse "0" = UnTrusted
parse "X" = DeadTrusted
parse _ = SemiTrusted
showTrustLog :: TrustLevel -> String
showTrustLog Trusted = "1"
showTrustLog UnTrusted = "0"
showTrustLog DeadTrusted = "X"
showTrustLog SemiTrusted = "?"
prop_parse_show_TrustLog :: Bool
prop_parse_show_TrustLog = all check [minBound .. maxBound]
where
check l = parseTrustLog (showTrustLog l) == l

2
debian/changelog vendored
View file

@ -3,6 +3,8 @@ git-annex (4.20130828) UNRELEASED; urgency=low
* forget: New command, causes git-annex branch history to be forgotten
in a way that will spread to other clones of the repository.
(As long as they're running this version or newer of git-annex.)
* forget --drop-dead: Completely removes mentions of repositories that
have been marked as dead from the git-annex branch.
* sync, assistant: Force push of the git-annex branch. Necessary
to ensure it gets pushed to remotes after being rewritten by forget.

View file

@ -482,17 +482,16 @@ subdirectories).
* forget
Causes the git-annex branch to be rewritten, throwing away historical
data about past locations of files, files that are no longer present on
any remote, etc. The resulting branch will use less space, but for
example `git annex log` will not be able to show where files used to
be located.
data about past locations of files. The resulting branch will use less
space, but `git annex log` will not be able to show where
files used to be located.
To also prune references to remotes that have been marked as dead,
specify --dead.
To also prune references to repositories that have been marked as dead,
specify --drop-dead.
When this rewritten branch is merged into other clones of
the repository, git-annex will automatically perform the same rewriting
to their local git-annex branch. So the forgetfulness will automatically
to their local git-annex branches. So the forgetfulness will automatically
propigate out from its starting point until all repositories running
git-annex have forgotten their old history. (You may need to force
git to push the branch to any git repositories not running git-annex.