automatically launch git repository repair

Added a RemoteChecker thread, that waits for problems to be reported with
remotes, and checks if their git repository is in need of repair.

Currently, only failures to sync with the remote cause a problem to be
reported. This seems enough, but we'll see.

Plugging in a removable drive with a repository on it that is corrupted
does automatically repair the repository, as long as the corruption causes
git push or git pull to fail. Some types of corruption do not, eg
missing/corrupt objects for blobs that git push doesn't need to look at.

So, this is not really a replacement for scheduled git repository fscking.
But it does make the assistant more robust.

This commit is sponsored by Fernando Jimenez.
This commit is contained in:
Joey Hess 2013-10-27 16:42:13 -04:00
parent 3c08fee76b
commit a7821c0581
11 changed files with 129 additions and 39 deletions

View file

@ -23,6 +23,7 @@ import Assistant.Threads.TransferWatcher
import Assistant.Threads.Transferrer import Assistant.Threads.Transferrer
import Assistant.Threads.SanityChecker import Assistant.Threads.SanityChecker
import Assistant.Threads.Cronner import Assistant.Threads.Cronner
import Assistant.Threads.RemoteChecker
#ifdef WITH_CLIBS #ifdef WITH_CLIBS
import Assistant.Threads.MountWatcher import Assistant.Threads.MountWatcher
#endif #endif
@ -129,6 +130,7 @@ startDaemon assistant foreground startdelay listenhost startbrowser = do
, assist $ daemonStatusThread , assist $ daemonStatusThread
, assist $ sanityCheckerDailyThread , assist $ sanityCheckerDailyThread
, assist $ sanityCheckerHourlyThread , assist $ sanityCheckerHourlyThread
, assist $ remoteCheckerThread urlrenderer
#ifdef WITH_CLIBS #ifdef WITH_CLIBS
, assist $ mountWatcherThread , assist $ mountWatcherThread
#endif #endif

View file

@ -19,6 +19,7 @@ import Git.Remote (RemoteName)
import Data.String import Data.String
import qualified Data.Text as T import qualified Data.Text as T
import qualified Control.Exception as E
#ifdef WITH_WEBAPP #ifdef WITH_WEBAPP
import Assistant.Monad import Assistant.Monad
@ -174,6 +175,17 @@ fsckAlert button n = baseActivityAlert
, alertButton = Just button , alertButton = Just button
} }
showFscking :: UrlRenderer -> Maybe RemoteName -> IO (Either E.SomeException a) -> Assistant a
showFscking urlrenderer remotename a = do
#ifdef WITH_WEBAPP
button <- mkAlertButton False (T.pack "Configure") urlrenderer ConfigFsckR
r <- alertDuring (fsckAlert button remotename) $
liftIO a
either (liftIO . E.throwIO) return r
#else
a
#endif
brokenRepositoryAlert :: AlertButton -> Alert brokenRepositoryAlert :: AlertButton -> Alert
brokenRepositoryAlert = errorAlert "Serious problems have been detected with your repository. This needs your immediate attention!" brokenRepositoryAlert = errorAlert "Serious problems have been detected with your repository. This needs your immediate attention!"

View file

@ -39,6 +39,7 @@ import Assistant.Types.Pushes
import Assistant.Types.BranchChange import Assistant.Types.BranchChange
import Assistant.Types.Commits import Assistant.Types.Commits
import Assistant.Types.Changes import Assistant.Types.Changes
import Assistant.Types.RemoteProblem
import Assistant.Types.Buddies import Assistant.Types.Buddies
import Assistant.Types.NetMessager import Assistant.Types.NetMessager
import Assistant.Types.ThreadName import Assistant.Types.ThreadName
@ -63,6 +64,7 @@ data AssistantData = AssistantData
, failedPushMap :: FailedPushMap , failedPushMap :: FailedPushMap
, commitChan :: CommitChan , commitChan :: CommitChan
, changePool :: ChangePool , changePool :: ChangePool
, remoteProblemChan :: RemoteProblemChan
, branchChangeHandle :: BranchChangeHandle , branchChangeHandle :: BranchChangeHandle
, buddyList :: BuddyList , buddyList :: BuddyList
, netMessager :: NetMessager , netMessager :: NetMessager
@ -80,6 +82,7 @@ newAssistantData st dstatus = AssistantData
<*> newFailedPushMap <*> newFailedPushMap
<*> newCommitChan <*> newCommitChan
<*> newChangePool <*> newChangePool
<*> newRemoteProblemChan
<*> newBranchChangeHandle <*> newBranchChangeHandle
<*> newBuddyList <*> newBuddyList
<*> newNetMessager <*> newNetMessager

View file

@ -0,0 +1,23 @@
{- git-annex assistant remote problem handling
-
- Copyright 2013 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Assistant.RemoteProblem where
import Assistant.Common
import Utility.TList
import Control.Concurrent.STM
{- Gets all remotes that have problems.
- Blocks until there is at least one. -}
getRemoteProblems :: Assistant [Remote]
getRemoteProblems = (atomically . getTList) <<~ remoteProblemChan
{- Indicates that there was a problem accessing a remote, and the problem
- appears to not be a transient (eg network connection) problem. -}
remoteHasProblem :: Remote -> Assistant ()
remoteHasProblem r = (atomically . flip consTList r) <<~ remoteProblemChan

View file

@ -33,24 +33,21 @@ import Control.Concurrent.Async
{- When the FsckResults require a repair, tries to do a non-destructive {- When the FsckResults require a repair, tries to do a non-destructive
- repair. If that fails, pops up an alert. -} - repair. If that fails, pops up an alert. -}
repairWhenNecessary :: UrlRenderer -> UUID -> Maybe Remote -> FsckResults -> Assistant () repairWhenNecessary :: UrlRenderer -> UUID -> Maybe Remote -> FsckResults -> Assistant Bool
repairWhenNecessary urlrenderer u mrmt fsckresults repairWhenNecessary urlrenderer u mrmt fsckresults
| foundBroken fsckresults = do | foundBroken fsckresults = do
liftAnnex $ writeFsckResults u fsckresults liftAnnex $ writeFsckResults u fsckresults
repodesc <- liftAnnex $ Remote.prettyUUID u repodesc <- liftAnnex $ Remote.prettyUUID u
handle =<< alertDuring (repairingAlert repodesc) ok <- alertWhile (repairingAlert repodesc)
(runRepair u mrmt False) (runRepair u mrmt False)
| otherwise = noop
where
handle True = return ()
handle False = do
#ifdef WITH_WEBAPP #ifdef WITH_WEBAPP
unless ok $ do
button <- mkAlertButton True (T.pack "Click Here") urlrenderer $ button <- mkAlertButton True (T.pack "Click Here") urlrenderer $
RepairRepositoryR u RepairRepositoryR u
void $ addAlert $ brokenRepositoryAlert button void $ addAlert $ brokenRepositoryAlert button
#else return ok
return ()
#endif #endif
| otherwise = return False
runRepair :: UUID -> Maybe Remote -> Bool -> Assistant Bool runRepair :: UUID -> Maybe Remote -> Bool -> Assistant Bool
runRepair u mrmt destructiverepair = do runRepair u mrmt destructiverepair = do

View file

@ -33,6 +33,7 @@ import Assistant.NamedThread
import Assistant.Threads.Watcher (watchThread, WatcherControl(..)) import Assistant.Threads.Watcher (watchThread, WatcherControl(..))
import Assistant.TransferSlots import Assistant.TransferSlots
import Assistant.TransferQueue import Assistant.TransferQueue
import Assistant.RemoteProblem
import Logs.Transfer import Logs.Transfer
import Data.Time.Clock import Data.Time.Clock
@ -59,11 +60,14 @@ import Control.Concurrent
reconnectRemotes :: Bool -> [Remote] -> Assistant () reconnectRemotes :: Bool -> [Remote] -> Assistant ()
reconnectRemotes _ [] = noop reconnectRemotes _ [] = noop
reconnectRemotes notifypushes rs = void $ do reconnectRemotes notifypushes rs = void $ do
rs' <- filterM (checkavailable . Remote.repo) rs rs' <- liftIO $ filterM (Remote.checkAvailable True) rs
unless (null rs') $ do unless (null rs') $ do
modifyDaemonStatus_ $ \s -> s modifyDaemonStatus_ $ \s -> s
{ desynced = S.union (S.fromList $ map Remote.uuid xmppremotes) (desynced s) } { desynced = S.union (S.fromList $ map Remote.uuid xmppremotes) (desynced s) }
failedrs <- syncAction rs' (const go) failedrs <- syncAction rs' (const go)
forM_ failedrs $ \r ->
whenM (liftIO $ Remote.checkAvailable False r) $
remoteHasProblem r
mapM_ signal $ filter (`notElem` failedrs) rs' mapM_ signal $ filter (`notElem` failedrs) rs'
where where
gitremotes = filter (notspecialremote . Remote.repo) rs gitremotes = filter (notspecialremote . Remote.repo) rs
@ -90,10 +94,6 @@ reconnectRemotes notifypushes rs = void $ do
signal r = liftIO . mapM_ (flip tryPutMVar ()) signal r = liftIO . mapM_ (flip tryPutMVar ())
=<< fromMaybe [] . M.lookup (Remote.uuid r) . connectRemoteNotifiers =<< fromMaybe [] . M.lookup (Remote.uuid r) . connectRemoteNotifiers
<$> getDaemonStatus <$> getDaemonStatus
checkavailable r
| Git.repoIsLocal r || Git.repoIsLocalUnknown r =
liftIO $ doesDirectoryExist $ Git.repoPath r
| otherwise = return True
{- Updates the local sync branch, then pushes it to all remotes, in {- Updates the local sync branch, then pushes it to all remotes, in
- parallel, along with the git-annex branch. This is the same - parallel, along with the git-annex branch. This is the same

View file

@ -5,7 +5,7 @@
- Licensed under the GNU GPL version 3 or higher. - Licensed under the GNU GPL version 3 or higher.
-} -}
{-# LANGUAGE DeriveDataTypeable, CPP #-} {-# LANGUAGE DeriveDataTypeable #-}
module Assistant.Threads.Cronner ( module Assistant.Threads.Cronner (
cronnerThread cronnerThread
@ -29,10 +29,6 @@ import Assistant.Types.UrlRenderer
import Assistant.Alert import Assistant.Alert
import Remote import Remote
import qualified Types.Remote as Remote import qualified Types.Remote as Remote
#ifdef WITH_WEBAPP
import Assistant.WebApp.Types
#endif
import Git.Remote (RemoteName)
import qualified Git.Fsck import qualified Git.Fsck
import Assistant.Repair import Assistant.Repair
import qualified Git import qualified Git
@ -43,8 +39,6 @@ import Data.Time.LocalTime
import Data.Time.Clock import Data.Time.Clock
import qualified Data.Map as M import qualified Data.Map as M
import qualified Data.Set as S import qualified Data.Set as S
import qualified Control.Exception as E
import qualified Data.Text as T
{- Loads schedules for this repository, and fires off one thread for each {- Loads schedules for this repository, and fires off one thread for each
- scheduled event that runs on this repository. Each thread sleeps until - scheduled event that runs on this repository. Each thread sleeps until
@ -191,7 +185,7 @@ runActivity' urlrenderer (ScheduledSelfFsck _ d) = do
void $ batchCommand program (Param "fsck" : annexFsckParams d) void $ batchCommand program (Param "fsck" : annexFsckParams d)
Git.Fsck.findBroken True g Git.Fsck.findBroken True g
u <- liftAnnex getUUID u <- liftAnnex getUUID
repairWhenNecessary urlrenderer u Nothing fsckresults void $ repairWhenNecessary urlrenderer u Nothing fsckresults
mapM_ reget =<< liftAnnex (dirKeys gitAnnexBadDir) mapM_ reget =<< liftAnnex (dirKeys gitAnnexBadDir)
where where
reget k = queueTransfers "fsck found bad file; redownloading" Next k Nothing Download reget k = queueTransfers "fsck found bad file; redownloading" Next k Nothing Download
@ -220,18 +214,7 @@ runActivity' urlrenderer (ScheduledRemoteFsck u s d) = handle =<< liftAnnex (rem
if Git.repoIsLocal r && not (Git.repoIsLocalUnknown r) if Git.repoIsLocal r && not (Git.repoIsLocalUnknown r)
then Just <$> Git.Fsck.findBroken True r then Just <$> Git.Fsck.findBroken True r
else pure Nothing else pure Nothing
maybe noop (repairWhenNecessary urlrenderer u (Just rmt)) fsckresults maybe noop (void . repairWhenNecessary urlrenderer u (Just rmt)) fsckresults
showFscking :: UrlRenderer -> Maybe RemoteName -> IO (Either E.SomeException a) -> Assistant a
showFscking urlrenderer remotename a = do
#ifdef WITH_WEBAPP
button <- mkAlertButton False (T.pack "Configure") urlrenderer ConfigFsckR
r <- alertDuring (fsckAlert button remotename) $
liftIO a
either (liftIO . E.throwIO) return r
#else
a
#endif
annexFsckParams :: Duration -> [CommandParam] annexFsckParams :: Duration -> [CommandParam]
annexFsckParams d = annexFsckParams d =

View file

@ -13,6 +13,7 @@ import Assistant.Pushes
import Assistant.DaemonStatus import Assistant.DaemonStatus
import Assistant.Sync import Assistant.Sync
import Utility.ThreadScheduler import Utility.ThreadScheduler
import qualified Remote
import qualified Types.Remote as Remote import qualified Types.Remote as Remote
{- This thread retries pushes that failed before. -} {- This thread retries pushes that failed before. -}
@ -42,7 +43,7 @@ pushThread = namedThread "Pusher" $ runEvery (Seconds 2) <~> do
- to avoid ugly messages when a removable drive is not attached. - to avoid ugly messages when a removable drive is not attached.
-} -}
pushTargets :: Assistant [Remote] pushTargets :: Assistant [Remote]
pushTargets = liftIO . filterM available =<< candidates <$> getDaemonStatus pushTargets = liftIO . filterM (Remote.checkAvailable True)
=<< candidates <$> getDaemonStatus
where where
candidates = filter (not . Remote.readonly) . syncGitRemotes candidates = filter (not . Remote.readonly) . syncGitRemotes
available = maybe (return True) doesDirectoryExist . Remote.localpath

View file

@ -0,0 +1,46 @@
{- git-annex assistant remote checker thread
-
- Copyright 2013 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Assistant.Threads.RemoteChecker (
remoteCheckerThread
) where
import Assistant.Common
import Utility.ThreadScheduler
import Assistant.Types.UrlRenderer
import Assistant.Alert
import Remote
import qualified Types.Remote as Remote
import qualified Git.Fsck
import Assistant.Repair
import qualified Git
import Assistant.RemoteProblem
import Assistant.Sync
import Data.Function
{- Waits for problems with remotes, and tries to fsck the remote and repair
- the problem. -}
remoteCheckerThread :: UrlRenderer -> NamedThread
remoteCheckerThread urlrenderer = namedThread "RemoteChecker" $ forever $ do
mapM_ (handleProblem urlrenderer)
=<< liftIO . filterM (checkAvailable True)
=<< nubremotes <$> getRemoteProblems
liftIO $ threadDelaySeconds (Seconds 60)
where
nubremotes = nubBy ((==) `on` Remote.uuid)
handleProblem :: UrlRenderer -> Remote -> Assistant ()
handleProblem urlrenderer rmt
| Git.repoIsLocal r && not (Git.repoIsLocalUnknown r) = do
fsckresults <- showFscking urlrenderer (Just $ Remote.name rmt) $ tryNonAsync $
Git.Fsck.findBroken True r
whenM (repairWhenNecessary urlrenderer (Remote.uuid rmt) (Just rmt) fsckresults) $
syncRemote rmt
| otherwise = noop
where
r = Remote.repo rmt

View file

@ -0,0 +1,18 @@
{- git-annex assistant remote problem detection
-
- Copyright 2013 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Assistant.Types.RemoteProblem where
import Types
import Utility.TList
import Control.Concurrent.STM
type RemoteProblemChan = TList Remote
newRemoteProblemChan :: IO RemoteProblemChan
newRemoteProblemChan = atomically newTList

View file

@ -39,7 +39,8 @@ module Remote (
showTriedRemotes, showTriedRemotes,
showLocations, showLocations,
forceTrust, forceTrust,
logStatus logStatus,
checkAvailable
) where ) where
import qualified Data.Map as M import qualified Data.Map as M
@ -274,3 +275,7 @@ byCost = map snd . sortBy (comparing fst) . M.toList . costmap
where where
costmap = M.fromListWith (++) . map costpair costmap = M.fromListWith (++) . map costpair
costpair r = (cost r, [r]) costpair r = (cost r, [r])
checkAvailable :: Bool -> Remote -> IO Bool
checkAvailable assumenetworkavailable =
maybe (return assumenetworkavailable) doesDirectoryExist . localpath