{- git-annex assistant repo syncing
 -
 - Copyright 2012 Joey Hess <joey@kitenet.net>
 -
 - Licensed under the GNU GPL version 3 or higher.
 -}
|
|
|
|
|
|
|
|
module Assistant.Sync where
|
|
|
|
|
|
|
|
import Assistant.Common
|
|
|
|
import Assistant.Pushes
|
|
|
|
import Assistant.Alert
|
|
|
|
import Assistant.ThreadedMonad
|
|
|
|
import Assistant.DaemonStatus
|
|
|
|
import Assistant.ScanRemotes
|
|
|
|
import qualified Command.Sync
|
|
|
|
import Utility.Parallel
|
|
|
|
import qualified Git
|
|
|
|
import qualified Git.Branch
|
|
|
|
import qualified Git.Command
|
|
|
|
import qualified Remote
|
2012-09-04 19:54:30 +00:00
|
|
|
import qualified Types.Remote as Remote
|
2012-08-22 18:32:17 +00:00
|
|
|
import qualified Annex.Branch
|
|
|
|
|
|
|
|
import Data.Time.Clock
|
|
|
|
import qualified Data.Map as M
|
2012-09-11 01:55:59 +00:00
|
|
|
import Control.Concurrent
|
2012-08-22 18:32:17 +00:00
|
|
|
|
|
|
|
{- Syncs with remotes that may have been disconnected for a while.
 -
 - First gets git in sync, and then prepares any necessary file transfers.
 -
 - An expensive full scan is queued when the git-annex branches of some of
 - the remotes have diverged from the local git-annex branch. Otherwise,
 - it's sufficient to requeue failed transfers.
 -}
reconnectRemotes :: ThreadName -> ThreadState -> DaemonStatusHandle -> ScanRemoteMap -> [Remote] -> IO ()
reconnectRemotes _ _ _ _ [] = noop
reconnectRemotes threadname st dstatus scanremotes rs = void $
	alertWhile dstatus (syncAlert rs) $ do
		(ok, diverged) <- sync
			=<< runThreadState st (inRepo Git.Branch.current)
		-- Whether or not the push succeeded, queue transfer scans;
		-- diverged tells addScanRemotes if a full scan is needed.
		addScanRemotes scanremotes diverged rs
		return ok
  where
	-- Only git repos can be pulled from and pushed to; special
	-- remotes are not synced here.
	(gitremotes, _specialremotes) =
		partition (Git.repoIsUrl . Remote.repo) rs
	sync (Just branch) = do
		diverged <- manualPull st (Just branch) gitremotes
		now <- getCurrentTime
		ok <- pushToRemotes threadname now st Nothing gitremotes
		return (ok, diverged)
	{- No local branch exists yet, but we can try pulling. -}
	sync Nothing = do
		diverged <- manualPull st Nothing gitremotes
		return (True, diverged)
|
2012-08-22 18:32:17 +00:00
|
|
|
|
|
|
|
{- Updates the local sync branch, then pushes it to all remotes, in
 - parallel.
 -
 - Avoids running possibly long-duration commands in the Annex monad, so
 - as not to block other threads.
 -
 - After a failed push, a manual pull is done once, and the push is
 - retried; mpushmap, when provided, records remotes whose push still
 - fails, along with the time of the failure. -}
pushToRemotes :: ThreadName -> UTCTime -> ThreadState -> (Maybe FailedPushMap) -> [Remote] -> IO Bool
pushToRemotes threadname now st mpushmap remotes = do
	(g, branch) <- runThreadState st $
		(,) <$> fromRepo id <*> inRepo Git.Branch.current
	go True branch g remotes
  where
	go _ Nothing _ _ = return True -- no branch, so nothing to do
	go shouldretry (Just branch) g rs = do
		debug threadname
			[ "pushing to"
			, show rs
			]
		Command.Sync.updateBranch (Command.Sync.syncBranch branch) g
		(succeeded, failed) <- inParallel (push g branch) rs
		let ok = null failed
		-- Record which pushes failed (and clear ones that now
		-- succeeded), so they can be retried later.
		case mpushmap of
			Nothing -> noop
			Just pushmap ->
				changeFailedPushMap pushmap $ \m ->
					M.union (makemap failed) $
						M.difference m (makemap succeeded)
		unless ok $
			debug threadname
				[ "failed to push to"
				, show failed
				]
		if ok || not shouldretry
			then return ok
			else retry branch g failed

	-- Maps each remote to the time of this push attempt.
	makemap l = M.fromList $ zip l (repeat now)

	push g branch remote = Command.Sync.pushBranch remote branch g

	-- A push may fail because the remote has commits we don't;
	-- pull from the failed remotes and try once more (shouldretry
	-- is False on the retry, so this cannot loop).
	retry branch g rs = do
		debug threadname [ "trying manual pull to resolve failed pushes" ]
		void $ manualPull st (Just branch) rs
		go False (Just branch) g rs
|
|
|
|
|
|
|
|
{- Manually pull from remotes and merge their branches.
 -
 - Returns True if the git-annex branch was found to have diverged,
 - piggybacking on the branch update's existing divergence check.
 -
 - Fetch and merge failures from individual remotes are deliberately
 - ignored; syncing is best-effort across whatever remotes are
 - reachable. -}
manualPull :: ThreadState -> (Maybe Git.Ref) -> [Remote] -> IO Bool
manualPull st currentbranch remotes = do
	g <- runThreadState st $ fromRepo id
	forM_ remotes $ \r ->
		Git.Command.runBool "fetch" [Param $ Remote.name r] g
	haddiverged <- runThreadState st $ Annex.Branch.forceUpdate
	forM_ remotes $ \r ->
		runThreadState st $ Command.Sync.mergeRemote r currentbranch
	return haddiverged
|
2012-09-11 01:55:59 +00:00
|
|
|
|
|
|
|
{- Start syncing a newly added remote, using a background thread. -}
syncNewRemote :: ThreadState -> DaemonStatusHandle -> ScanRemoteMap -> Remote -> IO ()
syncNewRemote st dstatus scanremotes remote = do
	runThreadState st $ updateKnownRemotes dstatus
	-- Fork so the (possibly slow) initial sync does not block the
	-- caller.
	void $ forkIO $ reconnectRemotes "SyncRemote" st dstatus scanremotes [remote]