resume interrupted chunked downloads

Leverage the new chunked remotes to automatically resume downloads.
Sort of like rsync, although of course not as efficient, since this
needs to start at a chunk boundary.

But, unlike rsync, this method will work for S3, WebDAV, external
special remotes, etc. Only directory special remotes support it so far,
but many more will soon!

This implementation will also properly handle starting a download
from one remote, interrupting, and resuming from another one, and so on.

(Resuming interrupted chunked uploads is similarly doable, although
slightly more expensive.)
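
As a rough sketch of the chunk-boundary arithmetic (illustrative names and
types only, not the actual git-annex API):

    -- Given the size of a partially downloaded file and the chunk size
    -- used by the remote, work out how many whole chunks are already
    -- present and the byte offset at which the download can resume.
    resumePoint :: Integer -> Integer -> (Integer, Integer)
    resumePoint currsize chunksize = (done, done * chunksize)
      where
        done = currsize `div` chunksize

    -- e.g. resumePoint 2621440 1048576 == (2, 2097152): two 1 MiB chunks
    -- are complete, so fetching restarts with the third chunk, and the
    -- partial data of that chunk already in the file is simply overwritten.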

This commit was sponsored by Thomas Djärv.
Joey Hess 2014-07-27 18:52:42 -04:00
parent 13bbb61a51
commit 9d4a766cd7
6 changed files with 88 additions and 32 deletions

View file

@@ -1,6 +1,6 @@
{- git-annex chunked remotes
-
- Copyright 2012-2014 Joey Hess <joey@kitenet.net>
- Copyright 2014 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
@@ -27,6 +27,7 @@ import Annex.Exception
import qualified Data.ByteString.Lazy as L
import qualified Data.Map as M
import Control.Exception
data ChunkConfig
= NoChunks
@@ -147,12 +148,16 @@ removeChunks remover u chunkconfig encryptor k = do
-
- When the remote is chunked, tries each of the options returned by
- chunkKeys until it finds one where the retriever successfully
- gets the first key in the list. The content of that key, and any
- gets the first chunked key. The content of that key, and any
- other chunks in the list is fed to the sink.
-
- If retrieval of one of the subsequent chunks throws an exception,
- gives up and returns False. Note that partial data may have been
- written to the sink in this case.
-
- Resuming is supported when using chunks. When the destination file
- already exists, it skips to the next chunked key that would be needed
- to resume.
-}
retrieveChunks
:: (Key -> IO L.ByteString)
@@ -160,43 +165,88 @@ retrieveChunks
-> ChunkConfig
-> EncKey
-> Key
-> FilePath
-> MeterUpdate
-> (MeterUpdate -> L.ByteString -> IO ())
-> (Handle -> MeterUpdate -> L.ByteString -> IO ())
-> Annex Bool
retrieveChunks retriever u chunkconfig encryptor basek basep sink
retrieveChunks retriever u chunkconfig encryptor basek dest basep sink
| noChunks chunkconfig =
-- Optimisation: Try the unchunked key first, to avoid
-- looking in the git-annex branch for chunk counts.
liftIO (retriever (encryptor basek) >>= sink basep >> return True)
`catchNonAsyncAnnex`
const (go =<< chunkKeysOnly u basek)
-- looking in the git-annex branch for chunk counts
-- that are likely not there.
getunchunked `catchNonAsyncAnnex`
const (go =<< chunkKeysOnly u basek)
| otherwise = go =<< chunkKeys u chunkconfig basek
where
go ls = liftIO $ firstavail ls `catchNonAsync` giveup
go ls = liftIO $ do
currsize <- catchMaybeIO $
toInteger . fileSize <$> getFileStatus dest
let ls' = maybe ls (setupResume ls) currsize
firstavail currsize ls' `catchNonAsync` giveup
giveup e = do
warningIO (show e)
return False
firstavail [] = return False
firstavail ([]:ls) = firstavail ls
firstavail ((k:ks):ls) = do
firstavail _ [] = return False
firstavail currsize ([]:ls) = firstavail currsize ls
firstavail currsize ((k:ks):ls) = do
v <- tryNonAsync $ retriever (encryptor k)
case v of
Left e
| null ls -> giveup e
| otherwise -> firstavail ls
| otherwise -> firstavail currsize ls
Right b -> do
sink basep b
let sz = toBytesProcessed $
fromMaybe 0 $ keyChunkSize k
getrest sz sz ks
let offset = resumeOffset currsize k
let p = maybe basep
(offsetMeterUpdate basep . toBytesProcessed)
offset
bracket (maybe opennew openresume offset) hClose $ \h -> do
sink h p b
let sz = toBytesProcessed $
fromMaybe 0 $ keyChunkSize k
getrest p h sz sz ks
getrest _ _ [] = return True
getrest sz bytesprocessed (k:ks) = do
let p = offsetMeterUpdate basep bytesprocessed
sink p =<< retriever (encryptor k)
getrest sz (addBytesProcessed bytesprocessed sz) ks
getrest _ _ _ _ [] = return True
getrest p h sz bytesprocessed (k:ks) = do
let p' = offsetMeterUpdate p bytesprocessed
sink h p' =<< retriever (encryptor k)
getrest p h sz (addBytesProcessed bytesprocessed sz) ks
getunchunked = liftIO $ bracket opennew hClose $ \h -> do
retriever (encryptor basek) >>= sink h basep
return True
opennew = openBinaryFile dest WriteMode
-- Open the file and seek to the start point in order to resume.
openresume startpoint = do
-- ReadWriteMode allows seeking; AppendMode does not.
h <- openBinaryFile dest ReadWriteMode
hSeek h AbsoluteSeek startpoint
return h
{- Can resume when the chunk's offset is at or before the end of
- the dest file. -}
resumeOffset :: Maybe Integer -> Key -> Maybe Integer
resumeOffset Nothing _ = Nothing
resumeOffset currsize k
| offset <= currsize = offset
| otherwise = Nothing
where
offset = chunkKeyOffset k
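-- For example (hypothetical numbers): if dest currently holds 2097152 bytes,
-- a chunk whose offset is exactly 2097152 can be resumed from, while a chunk
-- whose offset is 3145728 cannot, since bytes before it would be missing.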
{- Drops chunks that are already present in a file, based on its size.
- Keeps any non-chunk keys.
-}
setupResume :: [[Key]] -> Integer -> [[Key]]
setupResume ls currsize = map dropunneeded ls
where
dropunneeded [] = []
dropunneeded l@(k:_) = case keyChunkSize k of
Just chunksize | chunksize > 0 ->
genericDrop (currsize `div` chunksize) l
_ -> l
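-- For example (hypothetical numbers): with 1 MiB chunks and a dest file of
-- 2621440 bytes, currsize `div` chunksize is 2, so the first two chunk keys
-- are dropped and retrieval resumes with the third chunk.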
{- Checks if a key is present in a remote. This requires any one
- of the lists of options returned by chunkKeys to all check out
@@ -212,7 +262,8 @@ hasKeyChunks
hasKeyChunks checker u chunkconfig encryptor basek
| noChunks chunkconfig =
-- Optimisation: Try the unchunked key first, to avoid
-- looking in the git-annex branch for chunk counts.
-- looking in the git-annex branch for chunk counts
-- that are likely not there.
ifM ((Right True ==) <$> checker (encryptor basek))
( return (Right True)
, checklists impossible =<< chunkKeysOnly u basek

View file

@@ -103,9 +103,7 @@ chunkedEncryptableRemote c preparestorer prepareretriever r = encr
safely $ prepareretriever k $ safely . go
where
go (Just retriever) = metered (Just p) k $ \p' ->
bracketIO (openBinaryFile dest WriteMode) hClose $ \h ->
retrieveChunks retriever (uuid r) chunkconfig enck k p' $
sink h
retrieveChunks retriever (uuid r) chunkconfig enck k dest p' sink
go Nothing = return False
sink h p' b = do
let write = meteredWrite p' h

View file

@@ -13,8 +13,8 @@ module Types.Key (
stubKey,
key2file,
file2key,
isChunkKey,
nonChunkKey,
chunkKeyOffset,
prop_idempotent_key_encode,
prop_idempotent_key_decode
@@ -49,9 +49,6 @@ stubKey = Key
, keyChunkNum = Nothing
}
isChunkKey :: Key -> Bool
isChunkKey k = isJust (keyChunkSize k) && isJust (keyChunkNum k)
-- Gets the parent of a chunk key.
nonChunkKey :: Key -> Key
nonChunkKey k = k
@@ -59,6 +56,12 @@ nonChunkKey k = k
, keyChunkNum = Nothing
}
-- Where a chunk key is offset within its parent.
chunkKeyOffset :: Key -> Maybe Integer
chunkKeyOffset k = (*)
<$> keyChunkSize k
<*> (pred <$> keyChunkNum k)
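-- For example (hypothetical values): keyChunkSize = Just 1048576 and
-- keyChunkNum = Just 3 gives Just 2097152, since chunk numbers count from 1.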
fieldSep :: Char
fieldSep = '-'

View file

@@ -88,7 +88,7 @@ meteredWriteFile meterupdate f b = withBinaryFile f WriteMode $ \h ->
{- Applies an offset to a MeterUpdate. This can be useful when
- performing a sequence of actions, such as multiple meteredWriteFiles,
- that all update a common meter progressively.
- that all update a common meter progressively. Or when resuming.
-}
offsetMeterUpdate :: MeterUpdate -> BytesProcessed -> MeterUpdate
offsetMeterUpdate base offset = \n -> base (offset `addBytesProcessed` n)
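-- For example, offsetMeterUpdate meter (toBytesProcessed 2097152) yields a
-- MeterUpdate that reports each position as if 2 MiB had already been
-- transferred, which is what a resumed download wants to show.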

debian/changelog
View file

@@ -1,7 +1,9 @@
git-annex (5.20140718) UNRELEASED; urgency=medium
* New chunk= option to chunk files stored in directory remotes.
* The old chunksize= option is deprecated. Do not use for new remotes!
* Partially transferred files are automatically resumed when using
chunked remotes!
* The old chunksize= option is deprecated. Do not use for new remotes.
* Legacy code for directory remotes using the old chunksize= option
will keep them working, but more slowly than before.
* webapp: Automatically install Konqueror integration scripts

View file

@@ -4,6 +4,8 @@ chunks that are stored on the remote.
This can be useful to work around limitations on the size of files
on the remote.
Chunking also allows for resuming interrupted downloads and uploads.
Note that git-annex has to buffer chunks in memory before they are sent to
a remote. So, using a large chunk size will make it use more memory.
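
For example, a directory special remote with chunking enabled might be set up
along these lines (illustrative name, path, and chunk size):

    git annex initremote mybackup type=directory directory=/mnt/backup encryption=none chunk=1MiB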