Merge branch 'master' into assistant
This commit is contained in:
commit
b0894f00c0
11 changed files with 284 additions and 55 deletions
|
@ -1,6 +1,6 @@
|
|||
{- git-annex SHA backend
|
||||
-
|
||||
- Copyright 2011 Joey Hess <joey@kitenet.net>
|
||||
- Copyright 2011,2012 Joey Hess <joey@kitenet.net>
|
||||
-
|
||||
- Licensed under the GNU GPL version 3 or higher.
|
||||
-}
|
||||
|
@ -12,7 +12,10 @@ import qualified Annex
|
|||
import Types.Backend
|
||||
import Types.Key
|
||||
import Types.KeySource
|
||||
|
||||
import qualified Build.SysConfig as SysConfig
|
||||
import Data.Digest.Pure.SHA
|
||||
import qualified Data.ByteString.Lazy as L
|
||||
|
||||
type SHASize = Int
|
||||
|
||||
|
@ -25,32 +28,19 @@ backends :: [Backend]
|
|||
backends = catMaybes $ map genBackend sizes ++ map genBackendE sizes
|
||||
|
||||
genBackend :: SHASize -> Maybe Backend
|
||||
genBackend size
|
||||
| isNothing (shaCommand size) = Nothing
|
||||
| otherwise = Just b
|
||||
where
|
||||
b = Backend
|
||||
{ name = shaName size
|
||||
, getKey = keyValue size
|
||||
, fsckKey = Just $ checkKeyChecksum size
|
||||
}
|
||||
genBackend size = Just $ Backend
|
||||
{ name = shaName size
|
||||
, getKey = keyValue size
|
||||
, fsckKey = Just $ checkKeyChecksum size
|
||||
}
|
||||
|
||||
genBackendE :: SHASize -> Maybe Backend
|
||||
genBackendE size =
|
||||
case genBackend size of
|
||||
Nothing -> Nothing
|
||||
Just b -> Just $ b
|
||||
{ name = shaNameE size
|
||||
, getKey = keyValueE size
|
||||
}
|
||||
|
||||
shaCommand :: SHASize -> Maybe String
|
||||
shaCommand 1 = SysConfig.sha1
|
||||
shaCommand 256 = Just SysConfig.sha256
|
||||
shaCommand 224 = SysConfig.sha224
|
||||
shaCommand 384 = SysConfig.sha384
|
||||
shaCommand 512 = SysConfig.sha512
|
||||
shaCommand _ = Nothing
|
||||
genBackendE size = do
|
||||
b <- genBackend size
|
||||
return $ b
|
||||
{ name = shaNameE size
|
||||
, getKey = keyValueE size
|
||||
}
|
||||
|
||||
shaName :: SHASize -> String
|
||||
shaName size = "SHA" ++ show size
|
||||
|
@ -58,27 +48,48 @@ shaName size = "SHA" ++ show size
|
|||
shaNameE :: SHASize -> String
|
||||
shaNameE size = shaName size ++ "E"
|
||||
|
||||
shaN :: SHASize -> FilePath -> Annex String
|
||||
shaN size file = do
|
||||
shaN :: SHASize -> FilePath -> Integer -> Annex String
|
||||
shaN shasize file filesize = do
|
||||
showAction "checksum"
|
||||
liftIO $ pOpen ReadFromPipe command (toCommand [File file]) $ \h -> do
|
||||
sha <- fst . separate (== ' ') <$> hGetLine h
|
||||
if null sha
|
||||
then error $ command ++ " parse error"
|
||||
else return sha
|
||||
case shaCommand shasize filesize of
|
||||
Left sha -> liftIO $ sha <$> L.readFile file
|
||||
Right command -> liftIO $ runcommand command
|
||||
where
|
||||
command = fromJust $ shaCommand size
|
||||
runcommand command =
|
||||
pOpen ReadFromPipe command (toCommand [File file]) $ \h -> do
|
||||
sha <- fst . separate (== ' ') <$> hGetLine h
|
||||
if null sha
|
||||
then error $ command ++ " parse error"
|
||||
else return sha
|
||||
|
||||
shaCommand :: SHASize -> Integer -> Either (L.ByteString -> String) String
|
||||
shaCommand shasize filesize
|
||||
| shasize == 1 = use SysConfig.sha1 sha1
|
||||
| shasize == 256 = use SysConfig.sha256 sha256
|
||||
| shasize == 224 = use SysConfig.sha224 sha224
|
||||
| shasize == 384 = use SysConfig.sha384 sha384
|
||||
| shasize == 512 = use SysConfig.sha512 sha512
|
||||
| otherwise = error $ "bad sha size " ++ show shasize
|
||||
where
|
||||
use Nothing sha = Left $ showDigest . sha
|
||||
use (Just c) sha
|
||||
-- use builtin, but slower sha for small files
|
||||
-- benchmarking indicates it's faster up to
|
||||
-- and slightly beyond 50 kb files
|
||||
| filesize < 51200 = use Nothing sha
|
||||
| otherwise = Right c
|
||||
|
||||
{- A key is a checksum of its contents. -}
|
||||
keyValue :: SHASize -> KeySource -> Annex (Maybe Key)
|
||||
keyValue size source = do
|
||||
keyValue shasize source = do
|
||||
let file = contentLocation source
|
||||
s <- shaN size file
|
||||
stat <- liftIO $ getFileStatus file
|
||||
let filesize = fromIntegral $ fileSize stat
|
||||
s <- shaN shasize file filesize
|
||||
return $ Just $ stubKey
|
||||
{ keyName = s
|
||||
, keyBackendName = shaName size
|
||||
, keySize = Just $ fromIntegral $ fileSize stat
|
||||
, keyBackendName = shaName shasize
|
||||
, keySize = Just filesize
|
||||
}
|
||||
|
||||
{- Extension preserving keys. -}
|
||||
|
@ -101,10 +112,12 @@ keyValueE size source = keyValue size source >>= maybe (return Nothing) addE
|
|||
checkKeyChecksum :: SHASize -> Key -> FilePath -> Annex Bool
|
||||
checkKeyChecksum size key file = do
|
||||
fast <- Annex.getState Annex.fast
|
||||
present <- liftIO $ doesFileExist file
|
||||
if not present || fast
|
||||
then return True
|
||||
else check <$> shaN size file
|
||||
mstat <- liftIO $ catchMaybeIO $ getFileStatus file
|
||||
case (mstat, fast) of
|
||||
(Just stat, False) -> do
|
||||
let filesize = fromIntegral $ fileSize stat
|
||||
check <$> shaN size file filesize
|
||||
_ -> return True
|
||||
where
|
||||
check s
|
||||
| s == dropExtension (keyName key) = True
|
||||
|
|
|
@ -28,15 +28,14 @@ tests =
|
|||
, TestCase "gpg" $ testCmd "gpg" "gpg --version >/dev/null"
|
||||
, TestCase "lsof" $ testCmd "lsof" "lsof -v >/dev/null 2>&1"
|
||||
, TestCase "ssh connection caching" getSshConnectionCaching
|
||||
] ++ shaTestCases False [1, 512, 224, 384] ++ shaTestCases True [256]
|
||||
] ++ shaTestCases [1, 256, 512, 224, 384]
|
||||
|
||||
shaTestCases :: Bool -> [Int] -> [TestCase]
|
||||
shaTestCases required l = map make l
|
||||
shaTestCases :: [Int] -> [TestCase]
|
||||
shaTestCases l = map make l
|
||||
where
|
||||
make n = TestCase key $ selector key (shacmds n) "</dev/null"
|
||||
make n = TestCase key $ maybeSelectCmd key (shacmds n) "</dev/null"
|
||||
where
|
||||
key = "sha" ++ show n
|
||||
selector = if required then selectCmd else maybeSelectCmd
|
||||
shacmds n = concatMap (\x -> [x, osxpath </> x]) $
|
||||
map (\x -> "sha" ++ show n ++ x) ["", "sum"]
|
||||
-- Max OSX puts GNU tools outside PATH, so look in
|
||||
|
|
5
debian/changelog
vendored
5
debian/changelog
vendored
|
@ -4,6 +4,11 @@ git-annex (3.20120630) UNRELEASED; urgency=low
|
|||
transfer is already in progress by another process.
|
||||
* status: Lists transfers that are currently in progress.
|
||||
* Fix passing --uuid to git-annex-shell.
|
||||
* When shaNsum commands cannot be found, use the Haskell SHA library
|
||||
(already a dependency) to do the checksumming. This may be slower,
|
||||
but avoids portability problems.
|
||||
* Use SHA library for files less than 50 kb in size, at which point it's
|
||||
faster than forking the more optimised external program.
|
||||
|
||||
-- Joey Hess <joeyh@debian.org> Sun, 01 Jul 2012 15:04:37 -0400
|
||||
|
||||
|
|
|
@ -0,0 +1,19 @@
|
|||
[[!comment format=mdwn
|
||||
username="http://joeyh.name/"
|
||||
subject="comment 3"
|
||||
date="2012-07-04T12:32:44Z"
|
||||
content="""
|
||||
Jimmy, sounds like I could use something like this to get the current limit:
|
||||
|
||||
sysctl kern.maxfilesperproc
|
||||
|
||||
Probably prints \"sysctl kern.maxfilesperproc = 256\" or such.. can you verify?
|
||||
Once I have the limit, I can make the kqueue code use subset of it, and print out a message when it needs to be increased, like the inotify code does.
|
||||
|
||||
(Also, the kqueue code only opens directories, not files, so unless you have 400000 directories, that's
|
||||
a little high.)
|
||||
|
||||
---
|
||||
|
||||
On file removal not propigating, does this still happen? When you remove a file does a git commit automatically happen, or is that broken with kqueue?
|
||||
"""]]
|
14
doc/bugs/git_annex_du.mdwn
Normal file
14
doc/bugs/git_annex_du.mdwn
Normal file
|
@ -0,0 +1,14 @@
|
|||
We need a way to calculate space taken by certain files.
|
||||
|
||||
Use cases: I want to drop some files from my small disk. I need to figure out things that take most space, and drop them.
|
||||
|
||||
Usage examples:
|
||||
|
||||
git annex du -hs *.mp3
|
||||
git annex du -sBm --in=here *.ogg
|
||||
|
||||
Would be nice if it was compatible with standard unix `df`.
|
||||
|
||||
> `du -L` works.
|
||||
>
|
||||
> See also: [[forum/Wishlist:_getting_the_disk_used_by_a_subtree_of_files]]
|
|
@ -0,0 +1,34 @@
|
|||
Well, sometimes you just have to go for the hack. Trying to find a way
|
||||
to add additional options to git-annex-shell without breaking backwards
|
||||
compatability, I noticed that it ignores all options after `--`, because
|
||||
those tend to be random rsync options due to the way rsync runs it.
|
||||
|
||||
So, I've added a new class of options, that come in between, like
|
||||
`-- opt=val opt=val ... --`
|
||||
|
||||
The parser for these will not choke on unknown options, unlike normal
|
||||
getopt. So this let me add the additional info I needed to
|
||||
pass to git-annex-shell to make it record transfer information. And
|
||||
if I need to pass more info in the future, that's covered too.
|
||||
|
||||
It's ugly, but since only git-annex runs git-annex-shell, this is an
|
||||
ugliness only I (and now you, dear reader) have to put up with.
|
||||
|
||||
Note to self: Command-line programs are sometimes an API, particularly
|
||||
if designed to be called remotely, and so it makes sense consider
|
||||
whether they are, and design expandability into them from day 1.
|
||||
|
||||
---
|
||||
|
||||
Anyway, we now have full transfer tracking in git-annex! Both sides of
|
||||
a transfer know what's being transferred, and from where, and have
|
||||
the info necessary to interrupt the transfer.
|
||||
|
||||
---
|
||||
|
||||
Also did some basic groundwork, adding a queue of transfers to perform,
|
||||
and adding to the daemon's status information a map of currently running
|
||||
transfers.
|
||||
|
||||
Next up: The daemon will use inotify to notice new and deleted transfer
|
||||
info files, and update its status info.
|
25
doc/design/assistant/blog/day_23__transfer_watching.mdwn
Normal file
25
doc/design/assistant/blog/day_23__transfer_watching.mdwn
Normal file
|
@ -0,0 +1,25 @@
|
|||
Starting to travel, so limited time today.
|
||||
|
||||
Yet Another Thread added to the assistant, all it does is watch for changes
|
||||
to transfer information files, and update the assistant's map of transfers
|
||||
currently in progress. Now the assistant will know if some other repository
|
||||
has connected to the local repo and is sending or receiving a file's
|
||||
content.
|
||||
|
||||
This seemed really simple to write, it's just 78 lines of code. It worked
|
||||
100% correctly the first time. :) But it's only so easy because I've got
|
||||
this shiny new inotify hammer that I keep finding places to use in the
|
||||
assistant.
|
||||
|
||||
Also, the new thread does some things that caused a similar thread (the
|
||||
merger thread) to go into a MVar deadlock. Luckily, I spent much of
|
||||
[day 19](day_19__random_improvements) investigating and fixing that
|
||||
deadlock, even though it was not a problem at the time.
|
||||
|
||||
So, good.. I'm doing things right and getting to a place where rather
|
||||
nontrivial features can be added easily.
|
||||
|
||||
--
|
||||
|
||||
Next up: Enough nonsense with tracking tranfers... Time to start actually
|
||||
transferring content around!
|
99
doc/design/assistant/blog/day_24__airport_digressions.mdwn
Normal file
99
doc/design/assistant/blog/day_24__airport_digressions.mdwn
Normal file
|
@ -0,0 +1,99 @@
|
|||
In a series of airport layovers all day. Since I woke up at 3:45 am,
|
||||
didn't feel up to doing serious new work, so instead I worked through some
|
||||
OSX support backlog.
|
||||
|
||||
git-annex will now use Haskell's SHA library if the `sha256sum`
|
||||
command is not available. That library is slow, but it's guaranteed to be
|
||||
available; git-annex already depended on it to calculate HMACs.
|
||||
|
||||
Then I decided to see if it makes sense to use the SHA library
|
||||
when adding smaller files. At some point, its slower implementation should
|
||||
win over needing to fork and parse the output of `sha256sum`. This was
|
||||
the first time I tried out Haskell's
|
||||
[Criterion](http://hackage.haskell.org/package/criterion) benchmarker,
|
||||
and I built this simple benchmark in short order.
|
||||
|
||||
[[!format haskell """
|
||||
import Data.Digest.Pure.SHA
|
||||
import Data.ByteString.Lazy as L
|
||||
import Criterion.Main
|
||||
import Common
|
||||
|
||||
testfile :: FilePath
|
||||
testfile = "/tmp/bar" -- on ram disk
|
||||
|
||||
main = defaultMain
|
||||
[ bgroup "sha256"
|
||||
[ bench "internal" $ whnfIO internal
|
||||
, bench "external" $ whnfIO external
|
||||
]
|
||||
]
|
||||
|
||||
internal :: IO String
|
||||
internal = showDigest . sha256 <$> L.readFile testfile
|
||||
|
||||
external :: IO String
|
||||
external = pOpen ReadFromPipe "sha256sum" [testfile] $ \h ->
|
||||
fst . separate (== ' ') <$> hGetLine h
|
||||
"""]]
|
||||
|
||||
The nice thing about benchmarking in Airports is when you're running a
|
||||
benchmark locally, you don't want to do anything else with the computer,
|
||||
so can alternate people watching, spacing out, and analizing results.
|
||||
|
||||
100 kb file:
|
||||
|
||||
benchmarking sha256/internal
|
||||
mean: 15.64729 ms, lb 15.29590 ms, ub 16.10119 ms, ci 0.950
|
||||
std dev: 2.032476 ms, lb 1.638016 ms, ub 2.527089 ms, ci 0.950
|
||||
|
||||
benchmarking sha256/external
|
||||
mean: 8.217700 ms, lb 7.931324 ms, ub 8.568805 ms, ci 0.950
|
||||
std dev: 1.614786 ms, lb 1.357791 ms, ub 2.009682 ms, ci 0.950
|
||||
|
||||
75 kb file:
|
||||
|
||||
benchmarking sha256/internal
|
||||
mean: 12.16099 ms, lb 11.89566 ms, ub 12.50317 ms, ci 0.950
|
||||
std dev: 1.531108 ms, lb 1.232353 ms, ub 1.929141 ms, ci 0.950
|
||||
|
||||
benchmarking sha256/external
|
||||
mean: 8.818731 ms, lb 8.425744 ms, ub 9.269550 ms, ci 0.950
|
||||
std dev: 2.158530 ms, lb 1.916067 ms, ub 2.487242 ms, ci 0.950
|
||||
|
||||
50 kb file:
|
||||
|
||||
benchmarking sha256/internal
|
||||
mean: 7.699274 ms, lb 7.560254 ms, ub 7.876605 ms, ci 0.950
|
||||
std dev: 801.5292 us, lb 655.3344 us, ub 990.4117 us, ci 0.950
|
||||
|
||||
benchmarking sha256/external
|
||||
mean: 8.715779 ms, lb 8.330540 ms, ub 9.102232 ms, ci 0.950
|
||||
std dev: 1.988089 ms, lb 1.821582 ms, ub 2.181676 ms, ci 0.950
|
||||
|
||||
10 kb file:
|
||||
|
||||
benchmarking sha256/internal
|
||||
mean: 1.586105 ms, lb 1.574512 ms, ub 1.604922 ms, ci 0.950
|
||||
std dev: 74.07235 us, lb 51.71688 us, ub 108.1348 us, ci 0.950
|
||||
|
||||
benchmarking sha256/external
|
||||
mean: 6.873742 ms, lb 6.582765 ms, ub 7.252911 ms, ci 0.950
|
||||
std dev: 1.689662 ms, lb 1.346310 ms, ub 2.640399 ms, ci 0.950
|
||||
|
||||
It's possible to get nice graphical reports out of Criterion, but
|
||||
this is clear enough, so I stopped here. 50 kb seems a reasonable
|
||||
cutoff point.
|
||||
|
||||
I also used this to benchmark the SHA256 in Haskell's Crypto package.
|
||||
Surprisingly, it's a *lot* slower than even the Pure.SHA code.
|
||||
On a 50 kb file:
|
||||
|
||||
benchmarking sha256/Crypto
|
||||
collecting 100 samples, 1 iterations each, in estimated 6.073809 s
|
||||
mean: 69.89037 ms, lb 69.15831 ms, ub 70.71845 ms, ci 0.950
|
||||
std dev: 3.995397 ms, lb 3.435775 ms, ub 4.721952 ms, ci 0.950
|
||||
|
||||
|
||||
There's another Haskell library, [SHA2](http://hackage.haskell.org/package/SHA2),
|
||||
which I should try some time.
|
|
@ -10,14 +10,15 @@ all the other git clones, at both the git level and the key/value level.
|
|||
* transfer info for git-annex-shell **done**
|
||||
* update files as transfers proceed. See [[progressbars]]
|
||||
(updating for downloads is easy; for uploads is hard)
|
||||
* add Transfer queue TChan
|
||||
* add Transfer queue TChan **done**
|
||||
* add TransferInfo Map to DaemonStatus for tracking transfers in progress.
|
||||
**done**
|
||||
* Poll transfer in progress info files for changes (use inotify again!
|
||||
wow! hammer, meet nail..), and update the TransferInfo Map **done**
|
||||
* enqueue Transfers (Uploads) as new files are added to the annex by
|
||||
Watcher.
|
||||
* enqueue Tranferrs (Downloads) as new dangling symlinks are noticed by
|
||||
Watcher.
|
||||
* add TransferInfo Map to DaemonStatus for tracking transfers in progress.
|
||||
* Poll transfer in progress info files for changes (use inotify again!
|
||||
wow! hammer, meet nail..), and update the TransferInfo Map
|
||||
* Write basic Transfer handling thread. Multiple such threads need to be
|
||||
able to be run at once. Each will need its own independant copy of the
|
||||
Annex state monad.
|
||||
|
@ -52,6 +53,9 @@ all the other git clones, at both the git level and the key/value level.
|
|||
signaling a change out of band.
|
||||
4. Add a hook, so when there's a change to sync, a program can be run
|
||||
and do its own signaling.
|
||||
5. --debug will show often unnecessary work being done. Optimise.
|
||||
6. It would be nice if, when a USB drive is connected,
|
||||
syncing starts automatically. Use dbus on Linux?
|
||||
|
||||
## misc todo
|
||||
|
||||
|
@ -90,12 +94,11 @@ anyway.
|
|||
* Transfer threads started/stopped as necessary to move data.
|
||||
(May sometimes want multiple threads downloading, or uploading, or even both.)
|
||||
|
||||
type TransferQueue = TChan [Transfer]
|
||||
-- add (M.Map Transfer TransferInfo) to DaemonStatus
|
||||
startTransfer :: TransferQueue -> Transfer -> Annex ()
|
||||
startTransfer q transfer = error "TODO"
|
||||
|
||||
startTransfer :: Transfer -> Annex TransferID
|
||||
|
||||
stopTransfer :: TransferID -> IO ()
|
||||
stopTransfer :: TransferQueue -> TransferID -> Annex ()
|
||||
stopTransfer q transfer = error "TODO"
|
||||
|
||||
The assistant needs to find out when `git-annex-shell` is receiving or
|
||||
sending (triggered by another remote), so it can add data for those too.
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
[[!comment format=mdwn
|
||||
username="http://joeyh.name/"
|
||||
subject="comment 14"
|
||||
date="2012-07-04T12:43:54Z"
|
||||
content="""
|
||||
@Damien, hmm, it should not be using any cp options, unless when it was built there was a cp in the path that supported some option like -p. Can you check with --debug what cp parameters it's trying to use?
|
||||
|
||||
|
||||
"""]]
|
|
@ -0,0 +1,9 @@
|
|||
[[!comment format=mdwn
|
||||
username="http://joeyh.name/"
|
||||
subject="comment 15"
|
||||
date="2012-07-04T13:14:00Z"
|
||||
content="""
|
||||
git-annex will now fall back to slower pure Haskell hashing code if `sha256sum`, etc programs are not in PATH. I'd still recommend installing the coreutils, as they're probably faster.
|
||||
|
||||
(The `shasum` command seems to come from a perl library, so I have not tried to make git-annex use that one.)
|
||||
"""]]
|
Loading…
Reference in a new issue