Merge branch 'master' into assistant

This commit is contained in:
Joey Hess 2012-07-05 09:12:54 -06:00
commit b0894f00c0
11 changed files with 284 additions and 55 deletions

View file

@ -1,6 +1,6 @@
{- git-annex SHA backend
-
- Copyright 2011 Joey Hess <joey@kitenet.net>
- Copyright 2011,2012 Joey Hess <joey@kitenet.net>
-
- Licensed under the GNU GPL version 3 or higher.
-}
@ -12,7 +12,10 @@ import qualified Annex
import Types.Backend
import Types.Key
import Types.KeySource
import qualified Build.SysConfig as SysConfig
import Data.Digest.Pure.SHA
import qualified Data.ByteString.Lazy as L
type SHASize = Int
@ -25,32 +28,19 @@ backends :: [Backend]
backends = catMaybes $ map genBackend sizes ++ map genBackendE sizes
genBackend :: SHASize -> Maybe Backend
genBackend size
| isNothing (shaCommand size) = Nothing
| otherwise = Just b
where
b = Backend
{ name = shaName size
, getKey = keyValue size
, fsckKey = Just $ checkKeyChecksum size
}
genBackend size = Just $ Backend
{ name = shaName size
, getKey = keyValue size
, fsckKey = Just $ checkKeyChecksum size
}
genBackendE :: SHASize -> Maybe Backend
genBackendE size =
case genBackend size of
Nothing -> Nothing
Just b -> Just $ b
{ name = shaNameE size
, getKey = keyValueE size
}
shaCommand :: SHASize -> Maybe String
shaCommand 1 = SysConfig.sha1
shaCommand 256 = Just SysConfig.sha256
shaCommand 224 = SysConfig.sha224
shaCommand 384 = SysConfig.sha384
shaCommand 512 = SysConfig.sha512
shaCommand _ = Nothing
genBackendE size = do
b <- genBackend size
return $ b
{ name = shaNameE size
, getKey = keyValueE size
}
shaName :: SHASize -> String
shaName size = "SHA" ++ show size
@ -58,27 +48,48 @@ shaName size = "SHA" ++ show size
shaNameE :: SHASize -> String
shaNameE size = shaName size ++ "E"
shaN :: SHASize -> FilePath -> Annex String
shaN size file = do
shaN :: SHASize -> FilePath -> Integer -> Annex String
shaN shasize file filesize = do
showAction "checksum"
liftIO $ pOpen ReadFromPipe command (toCommand [File file]) $ \h -> do
sha <- fst . separate (== ' ') <$> hGetLine h
if null sha
then error $ command ++ " parse error"
else return sha
case shaCommand shasize filesize of
Left sha -> liftIO $ sha <$> L.readFile file
Right command -> liftIO $ runcommand command
where
command = fromJust $ shaCommand size
runcommand command =
pOpen ReadFromPipe command (toCommand [File file]) $ \h -> do
sha <- fst . separate (== ' ') <$> hGetLine h
if null sha
then error $ command ++ " parse error"
else return sha
shaCommand :: SHASize -> Integer -> Either (L.ByteString -> String) String
shaCommand shasize filesize
| shasize == 1 = use SysConfig.sha1 sha1
| shasize == 256 = use SysConfig.sha256 sha256
| shasize == 224 = use SysConfig.sha224 sha224
| shasize == 384 = use SysConfig.sha384 sha384
| shasize == 512 = use SysConfig.sha512 sha512
| otherwise = error $ "bad sha size " ++ show shasize
where
use Nothing sha = Left $ showDigest . sha
use (Just c) sha
-- use builtin, but slower sha for small files
-- benchmarking indicates it's faster up to
-- and slightly beyond 50 kb files
| filesize < 51200 = use Nothing sha
| otherwise = Right c
{- A key is a checksum of its contents. -}
keyValue :: SHASize -> KeySource -> Annex (Maybe Key)
keyValue size source = do
keyValue shasize source = do
let file = contentLocation source
s <- shaN size file
stat <- liftIO $ getFileStatus file
let filesize = fromIntegral $ fileSize stat
s <- shaN shasize file filesize
return $ Just $ stubKey
{ keyName = s
, keyBackendName = shaName size
, keySize = Just $ fromIntegral $ fileSize stat
, keyBackendName = shaName shasize
, keySize = Just filesize
}
{- Extension preserving keys. -}
@ -101,10 +112,12 @@ keyValueE size source = keyValue size source >>= maybe (return Nothing) addE
checkKeyChecksum :: SHASize -> Key -> FilePath -> Annex Bool
checkKeyChecksum size key file = do
fast <- Annex.getState Annex.fast
present <- liftIO $ doesFileExist file
if not present || fast
then return True
else check <$> shaN size file
mstat <- liftIO $ catchMaybeIO $ getFileStatus file
case (mstat, fast) of
(Just stat, False) -> do
let filesize = fromIntegral $ fileSize stat
check <$> shaN size file filesize
_ -> return True
where
check s
| s == dropExtension (keyName key) = True

View file

@ -28,15 +28,14 @@ tests =
, TestCase "gpg" $ testCmd "gpg" "gpg --version >/dev/null"
, TestCase "lsof" $ testCmd "lsof" "lsof -v >/dev/null 2>&1"
, TestCase "ssh connection caching" getSshConnectionCaching
] ++ shaTestCases False [1, 512, 224, 384] ++ shaTestCases True [256]
] ++ shaTestCases [1, 256, 512, 224, 384]
shaTestCases :: Bool -> [Int] -> [TestCase]
shaTestCases required l = map make l
shaTestCases :: [Int] -> [TestCase]
shaTestCases l = map make l
where
make n = TestCase key $ selector key (shacmds n) "</dev/null"
make n = TestCase key $ maybeSelectCmd key (shacmds n) "</dev/null"
where
key = "sha" ++ show n
selector = if required then selectCmd else maybeSelectCmd
shacmds n = concatMap (\x -> [x, osxpath </> x]) $
map (\x -> "sha" ++ show n ++ x) ["", "sum"]
-- Max OSX puts GNU tools outside PATH, so look in

5
debian/changelog vendored
View file

@ -4,6 +4,11 @@ git-annex (3.20120630) UNRELEASED; urgency=low
transfer is already in progress by another process.
* status: Lists transfers that are currently in progress.
* Fix passing --uuid to git-annex-shell.
* When shaNsum commands cannot be found, use the Haskell SHA library
(already a dependency) to do the checksumming. This may be slower,
but avoids portability problems.
* Use SHA library for files less than 50 kb in size, at which point it's
faster than forking the more optimised external program.
-- Joey Hess <joeyh@debian.org> Sun, 01 Jul 2012 15:04:37 -0400

View file

@ -0,0 +1,19 @@
[[!comment format=mdwn
username="http://joeyh.name/"
subject="comment 3"
date="2012-07-04T12:32:44Z"
content="""
Jimmy, sounds like I could use something like this to get the current limit:
sysctl kern.maxfilesperproc
Probably prints \"sysctl kern.maxfilesperproc = 256\" or such.. can you verify?
Once I have the limit, I can make the kqueue code use subset of it, and print out a message when it needs to be increased, like the inotify code does.
(Also, the kqueue code only opens directories, not files, so unless you have 400000 directories, that's
a little high.)
---
On file removal not propigating, does this still happen? When you remove a file does a git commit automatically happen, or is that broken with kqueue?
"""]]

View file

@ -0,0 +1,14 @@
We need a way to calculate space taken by certain files.
Use cases: I want to drop some files from my small disk. I need to figure out things that take most space, and drop them.
Usage examples:
git annex du -hs *.mp3
git annex du -sBm --in=here *.ogg
Would be nice if it was compatible with standard unix `df`.
> `du -L` works.
>
> See also: [[forum/Wishlist:_getting_the_disk_used_by_a_subtree_of_files]]

View file

@ -0,0 +1,34 @@
Well, sometimes you just have to go for the hack. Trying to find a way
to add additional options to git-annex-shell without breaking backwards
compatability, I noticed that it ignores all options after `--`, because
those tend to be random rsync options due to the way rsync runs it.
So, I've added a new class of options, that come in between, like
`-- opt=val opt=val ... --`
The parser for these will not choke on unknown options, unlike normal
getopt. So this let me add the additional info I needed to
pass to git-annex-shell to make it record transfer information. And
if I need to pass more info in the future, that's covered too.
It's ugly, but since only git-annex runs git-annex-shell, this is an
ugliness only I (and now you, dear reader) have to put up with.
Note to self: Command-line programs are sometimes an API, particularly
if designed to be called remotely, and so it makes sense consider
whether they are, and design expandability into them from day 1.
---
Anyway, we now have full transfer tracking in git-annex! Both sides of
a transfer know what's being transferred, and from where, and have
the info necessary to interrupt the transfer.
---
Also did some basic groundwork, adding a queue of transfers to perform,
and adding to the daemon's status information a map of currently running
transfers.
Next up: The daemon will use inotify to notice new and deleted transfer
info files, and update its status info.

View file

@ -0,0 +1,25 @@
Starting to travel, so limited time today.
Yet Another Thread added to the assistant, all it does is watch for changes
to transfer information files, and update the assistant's map of transfers
currently in progress. Now the assistant will know if some other repository
has connected to the local repo and is sending or receiving a file's
content.
This seemed really simple to write, it's just 78 lines of code. It worked
100% correctly the first time. :) But it's only so easy because I've got
this shiny new inotify hammer that I keep finding places to use in the
assistant.
Also, the new thread does some things that caused a similar thread (the
merger thread) to go into a MVar deadlock. Luckily, I spent much of
[day 19](day_19__random_improvements) investigating and fixing that
deadlock, even though it was not a problem at the time.
So, good.. I'm doing things right and getting to a place where rather
nontrivial features can be added easily.
--
Next up: Enough nonsense with tracking tranfers... Time to start actually
transferring content around!

View file

@ -0,0 +1,99 @@
In a series of airport layovers all day. Since I woke up at 3:45 am,
didn't feel up to doing serious new work, so instead I worked through some
OSX support backlog.
git-annex will now use Haskell's SHA library if the `sha256sum`
command is not available. That library is slow, but it's guaranteed to be
available; git-annex already depended on it to calculate HMACs.
Then I decided to see if it makes sense to use the SHA library
when adding smaller files. At some point, its slower implementation should
win over needing to fork and parse the output of `sha256sum`. This was
the first time I tried out Haskell's
[Criterion](http://hackage.haskell.org/package/criterion) benchmarker,
and I built this simple benchmark in short order.
[[!format haskell """
import Data.Digest.Pure.SHA
import Data.ByteString.Lazy as L
import Criterion.Main
import Common
testfile :: FilePath
testfile = "/tmp/bar" -- on ram disk
main = defaultMain
[ bgroup "sha256"
[ bench "internal" $ whnfIO internal
, bench "external" $ whnfIO external
]
]
internal :: IO String
internal = showDigest . sha256 <$> L.readFile testfile
external :: IO String
external = pOpen ReadFromPipe "sha256sum" [testfile] $ \h ->
fst . separate (== ' ') <$> hGetLine h
"""]]
The nice thing about benchmarking in Airports is when you're running a
benchmark locally, you don't want to do anything else with the computer,
so can alternate people watching, spacing out, and analizing results.
100 kb file:
benchmarking sha256/internal
mean: 15.64729 ms, lb 15.29590 ms, ub 16.10119 ms, ci 0.950
std dev: 2.032476 ms, lb 1.638016 ms, ub 2.527089 ms, ci 0.950
benchmarking sha256/external
mean: 8.217700 ms, lb 7.931324 ms, ub 8.568805 ms, ci 0.950
std dev: 1.614786 ms, lb 1.357791 ms, ub 2.009682 ms, ci 0.950
75 kb file:
benchmarking sha256/internal
mean: 12.16099 ms, lb 11.89566 ms, ub 12.50317 ms, ci 0.950
std dev: 1.531108 ms, lb 1.232353 ms, ub 1.929141 ms, ci 0.950
benchmarking sha256/external
mean: 8.818731 ms, lb 8.425744 ms, ub 9.269550 ms, ci 0.950
std dev: 2.158530 ms, lb 1.916067 ms, ub 2.487242 ms, ci 0.950
50 kb file:
benchmarking sha256/internal
mean: 7.699274 ms, lb 7.560254 ms, ub 7.876605 ms, ci 0.950
std dev: 801.5292 us, lb 655.3344 us, ub 990.4117 us, ci 0.950
benchmarking sha256/external
mean: 8.715779 ms, lb 8.330540 ms, ub 9.102232 ms, ci 0.950
std dev: 1.988089 ms, lb 1.821582 ms, ub 2.181676 ms, ci 0.950
10 kb file:
benchmarking sha256/internal
mean: 1.586105 ms, lb 1.574512 ms, ub 1.604922 ms, ci 0.950
std dev: 74.07235 us, lb 51.71688 us, ub 108.1348 us, ci 0.950
benchmarking sha256/external
mean: 6.873742 ms, lb 6.582765 ms, ub 7.252911 ms, ci 0.950
std dev: 1.689662 ms, lb 1.346310 ms, ub 2.640399 ms, ci 0.950
It's possible to get nice graphical reports out of Criterion, but
this is clear enough, so I stopped here. 50 kb seems a reasonable
cutoff point.
I also used this to benchmark the SHA256 in Haskell's Crypto package.
Surprisingly, it's a *lot* slower than even the Pure.SHA code.
On a 50 kb file:
benchmarking sha256/Crypto
collecting 100 samples, 1 iterations each, in estimated 6.073809 s
mean: 69.89037 ms, lb 69.15831 ms, ub 70.71845 ms, ci 0.950
std dev: 3.995397 ms, lb 3.435775 ms, ub 4.721952 ms, ci 0.950
There's another Haskell library, [SHA2](http://hackage.haskell.org/package/SHA2),
which I should try some time.

View file

@ -10,14 +10,15 @@ all the other git clones, at both the git level and the key/value level.
* transfer info for git-annex-shell **done**
* update files as transfers proceed. See [[progressbars]]
(updating for downloads is easy; for uploads is hard)
* add Transfer queue TChan
* add Transfer queue TChan **done**
* add TransferInfo Map to DaemonStatus for tracking transfers in progress.
**done**
* Poll transfer in progress info files for changes (use inotify again!
wow! hammer, meet nail..), and update the TransferInfo Map **done**
* enqueue Transfers (Uploads) as new files are added to the annex by
Watcher.
* enqueue Tranferrs (Downloads) as new dangling symlinks are noticed by
Watcher.
* add TransferInfo Map to DaemonStatus for tracking transfers in progress.
* Poll transfer in progress info files for changes (use inotify again!
wow! hammer, meet nail..), and update the TransferInfo Map
* Write basic Transfer handling thread. Multiple such threads need to be
able to be run at once. Each will need its own independant copy of the
Annex state monad.
@ -52,6 +53,9 @@ all the other git clones, at both the git level and the key/value level.
signaling a change out of band.
4. Add a hook, so when there's a change to sync, a program can be run
and do its own signaling.
5. --debug will show often unnecessary work being done. Optimise.
6. It would be nice if, when a USB drive is connected,
syncing starts automatically. Use dbus on Linux?
## misc todo
@ -89,13 +93,12 @@ anyway.
that lack content.
* Transfer threads started/stopped as necessary to move data.
(May sometimes want multiple threads downloading, or uploading, or even both.)
type TransferQueue = TChan [Transfer]
-- add (M.Map Transfer TransferInfo) to DaemonStatus
startTransfer :: Transfer -> Annex TransferID
startTransfer :: TransferQueue -> Transfer -> Annex ()
startTransfer q transfer = error "TODO"
stopTransfer :: TransferID -> IO ()
stopTransfer :: TransferQueue -> TransferID -> Annex ()
stopTransfer q transfer = error "TODO"
The assistant needs to find out when `git-annex-shell` is receiving or
sending (triggered by another remote), so it can add data for those too.

View file

@ -0,0 +1,9 @@
[[!comment format=mdwn
username="http://joeyh.name/"
subject="comment 14"
date="2012-07-04T12:43:54Z"
content="""
@Damien, hmm, it should not be using any cp options, unless when it was built there was a cp in the path that supported some option like -p. Can you check with --debug what cp parameters it's trying to use?
"""]]

View file

@ -0,0 +1,9 @@
[[!comment format=mdwn
username="http://joeyh.name/"
subject="comment 15"
date="2012-07-04T13:14:00Z"
content="""
git-annex will now fall back to slower pure Haskell hashing code if `sha256sum`, etc programs are not in PATH. I'd still recommend installing the coreutils, as they're probably faster.
(The `shasum` command seems to come from a perl library, so I have not tried to make git-annex use that one.)
"""]]