Use cryptohash rather than SHA for hashing.

This is a massive win on OSX, which doesn't normally have a sha256sum command.

Only use external hash commands when the file is larger than 1 MB,
since cryptohash is quite close to them in speed.

SHA is still used to calculate HMACs. I don't quite understand
cryptohash's API for those.

I used the following benchmark to arrive at the 1 MB number.

1 mb file:

benchmarking sha256/internal
mean: 13.86696 ms, lb 13.83010 ms, ub 13.93453 ms, ci 0.950
std dev: 249.3235 us, lb 162.0448 us, ub 458.1744 us, ci 0.950
found 5 outliers among 100 samples (5.0%)
  4 (4.0%) high mild
  1 (1.0%) high severe
variance introduced by outliers: 10.415%
variance is moderately inflated by outliers

benchmarking sha256/external
mean: 14.20670 ms, lb 14.17237 ms, ub 14.27004 ms, ci 0.950
std dev: 230.5448 us, lb 150.7310 us, ub 427.6068 us, ci 0.950
found 3 outliers among 100 samples (3.0%)
  2 (2.0%) high mild
  1 (1.0%) high severe

2 mb file:

benchmarking sha256/internal
mean: 26.44270 ms, lb 26.23701 ms, ub 26.63414 ms, ci 0.950
std dev: 1.012303 ms, lb 925.8921 us, ub 1.122267 ms, ci 0.950
variance introduced by outliers: 35.540%
variance is moderately inflated by outliers

benchmarking sha256/external
mean: 26.84521 ms, lb 26.77644 ms, ub 26.91433 ms, ci 0.950
std dev: 347.7867 us, lb 210.6283 us, ub 571.3351 us, ci 0.950
found 6 outliers among 100 samples (6.0%)

import Crypto.Hash
import Data.ByteString.Lazy as L
import Criterion.Main
import Common

{- File hashed by both benchmarks; it lives on a ram disk. -}
testfile :: FilePath
testfile = "/run/shm/data"

{- Benchmark entry point: runs the "internal" and "external" hashers
 - as a single criterion group named "sha256".
 -
 - whnfIO runs each IO action and evaluates its result only to weak
 - head normal form. -}
main :: IO ()
main = defaultMain
        [ bgroup "sha256"
                [ bench "internal" $ whnfIO internal
                , bench "external" $ whnfIO external
                ]
        ]

{- SHA256 digest of a lazy ByteString, computed with cryptohash's
 - hashlazy. -}
sha256 :: L.ByteString -> Digest SHA256
sha256 content = hashlazy content

{- Hashes the test file inside the Haskell process: lazily reads it,
 - and shows the resulting SHA256 digest. -}
internal :: IO String
internal = fmap (show . sha256) (L.readFile testfile)

{- Hashes the test file by running the external sha256sum command on it.
 -
 - Only the part of the command's output selected by
 - `fst . separate (== ' ')` is returned; presumably that is the text
 - before the first space, i.e. the digest without the trailing file
 - name (NOTE(review): confirm against the definition of separate). -}
external :: IO String
external = fst . separate (== ' ')
        <$> readProcess "sha256sum" [testfile]
This commit is contained in:
Joey Hess 2013-09-22 19:45:08 -04:00
parent 4cee7cbac6
commit 7390f08ef9
8 changed files with 57 additions and 17 deletions

View file

@ -1,6 +1,6 @@
{- git-annex SHA backends {- git-annex SHA backends
- -
- Copyright 2011,2012 Joey Hess <joey@kitenet.net> - Copyright 2011-2013 Joey Hess <joey@kitenet.net>
- -
- Licensed under the GNU GPL version 3 or higher. - Licensed under the GNU GPL version 3 or higher.
-} -}
@ -12,10 +12,10 @@ import qualified Annex
import Types.Backend import Types.Backend
import Types.Key import Types.Key
import Types.KeySource import Types.KeySource
import Utility.Hash
import Utility.ExternalSHA import Utility.ExternalSHA
import qualified Build.SysConfig as SysConfig import qualified Build.SysConfig as SysConfig
import Data.Digest.Pure.SHA
import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy as L
import Data.Char import Data.Char
@ -70,12 +70,14 @@ shaCommand shasize filesize
| shasize == 512 = use SysConfig.sha512 sha512 | shasize == 512 = use SysConfig.sha512 sha512
| otherwise = error $ "bad sha size " ++ show shasize | otherwise = error $ "bad sha size " ++ show shasize
where where
use Nothing sha = Left $ showDigest . sha use Nothing hasher = Left $ show . hasher
use (Just c) sha use (Just c) hasher
{- use builtin, but slower sha for small files {- Use builtin, but slightly slower hashing for
- benchmarking indicates it's faster up to - smallish files. Cryptohash benchmarks 90 to 101%
- and slightly beyond 50 kb files -} - faster than external hashers, depending on the hash
| filesize < 51200 = use Nothing sha - and system. So there is no point forking an external
- process unless the file is large. -}
| filesize < 1048576 = use Nothing hasher
| otherwise = Right c | otherwise = Right c
{- A key is a checksum of its contents. -} {- A key is a checksum of its contents. -}

View file

@ -18,7 +18,6 @@ module Git.CatFile (
import System.IO import System.IO
import qualified Data.ByteString as S import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy as L
import Data.Digest.Pure.SHA
import Data.Char import Data.Char
import System.Process (std_out, std_err) import System.Process (std_out, std_err)
import Numeric import Numeric
@ -31,6 +30,7 @@ import Git.Command
import Git.Types import Git.Types
import Git.FilePath import Git.FilePath
import qualified Utility.CoProcess as CoProcess import qualified Utility.CoProcess as CoProcess
import Utility.Hash
data CatFileHandle = CatFileHandle CoProcess.CoProcessHandle Repo data CatFileHandle = CatFileHandle CoProcess.CoProcessHandle Repo
@ -103,7 +103,7 @@ catObjectDetails (CatFileHandle hdl repo) object = CoProcess.query hdl send rece
} }
fileEncoding h fileEncoding h
content <- L.hGetContents h content <- L.hGetContents h
let sha = (\s -> length s `seq` s) (showDigest $ sha1 content) let sha = (\s -> length s `seq` s) (show $ sha1 content)
ok <- checkSuccessProcess pid ok <- checkSuccessProcess pid
return $ if ok return $ if ok
then Just (content, Ref sha) then Just (content, Ref sha)

View file

@ -10,6 +10,7 @@ module Remote.Bup (remote) where
import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy as L
import qualified Data.Map as M import qualified Data.Map as M
import System.Process import System.Process
import Data.ByteString.Lazy.UTF8 (fromString)
import Common.Annex import Common.Annex
import Types.Remote import Types.Remote
@ -25,8 +26,7 @@ import Remote.Helper.Ssh
import Remote.Helper.Special import Remote.Helper.Special
import Remote.Helper.Encryptable import Remote.Helper.Encryptable
import Crypto import Crypto
import Data.ByteString.Lazy.UTF8 (fromString) import Utility.Hash
import Data.Digest.Pure.SHA
import Utility.UserInfo import Utility.UserInfo
import Annex.Content import Annex.Content
import Annex.UUID import Annex.UUID
@ -277,7 +277,7 @@ bup2GitRemote r
bupRef :: Key -> String bupRef :: Key -> String
bupRef k bupRef k
| Git.Ref.legal True shown = shown | Git.Ref.legal True shown = shown
| otherwise = "git-annex-" ++ showDigest (sha256 (fromString shown)) | otherwise = "git-annex-" ++ show (sha256 (fromString shown))
where where
shown = key2file k shown = key2file k

View file

@ -1,6 +1,7 @@
{- Calculating a SHA checksum with an external command. {- Calculating a SHA checksum with an external command.
- -
- This is often faster than using Haskell libraries. - This is typically a bit faster than using Haskell libraries,
- by around 1% to 10%. Worth it for really big files.
- -
- Copyright 2011-2013 Joey Hess <joey@kitenet.net> - Copyright 2011-2013 Joey Hess <joey@kitenet.net>
- -

29
Utility/Hash.hs Normal file
View file

@ -0,0 +1,29 @@
{- Convenience wrapper around cryptohash.
-
- The resulting Digests can be shown to get a canonical hash encoding. -}
module Utility.Hash where
import Crypto.Hash
import qualified Data.ByteString.Lazy as L
{- SHA1 digest of a lazy ByteString. -}
sha1 :: L.ByteString -> Digest SHA1
sha1 content = hashlazy content
{- SHA224 digest of a lazy ByteString. -}
sha224 :: L.ByteString -> Digest SHA224
sha224 content = hashlazy content
{- SHA256 digest of a lazy ByteString. -}
sha256 :: L.ByteString -> Digest SHA256
sha256 content = hashlazy content
{- SHA384 digest of a lazy ByteString. -}
sha384 :: L.ByteString -> Digest SHA384
sha384 content = hashlazy content
{- SHA512 digest of a lazy ByteString. -}
sha512 :: L.ByteString -> Digest SHA512
sha512 content = hashlazy content
-- sha3 is not yet fully standardized
--sha3 :: L.ByteString -> Digest SHA3
--sha3 = hashlazy

View file

@ -12,6 +12,7 @@ module Utility.WebApp where
import Common import Common
import Utility.Tmp import Utility.Tmp
import Utility.FileMode import Utility.FileMode
import Utility.Hash
import qualified Yesod import qualified Yesod
import qualified Network.Wai as Wai import qualified Network.Wai as Wai
@ -24,7 +25,6 @@ import qualified Data.CaseInsensitive as CI
import Network.Socket import Network.Socket
import Control.Exception import Control.Exception
import Crypto.Random import Crypto.Random
import Data.Digest.Pure.SHA
import qualified Web.ClientSession as CS import qualified Web.ClientSession as CS
import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.UTF8 as L8 import qualified Data.ByteString.Lazy.UTF8 as L8
@ -214,7 +214,7 @@ genRandomToken = do
return $ return $
case genBytes 512 g of case genBytes 512 g of
Left e -> error $ "failed to generate secret token: " ++ show e Left e -> error $ "failed to generate secret token: " ++ show e
Right (s, _) -> showDigest $ sha512 $ L.fromChunks [s] Right (s, _) -> show $ sha512 $ L.fromChunks [s]
{- A Yesod isAuthorized method, which checks the auth cgi parameter {- A Yesod isAuthorized method, which checks the auth cgi parameter
- against a token extracted from the Yesod application. - against a token extracted from the Yesod application.

8
debian/changelog vendored
View file

@ -1,3 +1,11 @@
git-annex (4.20130921) UNRELEASED; urgency=low
* Use cryptohash rather than SHA for hashing when no external hash program
is available. This is a significant speedup for SHA256 on OSX, for
example.
-- Joey Hess <joeyh@debian.org> Sun, 22 Sep 2013 19:42:29 -0400
git-annex (4.20130920) unstable; urgency=low git-annex (4.20130920) unstable; urgency=low
* webapp: Initial support for setting up encrypted removable drives. * webapp: Initial support for setting up encrypted removable drives.

View file

@ -76,7 +76,7 @@ Executable git-annex
Build-Depends: MissingH, hslogger, directory, filepath, Build-Depends: MissingH, hslogger, directory, filepath,
containers, utf8-string, network (>= 2.0), mtl (>= 2), containers, utf8-string, network (>= 2.0), mtl (>= 2),
bytestring, old-locale, time, HTTP, bytestring, old-locale, time, HTTP,
extensible-exceptions, dataenc, SHA, process, json, extensible-exceptions, dataenc, SHA, cryptohash, process, json,
base (>= 4.5 && < 4.8), monad-control, MonadCatchIO-transformers, base (>= 4.5 && < 4.8), monad-control, MonadCatchIO-transformers,
IfElse, text, QuickCheck >= 2.1, bloomfilter, edit-distance, process, IfElse, text, QuickCheck >= 2.1, bloomfilter, edit-distance, process,
SafeSemaphore, uuid, random, dlist, unix-compat SafeSemaphore, uuid, random, dlist, unix-compat