stop using external hash programs, since cryptonite is faster

In 2013, I wrote "Cryptohash benchmarks 90 to 101% faster than external
hashers". Re-benchmarking today, I found cryptonite's sha256 consistently
outperformed coreutils by 10% for large files. Tested 10 mb, 100 mb, 1 gb
files with both sha256 and sha512. And for smaller files, the external
process startup time swamps the hash time.

Perhaps cryptonite has improved. Or it could just do better on my
current CPU Intel(R) Pentium(R) CPU 4410Y @ 1.50GHz). Anyway, even if cryptonite
is slower in some situations, seems likely it would only be marginally slower;
it's got the same class of highly optimised C code under the hood as coreutils.
The main difference between the two sha256 implementations seems to be
how much of the inner loop they unroll..

This commit was sponsored by Henrik Riomar on Patreon.
This commit is contained in:
Joey Hess 2018-08-28 18:01:13 -04:00
parent 6001b3cf45
commit c565340adc
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
6 changed files with 33 additions and 159 deletions

View file

@ -18,9 +18,7 @@ import Types.Key
import Types.Backend import Types.Backend
import Types.KeySource import Types.KeySource
import Utility.Hash import Utility.Hash
import Utility.ExternalSHA
import qualified BuildInfo
import qualified Data.ByteString.Lazy as L import qualified Data.ByteString.Lazy as L
import Data.Char import Data.Char
@ -89,7 +87,7 @@ keyValue :: Hash -> KeySource -> Annex (Maybe Key)
keyValue hash source = do keyValue hash source = do
let file = contentLocation source let file = contentLocation source
filesize <- liftIO $ getFileSize file filesize <- liftIO $ getFileSize file
s <- hashFile hash file filesize s <- hashFile hash file
return $ Just $ stubKey return $ Just $ stubKey
{ keyName = s { keyName = s
, keyVariety = hashKeyVariety hash (HasExt False) , keyVariety = hashKeyVariety hash (HasExt False)
@ -116,16 +114,16 @@ selectExtension f
reverse $ splitc '.' $ takeExtensions f reverse $ splitc '.' $ takeExtensions f
shortenough e = length e <= 4 -- long enough for "jpeg" shortenough e = length e <= 4 -- long enough for "jpeg"
{- A key's checksum is checked during fsck. -} {- A key's checksum is checked during fsck when it's content is present
- except for in fast mode. -}
checkKeyChecksum :: Hash -> Key -> FilePath -> Annex Bool checkKeyChecksum :: Hash -> Key -> FilePath -> Annex Bool
checkKeyChecksum hash key file = catchIOErrorType HardwareFault hwfault $ do checkKeyChecksum hash key file = catchIOErrorType HardwareFault hwfault $ do
fast <- Annex.getState Annex.fast fast <- Annex.getState Annex.fast
mstat <- liftIO $ catchMaybeIO $ getFileStatus file exists <- liftIO $ doesFileExist file
case (mstat, fast) of case (exists, fast) of
(Just stat, False) -> do (True, False) -> do
filesize <- liftIO $ getFileSize' file stat
showAction "checksum" showAction "checksum"
check <$> hashFile hash file filesize check <$> hashFile hash file
_ -> return True _ -> return True
where where
expected = keyHash key expected = keyHash key
@ -192,55 +190,33 @@ trivialMigrate oldkey newbackend afile
oldvariety = keyVariety oldkey oldvariety = keyVariety oldkey
newvariety = backendVariety newbackend newvariety = backendVariety newbackend
hashFile :: Hash -> FilePath -> Integer -> Annex String hashFile :: Hash -> FilePath -> Annex String
hashFile hash file filesize = go hash hashFile hash file = liftIO $ do
h <- hasher <$> L.readFile file
-- Force full evaluation so file is read and closed.
return (length h `seq` h)
where where
go MD5Hash = use md5Hasher hasher = case hash of
go SHA1Hash = usehasher (HashSize 1) MD5Hash -> md5Hasher
go (SHA2Hash hashsize) = usehasher hashsize SHA1Hash -> sha1Hasher
go (SHA3Hash hashsize) = use (sha3Hasher hashsize) SHA2Hash hashsize -> sha2Hasher hashsize
go (SkeinHash hashsize) = use (skeinHasher hashsize) SHA3Hash hashsize -> sha3Hasher hashsize
SkeinHash hashsize -> skeinHasher hashsize
#if MIN_VERSION_cryptonite(0,23,0) #if MIN_VERSION_cryptonite(0,23,0)
go (Blake2bHash hashsize) = use (blake2bHasher hashsize) Blake2bHash hashsize -> blake2bHasher hashsize
go (Blake2sHash hashsize) = use (blake2sHasher hashsize) Blake2sHash hashsize -> blake2sHasher hashsize
go (Blake2spHash hashsize) = use (blake2spHasher hashsize) Blake2spHash hashsize -> blake2spHasher hashsize
#endif #endif
use hasher = liftIO $ do
h <- hasher <$> L.readFile file
-- Force full evaluation so file is read and closed.
return (length h `seq` h)
usehasher hashsize@(HashSize sz) = case shaHasher hashsize filesize of
Left sha -> use sha
Right (external, internal) ->
liftIO (externalSHA external sz file) >>= \case
Right r -> return r
Left e -> do
warning e
-- fall back to internal since
-- external command failed
use internal
shaHasher :: HashSize -> Integer -> Either (L.ByteString -> String) (String, L.ByteString -> String) sha2Hasher :: HashSize -> (L.ByteString -> String)
shaHasher (HashSize hashsize) filesize sha2Hasher (HashSize hashsize)
| hashsize == 1 = use BuildInfo.sha1 sha1 | hashsize == 256 = use sha2_256
| hashsize == 256 = use BuildInfo.sha256 sha2_256 | hashsize == 224 = use sha2_224
| hashsize == 224 = use BuildInfo.sha224 sha2_224 | hashsize == 384 = use sha2_384
| hashsize == 384 = use BuildInfo.sha384 sha2_384 | hashsize == 512 = use sha2_512
| hashsize == 512 = use BuildInfo.sha512 sha2_512
| otherwise = error $ "unsupported SHA size " ++ show hashsize | otherwise = error $ "unsupported SHA size " ++ show hashsize
where where
use Nothing hasher = Left $ usehasher hasher use hasher = show . hasher
use (Just c) hasher
{- Use builtin, but slightly slower hashing for
- smallish files. Cryptohash benchmarks 90 to 101%
- faster than external hashers, depending on the hash
- and system. So there is no point forking an external
- process unless the file is large. -}
| filesize < 1048576 = Left $ usehasher hasher
| otherwise = Right (c, usehasher hasher)
usehasher hasher = show . hasher
sha3Hasher :: HashSize -> (L.ByteString -> String) sha3Hasher :: HashSize -> (L.ByteString -> String)
sha3Hasher (HashSize hashsize) sha3Hasher (HashSize hashsize)
@ -280,6 +256,9 @@ blake2spHasher (HashSize hashsize)
| otherwise = error $ "unsupported BLAKE2SP size " ++ show hashsize | otherwise = error $ "unsupported BLAKE2SP size " ++ show hashsize
#endif #endif
sha1Hasher :: L.ByteString -> String
sha1Hasher = show . sha1
md5Hasher :: L.ByteString -> String md5Hasher :: L.ByteString -> String
md5Hasher = show . md5 md5Hasher = show . md5

View file

@ -75,13 +75,8 @@ preferredBundledPrograms = catMaybes
, BuildInfo.lsof , BuildInfo.lsof
, BuildInfo.gcrypt , BuildInfo.gcrypt
#ifndef mingw32_HOST_OS #ifndef mingw32_HOST_OS
-- All these utilities are included in git for Windows -- These utilities are included in git for Windows
, ifset BuildInfo.curl "curl" , ifset BuildInfo.curl "curl"
, BuildInfo.sha1
, BuildInfo.sha256
, BuildInfo.sha512
, BuildInfo.sha224
, BuildInfo.sha384
, Just "cp" , Just "cp"
#endif #endif
#ifdef linux_HOST_OS #ifdef linux_HOST_OS

View file

@ -7,7 +7,6 @@ module Build.Configure where
import Build.TestConfig import Build.TestConfig
import Build.Version import Build.Version
import Utility.SafeCommand import Utility.SafeCommand
import Utility.ExternalSHA
import Utility.Env.Basic import Utility.Env.Basic
import qualified Git.Version import qualified Git.Version
import Utility.Directory import Utility.Directory
@ -38,38 +37,8 @@ tests =
, TestCase "lsof" $ findCmdPath "lsof" "lsof" , TestCase "lsof" $ findCmdPath "lsof" "lsof"
, TestCase "git-remote-gcrypt" $ findCmdPath "gcrypt" "git-remote-gcrypt" , TestCase "git-remote-gcrypt" $ findCmdPath "gcrypt" "git-remote-gcrypt"
, TestCase "ssh connection caching" getSshConnectionCaching , TestCase "ssh connection caching" getSshConnectionCaching
] ++ shaTestCases
[ (1, "da39a3ee5e6b4b0d3255bfef95601890afd80709")
, (256, "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855")
, (512, "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e")
, (224, "d14a028c2a3a2bc9476102bb288234c415a2b01f828ea62ac5b3e42f")
, (384, "38b060a751ac96384cd9327eb1b1e36a21fdb71114be07434c0cc7bf63f6e1da274edebfe76f65fbd51ad2f14898b95b")
] ]
{- shaNsum are the program names used by coreutils. Some systems
- install these with 'g' prefixes.
-
- On some systems, shaN is used instead, but on other
- systems, it might be "hashalot", which does not produce
- usable checksums. Only accept programs that produce
- known-good hashes when run on files. -}
shaTestCases :: [(Int, String)] -> [TestCase]
shaTestCases l = map make l
where
make (n, knowngood) = TestCase key $
Config key . MaybeStringConfig <$> search (shacmds n)
where
key = "sha" ++ show n
search [] = return Nothing
search (c:cmds) = do
sha <- externalSHA c n "/dev/null"
if sha == Right knowngood
then return $ Just c
else search cmds
shacmds n = concatMap (\x -> [x, 'g':x]) $
map (\x -> "sha" ++ show n ++ x) ["sum", ""]
tmpDir :: String tmpDir :: String
tmpDir = "tmp" tmpDir = "tmp"
@ -128,8 +97,6 @@ androidConfig c = overrides ++ filter (not . overridden) c
overrides = overrides =
[ Config "cp_reflink_auto" $ BoolConfig False [ Config "cp_reflink_auto" $ BoolConfig False
, Config "curl" $ BoolConfig False , Config "curl" $ BoolConfig False
, Config "sha224" $ MaybeStringConfig Nothing
, Config "sha384" $ MaybeStringConfig Nothing
] ]
overridden (Config k _) = k `elem` overridekeys overridden (Config k _) = k `elem` overridekeys
overridekeys = map (\(Config k _) -> k) overrides overridekeys = map (\(Config k _) -> k) overrides

View file

@ -23,6 +23,7 @@ git-annex (6.20180808) UNRELEASED; urgency=medium
* linux standalone: When LOCPATH is already set, use it instead of the * linux standalone: When LOCPATH is already set, use it instead of the
bundled locales. It can be set to an empty string to use the system bundled locales. It can be set to an empty string to use the system
locales too. locales too.
* Stop using external hash programs, since cryptonite is faster.
-- Joey Hess <id@joeyh.name> Wed, 08 Aug 2018 11:24:08 -0400 -- Joey Hess <id@joeyh.name> Wed, 08 Aug 2018 11:24:08 -0400

View file

@ -1,67 +0,0 @@
{- Calculating a SHA checksum with an external command.
-
- This is typically a bit faster than using Haskell libraries,
- by around 1% to 10%. Worth it for really big files.
-
- Copyright 2011-2013 Joey Hess <id@joeyh.name>
-
- License: BSD-2-clause
-}
{-# OPTIONS_GHC -fno-warn-tabs #-}
module Utility.ExternalSHA (externalSHA) where
import Utility.SafeCommand
import Utility.Process
import Utility.Misc
import Utility.Exception
import Data.List
import Data.Char
import System.IO
externalSHA :: String -> Int -> FilePath -> IO (Either String String)
externalSHA command shasize file = do
v <- tryNonAsync $ readsha $ toCommand [File file]
return $ case v of
Right s -> sanitycheck =<< parse (lines s)
Left _ -> Left (command ++ " failed")
where
readsha args = withHandle StdoutHandle createProcessSuccess p $ \h -> do
output <- hGetContentsStrict h
hClose h
return output
where
p = (proc command args) { std_out = CreatePipe }
{- The first word of the output is taken to be the sha. -}
parse [] = bad
parse (l:_)
| null sha = bad
-- sha is prefixed with \ when filename contains certian chars
| "\\" `isPrefixOf` sha = Right $ drop 1 sha
| otherwise = Right sha
where
sha = fst $ separate (== ' ') l
bad = Left $ command ++ " parse error"
{- Check that we've correctly parsing the output of the command,
- by making sure the sha we read is of the expected length
- and contains only the right characters. -}
sanitycheck sha
| length sha /= expectedSHALength shasize =
Left $ "Failed to parse the output of " ++ command
| any (`notElem` "0123456789abcdef") sha' =
Left $ "Unexpected character in output of " ++ command ++ "\"" ++ sha ++ "\""
| otherwise = Right sha'
where
sha' = map toLower sha
expectedSHALength :: Int -> Int
expectedSHALength 1 = 40
expectedSHALength 256 = 64
expectedSHALength 512 = 128
expectedSHALength 224 = 56
expectedSHALength 384 = 96
expectedSHALength _ = 0

View file

@ -1021,7 +1021,6 @@ Executable git-annex
Utility.Env.Basic Utility.Env.Basic
Utility.Env.Set Utility.Env.Set
Utility.Exception Utility.Exception
Utility.ExternalSHA
Utility.FileMode Utility.FileMode
Utility.FileSize Utility.FileSize
Utility.FileSystemEncoding Utility.FileSystemEncoding