From c565340adc1e98886ce8689814f832561c8418a1 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Tue, 28 Aug 2018 18:01:13 -0400 Subject: [PATCH] stop using external hash programs, since cryptonite is faster In 2013, I wrote "Cryptohash benchmarks 90 to 101% faster than external hashers". Re-benchmarking today, I found cryptonite's sha256 consistently outperformed coreutils by 10% for large files. Tested 10 mb, 100 mb, 1 gb files with both sha256 and sha512. And for smaller files, the external process startup time swamps the hash time. Perhaps cryptonite has improved. Or it could just do better on my current CPU Intel(R) Pentium(R) CPU 4410Y @ 1.50GHz). Anyway, even if cryptonite is slower in some situations, seems likely it would only be marginally slower; it's got the same class of highly optimised C code under the hood as coreutils. The main difference between the two sha256 implementations seems to be how much of the inner loop they unroll.. This commit was sponsored by Henrik Riomar on Patreon. --- Backend/Hash.hs | 83 +++++++++++++++------------------------- Build/BundledPrograms.hs | 7 +--- Build/Configure.hs | 33 ---------------- CHANGELOG | 1 + Utility/ExternalSHA.hs | 67 -------------------------------- git-annex.cabal | 1 - 6 files changed, 33 insertions(+), 159 deletions(-) delete mode 100644 Utility/ExternalSHA.hs diff --git a/Backend/Hash.hs b/Backend/Hash.hs index e4845b7725..f9dddaaa11 100644 --- a/Backend/Hash.hs +++ b/Backend/Hash.hs @@ -18,9 +18,7 @@ import Types.Key import Types.Backend import Types.KeySource import Utility.Hash -import Utility.ExternalSHA -import qualified BuildInfo import qualified Data.ByteString.Lazy as L import Data.Char @@ -89,7 +87,7 @@ keyValue :: Hash -> KeySource -> Annex (Maybe Key) keyValue hash source = do let file = contentLocation source filesize <- liftIO $ getFileSize file - s <- hashFile hash file filesize + s <- hashFile hash file return $ Just $ stubKey { keyName = s , keyVariety = hashKeyVariety hash (HasExt False) @@ -116,16 +114,16 @@ selectExtension f reverse $ splitc '.' $ takeExtensions f shortenough e = length e <= 4 -- long enough for "jpeg" -{- A key's checksum is checked during fsck. -} +{- A key's checksum is checked during fsck when it's content is present + - except for in fast mode. -} checkKeyChecksum :: Hash -> Key -> FilePath -> Annex Bool checkKeyChecksum hash key file = catchIOErrorType HardwareFault hwfault $ do fast <- Annex.getState Annex.fast - mstat <- liftIO $ catchMaybeIO $ getFileStatus file - case (mstat, fast) of - (Just stat, False) -> do - filesize <- liftIO $ getFileSize' file stat + exists <- liftIO $ doesFileExist file + case (exists, fast) of + (True, False) -> do showAction "checksum" - check <$> hashFile hash file filesize + check <$> hashFile hash file _ -> return True where expected = keyHash key @@ -192,55 +190,33 @@ trivialMigrate oldkey newbackend afile oldvariety = keyVariety oldkey newvariety = backendVariety newbackend -hashFile :: Hash -> FilePath -> Integer -> Annex String -hashFile hash file filesize = go hash +hashFile :: Hash -> FilePath -> Annex String +hashFile hash file = liftIO $ do + h <- hasher <$> L.readFile file + -- Force full evaluation so file is read and closed. + return (length h `seq` h) where - go MD5Hash = use md5Hasher - go SHA1Hash = usehasher (HashSize 1) - go (SHA2Hash hashsize) = usehasher hashsize - go (SHA3Hash hashsize) = use (sha3Hasher hashsize) - go (SkeinHash hashsize) = use (skeinHasher hashsize) + hasher = case hash of + MD5Hash -> md5Hasher + SHA1Hash -> sha1Hasher + SHA2Hash hashsize -> sha2Hasher hashsize + SHA3Hash hashsize -> sha3Hasher hashsize + SkeinHash hashsize -> skeinHasher hashsize #if MIN_VERSION_cryptonite(0,23,0) - go (Blake2bHash hashsize) = use (blake2bHasher hashsize) - go (Blake2sHash hashsize) = use (blake2sHasher hashsize) - go (Blake2spHash hashsize) = use (blake2spHasher hashsize) + Blake2bHash hashsize -> blake2bHasher hashsize + Blake2sHash hashsize -> blake2sHasher hashsize + Blake2spHash hashsize -> blake2spHasher hashsize #endif - - use hasher = liftIO $ do - h <- hasher <$> L.readFile file - -- Force full evaluation so file is read and closed. - return (length h `seq` h) - - usehasher hashsize@(HashSize sz) = case shaHasher hashsize filesize of - Left sha -> use sha - Right (external, internal) -> - liftIO (externalSHA external sz file) >>= \case - Right r -> return r - Left e -> do - warning e - -- fall back to internal since - -- external command failed - use internal -shaHasher :: HashSize -> Integer -> Either (L.ByteString -> String) (String, L.ByteString -> String) -shaHasher (HashSize hashsize) filesize - | hashsize == 1 = use BuildInfo.sha1 sha1 - | hashsize == 256 = use BuildInfo.sha256 sha2_256 - | hashsize == 224 = use BuildInfo.sha224 sha2_224 - | hashsize == 384 = use BuildInfo.sha384 sha2_384 - | hashsize == 512 = use BuildInfo.sha512 sha2_512 +sha2Hasher :: HashSize -> (L.ByteString -> String) +sha2Hasher (HashSize hashsize) + | hashsize == 256 = use sha2_256 + | hashsize == 224 = use sha2_224 + | hashsize == 384 = use sha2_384 + | hashsize == 512 = use sha2_512 | otherwise = error $ "unsupported SHA size " ++ show hashsize where - use Nothing hasher = Left $ usehasher hasher - use (Just c) hasher - {- Use builtin, but slightly slower hashing for - - smallish files. Cryptohash benchmarks 90 to 101% - - faster than external hashers, depending on the hash - - and system. So there is no point forking an external - - process unless the file is large. -} - | filesize < 1048576 = Left $ usehasher hasher - | otherwise = Right (c, usehasher hasher) - usehasher hasher = show . hasher + use hasher = show . hasher sha3Hasher :: HashSize -> (L.ByteString -> String) sha3Hasher (HashSize hashsize) @@ -280,6 +256,9 @@ blake2spHasher (HashSize hashsize) | otherwise = error $ "unsupported BLAKE2SP size " ++ show hashsize #endif +sha1Hasher :: L.ByteString -> String +sha1Hasher = show . sha1 + md5Hasher :: L.ByteString -> String md5Hasher = show . md5 diff --git a/Build/BundledPrograms.hs b/Build/BundledPrograms.hs index cf16da0f2b..f593c98103 100644 --- a/Build/BundledPrograms.hs +++ b/Build/BundledPrograms.hs @@ -75,13 +75,8 @@ preferredBundledPrograms = catMaybes , BuildInfo.lsof , BuildInfo.gcrypt #ifndef mingw32_HOST_OS - -- All these utilities are included in git for Windows + -- These utilities are included in git for Windows , ifset BuildInfo.curl "curl" - , BuildInfo.sha1 - , BuildInfo.sha256 - , BuildInfo.sha512 - , BuildInfo.sha224 - , BuildInfo.sha384 , Just "cp" #endif #ifdef linux_HOST_OS diff --git a/Build/Configure.hs b/Build/Configure.hs index c490148f50..6157921b58 100644 --- a/Build/Configure.hs +++ b/Build/Configure.hs @@ -7,7 +7,6 @@ module Build.Configure where import Build.TestConfig import Build.Version import Utility.SafeCommand -import Utility.ExternalSHA import Utility.Env.Basic import qualified Git.Version import Utility.Directory @@ -38,38 +37,8 @@ tests = , TestCase "lsof" $ findCmdPath "lsof" "lsof" , TestCase "git-remote-gcrypt" $ findCmdPath "gcrypt" "git-remote-gcrypt" , TestCase "ssh connection caching" getSshConnectionCaching - ] ++ shaTestCases - [ (1, "da39a3ee5e6b4b0d3255bfef95601890afd80709") - , (256, "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855") - , (512, "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e") - , (224, "d14a028c2a3a2bc9476102bb288234c415a2b01f828ea62ac5b3e42f") - , (384, "38b060a751ac96384cd9327eb1b1e36a21fdb71114be07434c0cc7bf63f6e1da274edebfe76f65fbd51ad2f14898b95b") ] -{- shaNsum are the program names used by coreutils. Some systems - - install these with 'g' prefixes. - - - - On some systems, shaN is used instead, but on other - - systems, it might be "hashalot", which does not produce - - usable checksums. Only accept programs that produce - - known-good hashes when run on files. -} -shaTestCases :: [(Int, String)] -> [TestCase] -shaTestCases l = map make l - where - make (n, knowngood) = TestCase key $ - Config key . MaybeStringConfig <$> search (shacmds n) - where - key = "sha" ++ show n - search [] = return Nothing - search (c:cmds) = do - sha <- externalSHA c n "/dev/null" - if sha == Right knowngood - then return $ Just c - else search cmds - - shacmds n = concatMap (\x -> [x, 'g':x]) $ - map (\x -> "sha" ++ show n ++ x) ["sum", ""] - tmpDir :: String tmpDir = "tmp" @@ -128,8 +97,6 @@ androidConfig c = overrides ++ filter (not . overridden) c overrides = [ Config "cp_reflink_auto" $ BoolConfig False , Config "curl" $ BoolConfig False - , Config "sha224" $ MaybeStringConfig Nothing - , Config "sha384" $ MaybeStringConfig Nothing ] overridden (Config k _) = k `elem` overridekeys overridekeys = map (\(Config k _) -> k) overrides diff --git a/CHANGELOG b/CHANGELOG index 7963c2ee6c..502eb25c1c 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -23,6 +23,7 @@ git-annex (6.20180808) UNRELEASED; urgency=medium * linux standalone: When LOCPATH is already set, use it instead of the bundled locales. It can be set to an empty string to use the system locales too. + * Stop using external hash programs, since cryptonite is faster. -- Joey Hess Wed, 08 Aug 2018 11:24:08 -0400 diff --git a/Utility/ExternalSHA.hs b/Utility/ExternalSHA.hs deleted file mode 100644 index 7b08820040..0000000000 --- a/Utility/ExternalSHA.hs +++ /dev/null @@ -1,67 +0,0 @@ -{- Calculating a SHA checksum with an external command. - - - - This is typically a bit faster than using Haskell libraries, - - by around 1% to 10%. Worth it for really big files. - - - - Copyright 2011-2013 Joey Hess - - - - License: BSD-2-clause - -} - -{-# OPTIONS_GHC -fno-warn-tabs #-} - -module Utility.ExternalSHA (externalSHA) where - -import Utility.SafeCommand -import Utility.Process -import Utility.Misc -import Utility.Exception - -import Data.List -import Data.Char -import System.IO - -externalSHA :: String -> Int -> FilePath -> IO (Either String String) -externalSHA command shasize file = do - v <- tryNonAsync $ readsha $ toCommand [File file] - return $ case v of - Right s -> sanitycheck =<< parse (lines s) - Left _ -> Left (command ++ " failed") - where - readsha args = withHandle StdoutHandle createProcessSuccess p $ \h -> do - output <- hGetContentsStrict h - hClose h - return output - where - p = (proc command args) { std_out = CreatePipe } - - {- The first word of the output is taken to be the sha. -} - parse [] = bad - parse (l:_) - | null sha = bad - -- sha is prefixed with \ when filename contains certian chars - | "\\" `isPrefixOf` sha = Right $ drop 1 sha - | otherwise = Right sha - where - sha = fst $ separate (== ' ') l - bad = Left $ command ++ " parse error" - - {- Check that we've correctly parsing the output of the command, - - by making sure the sha we read is of the expected length - - and contains only the right characters. -} - sanitycheck sha - | length sha /= expectedSHALength shasize = - Left $ "Failed to parse the output of " ++ command - | any (`notElem` "0123456789abcdef") sha' = - Left $ "Unexpected character in output of " ++ command ++ "\"" ++ sha ++ "\"" - | otherwise = Right sha' - where - sha' = map toLower sha - -expectedSHALength :: Int -> Int -expectedSHALength 1 = 40 -expectedSHALength 256 = 64 -expectedSHALength 512 = 128 -expectedSHALength 224 = 56 -expectedSHALength 384 = 96 -expectedSHALength _ = 0 diff --git a/git-annex.cabal b/git-annex.cabal index 7e1af56a89..16bd3195b2 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -1021,7 +1021,6 @@ Executable git-annex Utility.Env.Basic Utility.Env.Set Utility.Exception - Utility.ExternalSHA Utility.FileMode Utility.FileSize Utility.FileSystemEncoding