stop using MissingH for MD5

Cryptonite is faster and allocates less, and I want to get rid of
MissingH use.

Note that the new dependency on memory is free; it's a dependency of
cryptonite.

This commit was supported by the NSF-funded DataLad project.
This commit is contained in:
Joey Hess 2017-05-15 18:10:13 -04:00
parent 44baa7b306
commit 6dd806f1ad
No known key found for this signature in database
GPG key ID: DB12DB0FF05F8F38
11 changed files with 98 additions and 30 deletions

View file

@ -1,6 +1,6 @@
{- git-annex file locations {- git-annex file locations
- -
- Copyright 2010-2015 Joey Hess <id@joeyh.name> - Copyright 2010-2017 Joey Hess <id@joeyh.name>
- -
- Licensed under the GNU GPL version 3 or higher. - Licensed under the GNU GPL version 3 or higher.
-} -}
@ -19,14 +19,15 @@ module Annex.DirHashes (
import Data.Bits import Data.Bits
import Data.Word import Data.Word
import Data.Hash.MD5
import Data.Default import Data.Default
import qualified Data.ByteArray
import Common import Common
import Key import Key
import Types.GitConfig import Types.GitConfig
import Types.Difference import Types.Difference
import Utility.FileSystemEncoding import Utility.FileSystemEncoding
import Utility.Hash
type Hasher = Key -> FilePath type Hasher = Key -> FilePath
@ -62,15 +63,24 @@ hashDirs :: HashLevels -> Int -> String -> FilePath
hashDirs (HashLevels 1) sz s = addTrailingPathSeparator $ take sz s hashDirs (HashLevels 1) sz s = addTrailingPathSeparator $ take sz s
hashDirs _ sz s = addTrailingPathSeparator $ take sz s </> drop sz s hashDirs _ sz s = addTrailingPathSeparator $ take sz s </> drop sz s
hashDirMixed :: HashLevels -> Hasher
hashDirMixed n k = hashDirs n 2 $ take 4 $ display_32bits_as_dir =<< [a,b,c,d]
where
ABCD (a,b,c,d) = md5 $ md5FilePath $ key2file $ nonChunkKey k
hashDirLower :: HashLevels -> Hasher hashDirLower :: HashLevels -> Hasher
hashDirLower n k = hashDirs n 3 $ take 6 $ md5s $ md5FilePath $ key2file $ nonChunkKey k hashDirLower n k = hashDirs n 3 $ take 6 $ show $ md5 $
encodeBS $ key2file $ nonChunkKey k
{- This was originally using Data.Hash.MD5 from MissingH. This new version
- is faster, but ugly as it has to replicate the 4 Word32's that produced. -}
hashDirMixed :: HashLevels -> Hasher
hashDirMixed n k = hashDirs n 2 $ take 4 $ concatMap display_32bits_as_dir $
encodeWord32 $ map fromIntegral $ Data.ByteArray.unpack $
Utility.Hash.md5 $ encodeBS $ key2file $ nonChunkKey k
where
encodeWord32 (b1:b2:b3:b4:rest) =
(shiftL b4 24 .|. shiftL b3 16 .|. shiftL b2 8 .|. b1)
: encodeWord32 rest
encodeWord32 _ = []
{- modified version of display_32bits_as_hex from Data.Hash.MD5 {- modified version of display_32bits_as_hex from Data.Hash.MD5
- in MissingH
- Copyright (C) 2001 Ian Lynagh - Copyright (C) 2001 Ian Lynagh
- License: Either BSD or GPL - License: Either BSD or GPL
-} -}

View file

@ -33,6 +33,7 @@ import Config
import Annex.Path import Annex.Path
import Utility.Env import Utility.Env
import Utility.FileSystemEncoding import Utility.FileSystemEncoding
import Utility.Hash
import Types.CleanupActions import Types.CleanupActions
import Types.Concurrency import Types.Concurrency
import Git.Env import Git.Env
@ -42,7 +43,6 @@ import Annex.Perms
import Annex.LockPool import Annex.LockPool
#endif #endif
import Data.Hash.MD5
import Control.Concurrent.STM import Control.Concurrent.STM
{- Some ssh commands are fed stdin on a pipe and so should be allowed to {- Some ssh commands are fed stdin on a pipe and so should be allowed to
@ -287,7 +287,7 @@ hostport2socket host Nothing = hostport2socket' host
hostport2socket host (Just port) = hostport2socket' $ host ++ "!" ++ show port hostport2socket host (Just port) = hostport2socket' $ host ++ "!" ++ show port
hostport2socket' :: String -> FilePath hostport2socket' :: String -> FilePath
hostport2socket' s hostport2socket' s
| length s > lengthofmd5s = md5s (Str s) | length s > lengthofmd5s = show $ md5 $ encodeBS s
| otherwise = s | otherwise = s
where where
lengthofmd5s = 32 lengthofmd5s = 32

View file

@ -9,8 +9,7 @@ module Annex.VariantFile where
import Annex.Common import Annex.Common
import Utility.FileSystemEncoding import Utility.FileSystemEncoding
import Utility.Hash
import Data.Hash.MD5
variantMarker :: String variantMarker :: String
variantMarker = ".variant-" variantMarker = ".variant-"
@ -42,4 +41,4 @@ variantFile file key
doubleconflict = variantMarker `isInfixOf` file doubleconflict = variantMarker `isInfixOf` file
shortHash :: String -> String shortHash :: String -> String
shortHash = take 4 . md5s . md5FilePath shortHash = take 4 . show . md5 . encodeBS

View file

@ -7,10 +7,9 @@
module Backend.Utilities where module Backend.Utilities where
import Data.Hash.MD5
import Annex.Common import Annex.Common
import Utility.FileSystemEncoding import Utility.FileSystemEncoding
import Utility.Hash
{- Generates a keyName from an input string. Takes care of sanitizing it. {- Generates a keyName from an input string. Takes care of sanitizing it.
- If it's not too long, the full string is used as the keyName. - If it's not too long, the full string is used as the keyName.
@ -20,7 +19,8 @@ genKeyName :: String -> String
genKeyName s genKeyName s
-- Avoid making keys longer than the length of a SHA256 checksum. -- Avoid making keys longer than the length of a SHA256 checksum.
| bytelen > sha256len = | bytelen > sha256len =
truncateFilePath (sha256len - md5len - 1) s' ++ "-" ++ md5s (Str s) truncateFilePath (sha256len - md5len - 1) s' ++ "-" ++
show (md5 (encodeBS s))
| otherwise = s' | otherwise = s'
where where
s' = preSanitizeKeyName s s' = preSanitizeKeyName s

View file

@ -12,7 +12,6 @@ module Utility.FileSystemEncoding (
useFileSystemEncoding, useFileSystemEncoding,
fileEncoding, fileEncoding,
withFilePath, withFilePath,
md5FilePath,
decodeBS, decodeBS,
encodeBS, encodeBS,
decodeW8, decodeW8,
@ -27,7 +26,6 @@ import qualified GHC.IO.Encoding as Encoding
import Foreign.C import Foreign.C
import System.IO import System.IO
import System.IO.Unsafe import System.IO.Unsafe
import qualified Data.Hash.MD5 as MD5
import Data.Word import Data.Word
import Data.Bits.Utils import Data.Bits.Utils
import Data.List import Data.List
@ -101,10 +99,6 @@ _encodeFilePath fp = unsafePerformIO $ do
GHC.withCString enc fp (GHC.peekCString Encoding.char8) GHC.withCString enc fp (GHC.peekCString Encoding.char8)
`catchNonAsync` (\_ -> return fp) `catchNonAsync` (\_ -> return fp)
{- Encodes a FilePath into a Md5.Str, applying the filesystem encoding. -}
md5FilePath :: FilePath -> MD5.Str
md5FilePath = MD5.Str . _encodeFilePath
{- Decodes a ByteString into a FilePath, applying the filesystem encoding. -} {- Decodes a ByteString into a FilePath, applying the filesystem encoding. -}
decodeBS :: L.ByteString -> FilePath decodeBS :: L.ByteString -> FilePath
#ifndef mingw32_HOST_OS #ifndef mingw32_HOST_OS

View file

@ -1,8 +1,4 @@
{- Convenience wrapper around cryptohash/cryptonite. {- Convenience wrapper around cryptonite's hashing. -}
-
- SHA3 hashes are currently only enabled when using cryptonite,
- because of https://github.com/vincenthz/hs-cryptohash/issues/36
-}
module Utility.Hash ( module Utility.Hash (
sha1, sha1,

View file

@ -25,6 +25,8 @@ import Utility.Path
import Utility.FileMode import Utility.FileMode
import Utility.LockFile.LockStatus import Utility.LockFile.LockStatus
import Utility.ThreadScheduler import Utility.ThreadScheduler
import Utility.Hash
import Utility.FileSystemEncoding
import qualified Utility.LockFile.Posix as Posix import qualified Utility.LockFile.Posix as Posix
import System.IO import System.IO
@ -33,7 +35,6 @@ import Data.Maybe
import Data.List import Data.List
import Network.BSD import Network.BSD
import System.FilePath import System.FilePath
import Data.Hash.MD5
import Control.Applicative import Control.Applicative
import Prelude import Prelude
@ -99,7 +100,9 @@ sideLockFile lockfile = do
f <- absPath lockfile f <- absPath lockfile
let base = intercalate "_" (splitDirectories (makeRelative "/" f)) let base = intercalate "_" (splitDirectories (makeRelative "/" f))
let shortbase = reverse $ take 32 $ reverse base let shortbase = reverse $ take 32 $ reverse base
let md5sum = if base == shortbase then "" else md5s (Str base) let md5sum = if base == shortbase
then ""
else show (md5 (encodeBS base))
dir <- ifM (doesDirectoryExist "/dev/shm") dir <- ifM (doesDirectoryExist "/dev/shm")
( return "/dev/shm" ( return "/dev/shm"
, return "/tmp" , return "/tmp"

15
Utility/Tuple.hs Normal file
View file

@ -0,0 +1,15 @@
{- tuple utility functions
-
- Copyright 2017 Joey Hess <id@joeyh.name>
-
- License: BSD-2-clause
-}
fst3 :: (a,b,c) -> a
fst3 (a,b,c) = a
snd3 :: (a,b,c) -> b
snd3 (a,b,c) = b
thd3 :: (a,b,c) -> c
thd3 (a,b,c) = c

1
debian/control vendored
View file

@ -11,6 +11,7 @@ Build-Depends:
libghc-hslogger-dev, libghc-hslogger-dev,
libghc-pcre-light-dev, libghc-pcre-light-dev,
libghc-cryptonite-dev, libghc-cryptonite-dev,
libghc-memory-dev,
libghc-sandi-dev, libghc-sandi-dev,
libghc-utf8-string-dev, libghc-utf8-string-dev,
libghc-aws-dev (>= 0.9.2-2~), libghc-aws-dev (>= 0.9.2-2~),

View file

@ -0,0 +1,49 @@
[[!comment format=mdwn
username="joey"
subject="""comment 11"""
date="2017-05-15T21:56:52Z"
content="""
Switched from MissingH to cryptonite for md5. It did move md5 out of the top CPU spot but
the overall runtime didn't change much. Memory allocations did go down by a
good amount.
Updated profiles:
git-annex +RTS -p -RTS find
total time = 1.63 secs (1629 ticks @ 1000 us, 1 processor)
total alloc = 1,496,336,496 bytes (excludes profiling overheads)
COST CENTRE MODULE SRC %time %alloc
catchIO Utility.Exception Utility/Exception.hs:79:1-17 14.1 15.1
inAnnex'.checkindirect Annex.Content Annex/Content.hs:(108,9)-(119,39) 10.6 4.8
catches Control.Monad.Catch src/Control/Monad/Catch.hs:(432,1)-(436,76) 8.6 6.9
spanList Data.List.Utils src/Data/List/Utils.hs:(150,1)-(155,36) 6.7 11.1
isAnnexLink Annex.Link Annex/Link.hs:35:1-85 5.0 10.2
keyFile Annex.Locations Annex/Locations.hs:(456,1)-(462,19) 5.0 7.0
readish Utility.PartialPrelude Utility/PartialPrelude.hs:(48,1)-(50,20) 3.8 2.0
startswith Data.List.Utils src/Data/List/Utils.hs:103:1-23 3.6 2.3
splitc Utility.Misc Utility/Misc.hs:(52,1)-(54,25) 3.4 6.5
s2w8 Data.Bits.Utils src/Data/Bits/Utils.hs:65:1-15 2.6 6.4
keyPath Annex.Locations Annex/Locations.hs:(492,1)-(494,23) 2.5 4.4
fileKey.unesc Annex.Locations Annex/Locations.hs:(469,9)-(474,39) 2.0 3.5
copyAndFreeze Data.ByteArray.Methods Data/ByteArray/Methods.hs:(224,1)-(227,21) 1.8 0.5
git-annex +RTS -p -RTS find --not --in web
total time = 5.33 secs (5327 ticks @ 1000 us, 1 processor)
total alloc = 2,908,489,000 bytes (excludes profiling overheads)
COST CENTRE MODULE SRC %time %alloc
catObjectDetails.\ Git.CatFile Git/CatFile.hs:(80,72)-(88,97) 7.8 2.8
catchIO Utility.Exception Utility/Exception.hs:79:1-17 7.6 8.3
spanList Data.List.Utils src/Data/List/Utils.hs:(150,1)-(155,36) 5.8 9.1
readish Utility.PartialPrelude Utility/PartialPrelude.hs:(48,1)-(50,20) 4.5 4.0
parseResp Git.CatFile Git/CatFile.hs:(113,1)-(124,28) 4.4 2.9
readFileStrict Utility.Misc Utility/Misc.hs:33:1-59 3.7 1.6
catches Control.Monad.Catch src/Control/Monad/Catch.hs:(432,1)-(436,76) 3.1 3.6
encodeW8 Utility.FileSystemEncoding Utility/FileSystemEncoding.hs:(131,1)-(133,70) 3.1 2.3
"""]]

View file

@ -361,7 +361,8 @@ Executable git-annex
stm-chans, stm-chans,
securemem, securemem,
crypto-api, crypto-api,
cryptonite cryptonite,
memory
CC-Options: -Wall CC-Options: -Wall
GHC-Options: -Wall -fno-warn-tabs GHC-Options: -Wall -fno-warn-tabs
Extensions: PackageImports Extensions: PackageImports