git-annex/Annex/DirHashes.hs

{- git-annex file locations
 -
 - Copyright 2010-2017 Joey Hess <id@joeyh.name>
 -
 - Licensed under the GNU GPL version 3 or higher.
 -}

module Annex.DirHashes (
	Hasher,
	HashLevels(..),
	objectHashLevels,
	branchHashLevels,
	branchHashDir,
	dirHashes,
	hashDirMixed,
	hashDirLower,
	display_32bits_as_dir
) where

import Data.Bits
import Data.Word
import Data.Default
import qualified Data.ByteArray

import Common
import Key
import Types.GitConfig
import Types.Difference
import Utility.FileSystemEncoding
import Utility.Hash

{- A function from a Key to the hash directory path used for it.
 - hashDirLower and hashDirMixed each yield a Hasher once given
 - a HashLevels. -}
type Hasher = Key -> FilePath

-- Number of hash levels to use. 2 is the default.
--
-- hashDirs only distinguishes 1 (a single directory level) from
-- everything else (two directory levels).
newtype HashLevels = HashLevels Int

-- The default, used by configHashLevels when no difference is
-- configured, is two levels.
instance Default HashLevels where
	def = HashLevels 2

{- One hash level when the OneLevelObjectHash difference is set in the
 - config, otherwise the default number of levels. -}
objectHashLevels :: GitConfig -> HashLevels
objectHashLevels config = configHashLevels OneLevelObjectHash config

{- One hash level when the OneLevelBranchHash difference is set in the
 - config, otherwise the default number of levels. -}
branchHashLevels :: GitConfig -> HashLevels
branchHashLevels config = configHashLevels OneLevelBranchHash config

{- Looks for the given Difference among the config's annexDifferences.
 - When it is present, a single hash level is used; otherwise the
 - Default instance of HashLevels decides. -}
configHashLevels :: Difference -> GitConfig -> HashLevels
configHashLevels d config =
	if hasDifference d (annexDifferences config)
		then HashLevels 1
		else def

{- The hash directory for a key, using the lower case hash with
 - the number of levels given by branchHashLevels. -}
branchHashDir :: GitConfig -> Key -> String
branchHashDir config key = hashDirLower (branchHashLevels config) key

{- Two different directory hashes may be used. The mixed case hash
 - came first, and is fine, except for the problem of case-strict
 - filesystems such as Linux VFAT (mounted with shortname=mixed),
 - which do not allow using a directory "XX" when "xx" already exists.
 - To support that, some git-annex repositories use the lower-case hash.
 - All special remotes use the lower-case hash for new data, but old data
 - may still use the mixed case hash.
 -
 - The lower-case hasher is listed first. -}
dirHashes :: [HashLevels -> Hasher]
dirHashes = [hashDirLower, hashDirMixed]

{- Turns a hash string into a hash directory path, which always ends
 - with a path separator.
 -
 - The first sz characters of the string name the top directory.
 - With HashLevels 1 that is the whole path; with any other number of
 - levels, the remainder of the string names a subdirectory of it. -}
hashDirs :: HashLevels -> Int -> String -> FilePath
hashDirs (HashLevels levels) sz s = addTrailingPathSeparator $
	case levels of
		1 -> top
		_ -> top </> rest
  where
	top = take sz s
	rest = drop sz s

{- The lower case hash directory of a key.
 -
 - Built from the first 6 characters of the shown md5 of the key's
 - file name form, in directories of 3 characters each.
 - The key is passed through nonChunkKey before hashing. -}
hashDirLower :: HashLevels -> Hasher
hashDirLower n k = hashDirs n 3 (take 6 digest)
  where
	digest = show $ md5 $ encodeBS $ key2file $ nonChunkKey k

{- The mixed case hash directory of a key.
 -
 - This was originally using Data.Hash.MD5 from MissingH. This version
 - is faster, but ugly, as it has to rebuild the Word32 values that the
 - old implementation produced from the bytes of the md5.
 -
 - Only the first 4 characters of the rendered words are used, in
 - directories of 2 characters each. -}
hashDirMixed :: HashLevels -> Hasher
hashDirMixed n k = hashDirs n 2 $ take 4 $
	concatMap display_32bits_as_dir $ packWords $
		map fromIntegral $ Data.ByteArray.unpack digest
  where
	digest = Utility.Hash.md5 $ encodeBS $ key2file $ nonChunkKey k
	-- Combines each group of 4 bytes into one word, with the first
	-- byte of the group in the lowest 8 bits. Any trailing group of
	-- fewer than 4 bytes is discarded.
	packWords (lo:b1:b2:hi:more) =
		(lo .|. shiftL b1 8 .|. shiftL b2 16 .|. shiftL hi 24)
		: packWords more
	packWords _ = []

{- Renders a Word32 as 6 characters for use in a directory name.
 - hashDirMixed builds its paths from this, so the output for a given
 - input must stay exactly as it is.
 -
 - Modified version of display_32bits_as_hex from Data.Hash.MD5
 - in MissingH
 -   Copyright (C) 2001 Ian Lynagh
 -   License: Either BSD or GPL
 -}
display_32bits_as_dir :: Word32 -> String
display_32bits_as_dir w = take 6 $ swapPairs digits
  where
	-- Need 32 characters to use. To avoid inadvertently making
	-- a real word, use letters that appear less frequently.
	alphabet = ['0'..'9'] ++ "zqjxkmvwgpfZQJXKMVWGPF"
	-- Eight digits, stepping through the word 6 bits at a time.
	-- Only the low 5 bits of each step select a character.
	-- The last 2 steps shift by 36 and 42 bits, so those digits
	-- are always 00; that is why only 6 characters are kept.
	digits = [ digit (w `shiftR` (6 * i)) | i <- [0..7] ]
	digit v = alphabet !! fromIntegral (v .&. 31)
	-- Exchanges the two characters of each successive pair.
	swapPairs (a:b:rest) = b : a : swapPairs rest
	swapPairs _ = []
implement annex.tune.objecthashlower Split out Annex.DirHashes which never really belonged in Locations. 2015-01-28 20:51:40 +00:00			`{- git-annex file locations`
			`-`
stop using MissingH for MD5 Cryptonite is faster and allocates less, and I want to get rid of MissingH use. Note that the new dependency on memory is free; it's a dependency of cryptonite. This commit was supported by the NSF-funded DataLad project. 2017-05-15 22:10:13 +00:00			`- Copyright 2010-2017 Joey Hess <id@joeyh.name>`
implement annex.tune.objecthashlower Split out Annex.DirHashes which never really belonged in Locations. 2015-01-28 20:51:40 +00:00			`-`
			`- Licensed under the GNU GPL version 3 or higher.`
			`-}`

			`module Annex.DirHashes (`
			`Hasher,`
			`HashLevels(..),`
			`objectHashLevels,`
implement annex.tune.branchhash1 I hope this doesn't impact speed much -- it does have to pull out a value from Annex state every time it accesses the branch now. The test case I dropped has never caught any problems that I can remember, and would have been rather difficult to convert. 2015-01-28 21:17:26 +00:00			`branchHashLevels,`
			`branchHashDir,`
implement annex.tune.objecthashlower Split out Annex.DirHashes which never really belonged in Locations. 2015-01-28 20:51:40 +00:00			`dirHashes,`
			`hashDirMixed,`
			`hashDirLower,`
followup 2016-09-29 15:33:42 +00:00			`display_32bits_as_dir`
implement annex.tune.objecthashlower Split out Annex.DirHashes which never really belonged in Locations. 2015-01-28 20:51:40 +00:00			`) where`

			`import Data.Bits`
			`import Data.Word`
			`import Data.Default`
stop using MissingH for MD5 Cryptonite is faster and allocates less, and I want to get rid of MissingH use. Note that the new dependency on memory is free; it's a dependency of cryptonite. This commit was supported by the NSF-funded DataLad project. 2017-05-15 22:10:13 +00:00			`import qualified Data.ByteArray`
implement annex.tune.objecthashlower Split out Annex.DirHashes which never really belonged in Locations. 2015-01-28 20:51:40 +00:00
			`import Common`
factor non-type stuff out of Key 2017-02-24 17:42:30 +00:00			`import Key`
implement annex.tune.objecthashlower Split out Annex.DirHashes which never really belonged in Locations. 2015-01-28 20:51:40 +00:00			`import Types.GitConfig`
			`import Types.Difference`
Always use filesystem encoding for all file and handle reads and writes. This is a big scary change. I have convinced myself it should be safe. I hope! 2016-12-24 18:46:31 +00:00			`import Utility.FileSystemEncoding`
stop using MissingH for MD5 Cryptonite is faster and allocates less, and I want to get rid of MissingH use. Note that the new dependency on memory is free; it's a dependency of cryptonite. This commit was supported by the NSF-funded DataLad project. 2017-05-15 22:10:13 +00:00			`import Utility.Hash`
implement annex.tune.objecthashlower Split out Annex.DirHashes which never really belonged in Locations. 2015-01-28 20:51:40 +00:00
			`type Hasher = Key -> FilePath`

			`-- Number of hash levels to use. 2 is the default.`
			`newtype HashLevels = HashLevels Int`

			`instance Default HashLevels where`
			`def = HashLevels 2`

			`objectHashLevels :: GitConfig -> HashLevels`
implement annex.tune.branchhash1 I hope this doesn't impact speed much -- it does have to pull out a value from Annex state every time it accesses the branch now. The test case I dropped has never caught any problems that I can remember, and would have been rather difficult to convert. 2015-01-28 21:17:26 +00:00			`objectHashLevels = configHashLevels OneLevelObjectHash`

			`branchHashLevels :: GitConfig -> HashLevels`
			`branchHashLevels = configHashLevels OneLevelBranchHash`

			`configHashLevels :: Difference -> GitConfig -> HashLevels`
			`configHashLevels d config`
use a Set 2015-01-28 22:17:10 +00:00			`\| hasDifference d (annexDifferences config) = HashLevels 1`
implement annex.tune.objecthashlower Split out Annex.DirHashes which never really belonged in Locations. 2015-01-28 20:51:40 +00:00			`\| otherwise = def`

implement annex.tune.branchhash1 I hope this doesn't impact speed much -- it does have to pull out a value from Annex state every time it accesses the branch now. The test case I dropped has never caught any problems that I can remember, and would have been rather difficult to convert. 2015-01-28 21:17:26 +00:00			`branchHashDir :: GitConfig -> Key -> String`
a few hlints 2015-04-11 04:10:34 +00:00			`branchHashDir = hashDirLower . branchHashLevels`
implement annex.tune.branchhash1 I hope this doesn't impact speed much -- it does have to pull out a value from Annex state every time it accesses the branch now. The test case I dropped has never caught any problems that I can remember, and would have been rather difficult to convert. 2015-01-28 21:17:26 +00:00
implement annex.tune.objecthashlower Split out Annex.DirHashes which never really belonged in Locations. 2015-01-28 20:51:40 +00:00			`{- Two different directory hashes may be used. The mixed case hash`
			`- came first, and is fine, except for the problem of case-strict`
			`- filesystems such as Linux VFAT (mounted with shortname=mixed),`
			`- which do not allow using a directory "XX" when "xx" already exists.`
clarify comment 2018-11-30 16:37:45 +00:00			`- To support that, some git-annex repositories use the lower case-hash.`
			`- All special remotes use the lower-case hash for new data, but old data`
			`- may still used the mixed case hash. -}`
implement annex.tune.objecthashlower Split out Annex.DirHashes which never really belonged in Locations. 2015-01-28 20:51:40 +00:00			`dirHashes :: [HashLevels -> Hasher]`
			`dirHashes = [hashDirLower, hashDirMixed]`

			`hashDirs :: HashLevels -> Int -> String -> FilePath`
			`hashDirs (HashLevels 1) sz s = addTrailingPathSeparator $ take sz s`
			`hashDirs _ sz s = addTrailingPathSeparator $ take sz s </> drop sz s`

stop using MissingH for MD5 Cryptonite is faster and allocates less, and I want to get rid of MissingH use. Note that the new dependency on memory is free; it's a dependency of cryptonite. This commit was supported by the NSF-funded DataLad project. 2017-05-15 22:10:13 +00:00			`hashDirLower :: HashLevels -> Hasher`
			`hashDirLower n k = hashDirs n 3 $ take 6 $ show $ md5 $`
			`encodeBS $ key2file $ nonChunkKey k`

			`{- This was originally using Data.Hash.MD5 from MissingH. This new version`
			`- is faster, but ugly as it has to replicate the 4 Word32's that produced. -}`
implement annex.tune.objecthashlower Split out Annex.DirHashes which never really belonged in Locations. 2015-01-28 20:51:40 +00:00			`hashDirMixed :: HashLevels -> Hasher`
stop using MissingH for MD5 Cryptonite is faster and allocates less, and I want to get rid of MissingH use. Note that the new dependency on memory is free; it's a dependency of cryptonite. This commit was supported by the NSF-funded DataLad project. 2017-05-15 22:10:13 +00:00			`hashDirMixed n k = hashDirs n 2 $ take 4 $ concatMap display_32bits_as_dir $`
			`encodeWord32 $ map fromIntegral $ Data.ByteArray.unpack $`
			`Utility.Hash.md5 $ encodeBS $ key2file $ nonChunkKey k`
implement annex.tune.objecthashlower Split out Annex.DirHashes which never really belonged in Locations. 2015-01-28 20:51:40 +00:00			`where`
stop using MissingH for MD5 Cryptonite is faster and allocates less, and I want to get rid of MissingH use. Note that the new dependency on memory is free; it's a dependency of cryptonite. This commit was supported by the NSF-funded DataLad project. 2017-05-15 22:10:13 +00:00			`encodeWord32 (b1:b2:b3:b4:rest) =`
			`(shiftL b4 24 .\|. shiftL b3 16 .\|. shiftL b2 8 .\|. b1)`
			`: encodeWord32 rest`
			`encodeWord32 _ = []`
implement annex.tune.objecthashlower Split out Annex.DirHashes which never really belonged in Locations. 2015-01-28 20:51:40 +00:00
			`{- modified version of display_32bits_as_hex from Data.Hash.MD5`
stop using MissingH for MD5 Cryptonite is faster and allocates less, and I want to get rid of MissingH use. Note that the new dependency on memory is free; it's a dependency of cryptonite. This commit was supported by the NSF-funded DataLad project. 2017-05-15 22:10:13 +00:00			`- in MissingH`
implement annex.tune.objecthashlower Split out Annex.DirHashes which never really belonged in Locations. 2015-01-28 20:51:40 +00:00			`- Copyright (C) 2001 Ian Lynagh`
			`- License: Either BSD or GPL`
			`-}`
			`display_32bits_as_dir :: Word32 -> String`
			`display_32bits_as_dir w = trim $ swap_pairs cs`
followup 2016-09-29 15:33:42 +00:00			`where`
implement annex.tune.objecthashlower Split out Annex.DirHashes which never really belonged in Locations. 2015-01-28 20:51:40 +00:00			`-- Need 32 characters to use. To avoid inaverdently making`
			`-- a real word, use letters that appear less frequently.`
			`chars = ['0'..'9'] ++ "zqjxkmvwgpfZQJXKMVWGPF"`
			`cs = map (\x -> getc $ (shiftR w (6*x)) .&. 31) [0..7]`
			`getc n = chars !! fromIntegral n`
			`swap_pairs (x1:x2:xs) = x2:x1:swap_pairs xs`
			`swap_pairs _ = []`
			`-- Last 2 will always be 00, so omit.`
			`trim = take 6`