git-annex/Utility/Format.hs

{- Formatted string handling.
 -
 - Copyright 2010, 2011 Joey Hess <id@joeyh.name>
 -
 - License: BSD-2-clause
 -}

module Utility.Format (
	Format,
	gen,
	format,
	decode_c,
	encode_c,
	prop_isomorphic_deencode
) where

import Text.Printf (printf)
import Data.Char (isAlphaNum, isOctDigit, isHexDigit, isSpace, chr, ord)
import Data.Maybe (fromMaybe)
import Data.Word (Word8)
import Data.List (isPrefixOf)
import qualified Codec.Binary.UTF8.String
import qualified Data.Map as M

import Utility.PartialPrelude

{- A string that may contain C-style escapes and ${var} references.
 - This is what gen and decode_c take as input, and what encode_c
 - produces. -}
type FormatString = String

{- A format consists of a list of fragments. -}
type Format = [Frag]

{- A fragment is either a constant string,
 - or a variable, with a justification. -}
data Frag = Const String | Var String Justify
	deriving (Show)

{- How a variable's value is padded with spaces when it is shorter than
 - the given width. LeftJustified puts the padding after the value,
 - RightJustified puts it before the value, and UnJustified never pads. -}
data Justify = LeftJustified Int | RightJustified Int | UnJustified
	deriving (Show)

{- Maps variable names to the values that format substitutes for them. -}
type Variables = M.Map String String

{- Renders a Format to a string, substituting values from the supplied
 - variables. The Format is not modified, so one Format can be reused
 - with many different sets of variables.
 -
 - A variable that is not set expands to the empty string. A variable
 - named "escaped_foo" expands to the value of "foo", passed through
 - encode_c_strict. Justification pads a short value with spaces up to
 - the requested width; a value that is already wide enough is left
 - alone. -}
format :: Format -> Variables -> String
format frags vars = concatMap render frags
  where
	render (Const text) = text
	render (Var name how) = align how (value name)

	escprefix = "escaped_"
	value name
		| escprefix `isPrefixOf` name =
			encode_c_strict $ lookupvar $ drop (length escprefix) name
		| otherwise = lookupvar name
	lookupvar k = M.findWithDefault "" k vars

	align how s = case how of
		UnJustified -> s
		LeftJustified width -> s ++ padding width
		RightJustified width -> padding width ++ s
	  where
		-- replicate yields nothing when the count is not positive
		padding width = replicate (width - length s) ' '

{- Parses a format string, such as "${foo} ${bar;10} ${baz;-10}\n",
 - into a Format that can be passed to format.
 -
 - C-style escapes are decoded first. A variable is written ${name},
 - where the name is made up of alphanumerics and underscores. It can
 - be followed by a semicolon and a width: a positive width right
 - justifies the value, a negative width left justifies it, and a width
 - of 0 means no justification. Text that starts like a variable but is
 - malformed or unterminated is kept as constant text.
 -
 - (This is the same type of format string used by dpkg-query.)
 -}
gen :: FormatString -> Format
gen input = filter (not . empty) $ merge [] $ plain [] $ decode_c input
  where
	-- The parser conses fragments onto an accumulator, so its result
	-- is in reverse order, and is full of tiny adjacent Consts.
	-- A single pass of merge restores the order and joins the Consts.
	merge done [] = done
	merge done (Const later:Const earlier:rest) =
		merge done (Const (earlier ++ later) : rest)
	merge done (frag:rest) = merge (frag:done) rest

	-- Outside of any variable; watching for the "${" that opens one.
	plain acc ('$':'{':rest) = name acc [] rest
	plain acc (c:rest@(_:_)) = plain (Const [c] : acc) rest
	plain acc leftover = Const leftover : acc

	-- Inside a variable; nm holds its name so far, reversed.
	name acc nm [] = Const (literal nm) : acc
	name acc nm (c:rest) = case c of
		'}' -> emit acc nm UnJustified rest
		';' -> width [] acc nm rest
		_
			| isAlphaNum c || c == '_' -> name acc (c:nm) rest
			| otherwise -> plain (Const (literal (c:nm)) : acc) rest

	-- After the ';' of a variable; w holds the width so far, reversed.
	width w acc nm [] = Const (literal (w ++ ";" ++ nm)) : acc
	width w acc nm (c:rest)
		| c == '}' = emit acc nm (justification (reverse w)) rest
		| otherwise = width (c:w) acc nm rest

	-- A width for which readish has no result is treated as 0.
	justification spec = case fromMaybe 0 (readish spec) of
		0 -> UnJustified
		n
			| n < 0 -> LeftJustified (negate n)
			| otherwise -> RightJustified n

	-- Turns the reversed text of a failed variable back into the
	-- characters that were consumed while reading it.
	literal rev = "${" ++ reverse rev
	emit acc nm j rest = plain (Var (reverse nm) j : acc) rest

{- True only for a constant fragment that holds no text. -}
empty :: Frag -> Bool
empty frag = case frag of
	Const s -> null s
	_ -> False

{- Decodes a C-style encoding, where \n is a newline, \NNN is an octal
 - encoded character, and \xNN is a hex encoded character.
 -
 - A backslash followed by any other character yields that character
 - itself, so a doubled backslash decodes to a single backslash.
 - A lone backslash at the very end of the input has nothing to escape,
 - and is passed through unchanged.
 -}
decode_c :: FormatString -> String
decode_c [] = []
decode_c s = unescape ("", s)
  where
	e = '\\'
	unescape (b, []) = b
	-- Copy everything up to the next '\', then decode the escape
	-- that starts there, and carry on with whatever follows it.
	unescape (b, v) = b ++ fst pair ++ unescape (handle $ snd pair)
	  where
		pair = span (/= e) v
	isescape x = x == e
	-- handle is passed input that is either empty or starts with '\'.
	-- It returns the decoded text, and the input that remains.
	handle (x:'x':n1:n2:rest)
		| isescape x && allhex = (fromhex, rest)
	  where
		allhex = isHexDigit n1 && isHexDigit n2
		fromhex = [chr $ readhex [n1, n2]]
		-- read cannot fail here; both digits were checked above
		readhex h = Prelude.read $ "0x" ++ h :: Int
	handle (x:n1:n2:n3:rest)
		| isescape x && alloctal = (fromoctal, rest)
	  where
		alloctal = isOctDigit n1 && isOctDigit n2 && isOctDigit n3
		fromoctal = [chr $ readoctal [n1, n2, n3]]
		-- read cannot fail here; all digits were checked above
		readoctal o = Prelude.read $ "0o" ++ o :: Int
	-- \C is used for a few special characters
	handle (x:nc:rest)
		| isescape x = ([echar nc], rest)
	  where
		echar 'a' = '\a'
		echar 'b' = '\b'
		echar 'f' = '\f'
		echar 'n' = '\n'
		echar 'r' = '\r'
		echar 't' = '\t'
		echar 'v' = '\v'
		echar a = a
	-- Only reached with empty input, or with a single trailing '\'.
	-- The leftover is emitted as output. Handing it back as remaining
	-- input would make unescape find the same '\' again, consume
	-- nothing, and never terminate.
	handle n = (n, "")

{- Encodes a string with C-style escapes, so that decode_c can turn it
 - back into the original. No extra characters are escaped beyond those
 - that encode_c' always escapes, so a plain space is left as-is.
 -
 - NOTE(review): characters >= 256 are written as one octal escape per
 - encoded byte, which decode_c appears to read back as one character
 - per byte; confirm whether a round trip is expected for such input. -}
encode_c :: String -> FormatString
encode_c s = encode_c' (\_ -> False) s

{- Like encode_c, but additionally escapes every character that isSpace
 - accepts. Whitespace that has a named escape, such as a newline,
 - still uses it; other whitespace, such as a plain space, is written
 - in octal. -}
encode_c_strict :: String -> FormatString
encode_c_strict s = encode_c' isSpace s

{- Encodes a string with C-style escapes. The predicate selects
 - additional characters to escape, on top of those always escaped.
 -
 - Characters with a named escape (\a \b \f \n \r \t \v), backslash,
 - and double quote are written as a backslash followed by one
 - character. Anything else outside the range 0x20 to 0x7E, and
 - anything the predicate accepts, is written in 3 digit octal. -}
encode_c' :: (Char -> Bool) -> String -> FormatString
encode_c' p = concatMap enc
  where
	-- characters that get a one letter escape, and that letter
	named =
		[ ('\a', 'a'), ('\b', 'b'), ('\f', 'f'), ('\n', 'n')
		, ('\r', 'r'), ('\t', 't'), ('\v', 'v')
		, ('\\', '\\'), ('"', '"')
		]
	enc c = case lookup c named of
		Just l -> ['\\', l]
		Nothing
			-- unicode; broken up into individual Word8s,
			-- each of which is shown in octal
			| code >= 256 -> concatMap octal utf8bytes
			-- low ascii, high ascii, or selected by the predicate
			| code < 0x20 || code > 0x7E || p c -> octal code
			-- printable ascii
			| otherwise -> [c]
	  where
		code = ord c
		utf8bytes = Codec.Binary.UTF8.String.encode [c] :: [Word8]
	octal i = '\\' : printf "%03o" i

{- Property for quickcheck: encoding a string with encode_c and then
 - decoding the result with decode_c gives back the same string. -}
prop_isomorphic_deencode :: String -> Bool
prop_isomorphic_deencode s = roundtrip == s
  where
	roundtrip = decode_c $ encode_c s
add a text formatter This is built for speed; a format string is parsed once, generating a Format, that can be applied repeatedly to different sets of variables to generate output. 2011-12-22 21:59:14 +00:00			`{- Formatted string handling.`
			`-`
update my email address and homepage url 2015-01-21 16:50:09 +00:00			`- Copyright 2010, 2011 Joey Hess <id@joeyh.name>`
add a text formatter This is built for speed; a format string is parsed once, generating a Format, that can be applied repeatedly to different sets of variables to generate output. 2011-12-22 21:59:14 +00:00			`-`
relicense general utility library code to BSD Omitted a couple of files what have had significant contributions from others. 2014-05-10 14:01:27 +00:00			`- License: BSD-2-clause`
add a text formatter This is built for speed; a format string is parsed once, generating a Format, that can be applied repeatedly to different sets of variables to generate output. 2011-12-22 21:59:14 +00:00			`-}`

handle C-style escapes in Format I was happily able to repurpose some code from Git.Filename to handle this. I remember writing that code... a whole afternoon at a coffee shop, after which I felt I'd struggled with Haskell and git, and sorta lost, in needing to write this nasty piece of code. But was also pleased at the use of a pair of functions and quickcheck that allowed me to get it 100% right. So, turns out I not only got it right, but the code wasn't as special-purpose as I'd feared. Yay! 2011-12-23 00:14:35 +00:00			`module Utility.Format (`
			`Format,`
			`gen,`
			`format,`
			`decode_c,`
			`encode_c,`
fix use of hifalutin terminology 2015-11-16 18:37:31 +00:00			`prop_isomorphic_deencode`
handle C-style escapes in Format I was happily able to repurpose some code from Git.Filename to handle this. I remember writing that code... a whole afternoon at a coffee shop, after which I felt I'd struggled with Haskell and git, and sorta lost, in needing to write this nasty peice of code. But was also pleased at the use of a pair of functions and quickcheck that allowed me to get it 100% right. So, turns out I not only got it right, but the code wasn't as special-purpose as I'd feared. Yay! 2011-12-23 00:14:35 +00:00			`) where`
add a text formatter This is built for speed; a format string is parsed once, generating a Format, that can be applied repeatedly to different sets of variables to generate output. 2011-12-22 21:59:14 +00:00
			`import Text.Printf (printf)`
gpg secret keys list parsing Note that Utility.Format.prop_idempotent_deencode does not hold now that hex escaped characters are supported. quickcheck fails to notice this, so I have left it as-is for now. 2013-09-16 16:57:39 +00:00			`import Data.Char (isAlphaNum, isOctDigit, isHexDigit, isSpace, chr, ord)`
handle C-style escapes in Format I was happily able to repurpose some code from Git.Filename to handle this. I remember writing that code... a whole afternoon at a coffee shop, after which I felt I'd struggled with Haskell and git, and sorta lost, in needing to write this nasty peice of code. But was also pleased at the use of a pair of functions and quickcheck that allowed me to get it 100% right. So, turns out I not only got it right, but the code wasn't as special-purpose as I'd feared. Yay! 2011-12-23 00:14:35 +00:00			`import Data.Maybe (fromMaybe)`
			`import Data.Word (Word8)`
add escape_var hack Makes it easy to find files with duplicate contents, anyway.. :) 2011-12-23 01:23:11 +00:00			`import Data.List (isPrefixOf)`
handle C-style escapes in Format I was happily able to repurpose some code from Git.Filename to handle this. I remember writing that code... a whole afternoon at a coffee shop, after which I felt I'd struggled with Haskell and git, and sorta lost, in needing to write this nasty peice of code. But was also pleased at the use of a pair of functions and quickcheck that allowed me to get it 100% right. So, turns out I not only got it right, but the code wasn't as special-purpose as I'd feared. Yay! 2011-12-23 00:14:35 +00:00			`import qualified Codec.Binary.UTF8.String`
add a text formatter This is built for speed; a format string is parsed once, generating a Format, that can be applied repeatedly to different sets of variables to generate output. 2011-12-22 21:59:14 +00:00			`import qualified Data.Map as M`

			`import Utility.PartialPrelude`

			`type FormatString = String`

better data type 2011-12-22 23:56:31 +00:00			`{- A format consists of a list of fragments. -}`
			`type Format = [Frag]`
add a text formatter This is built for speed; a format string is parsed once, generating a Format, that can be applied repeatedly to different sets of variables to generate output. 2011-12-22 21:59:14 +00:00
cleanup 2011-12-23 04:36:25 +00:00			`{- A fragment is either a constant string,`
			`- or a variable, with a justification. -}`
			`data Frag = Const String \| Var String Justify`
add a text formatter This is built for speed; a format string is parsed once, generating a Format, that can be applied repeatedly to different sets of variables to generate output. 2011-12-22 21:59:14 +00:00			`deriving (Show)`

cleanup 2011-12-23 04:36:25 +00:00			`data Justify = LeftJustified Int \| RightJustified Int \| UnJustified`
			`deriving (Show)`
better data type 2011-12-22 23:56:31 +00:00
tweak 2012-01-21 06:24:12 +00:00			`type Variables = M.Map String String`

add a text formatter This is built for speed; a format string is parsed once, generating a Format, that can be applied repeatedly to different sets of variables to generate output. 2011-12-22 21:59:14 +00:00			`{- Expands a Format using some variables, generating a formatted string.`
			`- This can be repeatedly called, efficiently. -}`
tweak 2012-01-21 06:24:12 +00:00			`format :: Format -> Variables -> String`
better data type 2011-12-22 23:56:31 +00:00			`format f vars = concatMap expand f`
finished where indentation changes 2012-12-13 04:24:19 +00:00			`where`
			`expand (Const s) = s`
			`expand (Var name j)`
			\| "escaped_" `isPrefixOf` name =
			`justify j $ encode_c_strict $`
			`getvar $ drop (length "escaped_") name`
			`\| otherwise = justify j $ getvar name`
			`getvar name = fromMaybe "" $ M.lookup name vars`
			`justify UnJustified s = s`
			`justify (LeftJustified i) s = s ++ pad i s`
			`justify (RightJustified i) s = pad i s ++ s`
			`pad i s = take (i - length s) spaces`
			`spaces = repeat ' '`
add a text formatter This is built for speed; a format string is parsed once, generating a Format, that can be applied repeatedly to different sets of variables to generate output. 2011-12-22 21:59:14 +00:00
			`{- Generates a Format that can be used to expand variables in a`
better data type 2011-12-22 23:56:31 +00:00			`- format string, such as "${foo} ${bar;10} ${baz;-10}\n"`
add a text formatter This is built for speed; a format string is parsed once, generating a Format, that can be applied repeatedly to different sets of variables to generate output. 2011-12-22 21:59:14 +00:00			`-`
			`- (This is the same type of format string used by dpkg-query.)`
			`-}`
			`gen :: FormatString -> Format`
handle C-style escapes in Format I was happily able to repurpose some code from Git.Filename to handle this. I remember writing that code... a whole afternoon at a coffee shop, after which I felt I'd struggled with Haskell and git, and sorta lost, in needing to write this nasty peice of code. But was also pleased at the use of a pair of functions and quickcheck that allowed me to get it 100% right. So, turns out I not only got it right, but the code wasn't as special-purpose as I'd feared. Yay! 2011-12-23 00:14:35 +00:00			`gen = filter (not . empty) . fuse [] . scan [] . decode_c`
finished where indentation changes 2012-12-13 04:24:19 +00:00			`where`
			`-- The Format is built up in reverse, for efficiency,`
			`-- and can have many adjacent Consts. Fusing it fixes both`
			`-- problems.`
			`fuse f [] = f`
			`fuse f (Const c1:Const c2:vs) = fuse f $ Const (c2++c1) : vs`
			`fuse f (v:vs) = fuse (v:f) vs`

			`scan f (a:b:cs)`
			`\| a == '$' && b == '{' = invar f [] cs`
			`\| otherwise = scan (Const [a] : f ) (b:cs)`
			`scan f v = Const v : f`

			`invar f var [] = Const (novar var) : f`
			`invar f var (c:cs)`
			`\| c == '}' = foundvar f var UnJustified cs`
			`\| isAlphaNum c \|\| c == '_' = invar f (c:var) cs`
			`\| c == ';' = inpad "" f var cs`
			`\| otherwise = scan ((Const $ novar $ c:var):f) cs`

			`inpad p f var (c:cs)`
			`\| c == '}' = foundvar f var (readjustify $ reverse p) cs`
			`\| otherwise = inpad (c:p) f var cs`
			`inpad p f var [] = Const (novar $ p++";"++var) : f`
			`readjustify = getjustify . fromMaybe 0 . readish`
			`getjustify i`
			`\| i == 0 = UnJustified`
			`\| i < 0 = LeftJustified (-1 * i)`
			`\| otherwise = RightJustified i`
			`novar v = "${" ++ reverse v`
			`foundvar f v p = scan (Var (reverse v) p : f)`
handle C-style escapes in Format I was happily able to repurpose some code from Git.Filename to handle this. I remember writing that code... a whole afternoon at a coffee shop, after which I felt I'd struggled with Haskell and git, and sorta lost, in needing to write this nasty peice of code. But was also pleased at the use of a pair of functions and quickcheck that allowed me to get it 100% right. So, turns out I not only got it right, but the code wasn't as special-purpose as I'd feared. Yay! 2011-12-23 00:14:35 +00:00
reorg 2011-12-23 00:21:42 +00:00			`empty :: Frag -> Bool`
			`empty (Const "") = True`
			`empty _ = False`
handle C-style escapes in Format I was happily able to repurpose some code from Git.Filename to handle this. I remember writing that code... a whole afternoon at a coffee shop, after which I felt I'd struggled with Haskell and git, and sorta lost, in needing to write this nasty peice of code. But was also pleased at the use of a pair of functions and quickcheck that allowed me to get it 100% right. So, turns out I not only got it right, but the code wasn't as special-purpose as I'd feared. Yay! 2011-12-23 00:14:35 +00:00
			`{- Decodes a C-style encoding, where \n is a newline, \NNN is an octal`
gpg secret keys list parsing Note that Utility.Format.prop_idempotent_deencode does not hold now that hex escaped characters are supported. quickcheck fails to notice this, so I have left it as-is for now. 2013-09-16 16:57:39 +00:00			`- encoded character, and \xNN is a hex encoded character.`
handle C-style escapes in Format I was happily able to repurpose some code from Git.Filename to handle this. I remember writing that code... a whole afternoon at a coffee shop, after which I felt I'd struggled with Haskell and git, and sorta lost, in needing to write this nasty peice of code. But was also pleased at the use of a pair of functions and quickcheck that allowed me to get it 100% right. So, turns out I not only got it right, but the code wasn't as special-purpose as I'd feared. Yay! 2011-12-23 00:14:35 +00:00			`-}`
better types 2016-02-14 20:26:39 +00:00			`decode_c :: FormatString -> String`
handle C-style escapes in Format I was happily able to repurpose some code from Git.Filename to handle this. I remember writing that code... a whole afternoon at a coffee shop, after which I felt I'd struggled with Haskell and git, and sorta lost, in needing to write this nasty peice of code. But was also pleased at the use of a pair of functions and quickcheck that allowed me to get it 100% right. So, turns out I not only got it right, but the code wasn't as special-purpose as I'd feared. Yay! 2011-12-23 00:14:35 +00:00			`decode_c [] = []`
			`decode_c s = unescape ("", s)`
finished where indentation changes 2012-12-13 04:24:19 +00:00			`where`
			`e = '\\'`
			`unescape (b, []) = b`
			`-- look for escapes starting with '\'`
			`unescape (b, v) = b ++ fst pair ++ unescape (handle $ snd pair)`
			`where`
			`pair = span (/= e) v`
			`isescape x = x == e`
gpg secret keys list parsing Note that Utility.Format.prop_idempotent_deencode does not hold now that hex escaped characters are supported. quickcheck fails to notice this, so I have left it as-is for now. 2013-09-16 16:57:39 +00:00			`handle (x:'x':n1:n2:rest)`
			`\| isescape x && allhex = (fromhex, rest)`
			`where`
fix some mixed space+tab indentation This fixes all instances of " \t" in the code base. Most common case seems to be after a "where" line; probably vim copied the two space layout of that line. Done as a background task while listening to episode 2 of the Type Theory podcast. 2014-10-09 18:53:13 +00:00			`allhex = isHexDigit n1 && isHexDigit n2`
gpg secret keys list parsing Note that Utility.Format.prop_idempotent_deencode does not hold now that hex escaped characters are supported. quickcheck fails to notice this, so I have left it as-is for now. 2013-09-16 16:57:39 +00:00			`fromhex = [chr $ readhex [n1, n2]]`
			`readhex h = Prelude.read $ "0x" ++ h :: Int`
finished where indentation changes 2012-12-13 04:24:19 +00:00			`handle (x:n1:n2:n3:rest)`
			`\| isescape x && alloctal = (fromoctal, rest)`
			`where`
			`alloctal = isOctDigit n1 && isOctDigit n2 && isOctDigit n3`
			`fromoctal = [chr $ readoctal [n1, n2, n3]]`
			`readoctal o = Prelude.read $ "0o" ++ o :: Int`
			`-- \C is used for a few special characters`
			`handle (x:nc:rest)`
			`\| isescape x = ([echar nc], rest)`
			`where`
			`echar 'a' = '\a'`
			`echar 'b' = '\b'`
			`echar 'f' = '\f'`
			`echar 'n' = '\n'`
			`echar 'r' = '\r'`
			`echar 't' = '\t'`
			`echar 'v' = '\v'`
			`echar a = a`
			`handle n = ("", n)`
handle C-style escapes in Format I was happily able to repurpose some code from Git.Filename to handle this. I remember writing that code... a whole afternoon at a coffee shop, after which I felt I'd struggled with Haskell and git, and sorta lost, in needing to write this nasty peice of code. But was also pleased at the use of a pair of functions and quickcheck that allowed me to get it 100% right. So, turns out I not only got it right, but the code wasn't as special-purpose as I'd feared. Yay! 2011-12-23 00:14:35 +00:00
add escape_var hack Makes it easy to find files with duplicate contents, anyway.. :) 2011-12-23 01:23:11 +00:00			`{- Inverse of decode_c. -}`
better types 2016-02-14 20:26:39 +00:00			`encode_c :: String -> FormatString`
cleanup 2011-12-23 04:36:25 +00:00			`encode_c = encode_c' (const False)`

			`{- Encodes more strictly, including whitespace. -}`
better types 2016-02-14 20:26:39 +00:00			`encode_c_strict :: String -> FormatString`
cleanup 2011-12-23 04:36:25 +00:00			`encode_c_strict = encode_c' isSpace`

better types 2016-02-14 20:26:39 +00:00			`encode_c' :: (Char -> Bool) -> String -> FormatString`
cleanup 2011-12-23 04:36:25 +00:00			`encode_c' p = concatMap echar`
finished where indentation changes 2012-12-13 04:24:19 +00:00			`where`
			`e c = '\\' : [c]`
			`echar '\a' = e 'a'`
			`echar '\b' = e 'b'`
			`echar '\f' = e 'f'`
			`echar '\n' = e 'n'`
			`echar '\r' = e 'r'`
			`echar '\t' = e 't'`
			`echar '\v' = e 'v'`
			`echar '\\' = e '\\'`
			`echar '"' = e '"'`
			`echar c`
			`\| ord c < 0x20 = e_asc c -- low ascii`
			`\| ord c >= 256 = e_utf c -- unicode`
			`\| ord c > 0x7E = e_asc c -- high ascii`
			`\| p c = e_asc c -- unprintable ascii`
			`\| otherwise = [c] -- printable ascii`
			`-- unicode character is decomposed to individual Word8s,`
			`-- and each is shown in octal`
			`e_utf c = showoctal =<< (Codec.Binary.UTF8.String.encode [c] :: [Word8])`
			`e_asc c = showoctal $ ord c`
			`showoctal i = '\\' : printf "%03o" i`
handle C-style escapes in Format I was happily able to repurpose some code from Git.Filename to handle this. I remember writing that code... a whole afternoon at a coffee shop, after which I felt I'd struggled with Haskell and git, and sorta lost, in needing to write this nasty peice of code. But was also pleased at the use of a pair of functions and quickcheck that allowed me to get it 100% right. So, turns out I not only got it right, but the code wasn't as special-purpose as I'd feared. Yay! 2011-12-23 00:14:35 +00:00
			`{- for quickcheck -}`
fix use of hifalutin terminology 2015-11-16 18:37:31 +00:00			`prop_isomorphic_deencode :: String -> Bool`
			`prop_isomorphic_deencode s = s == decode_c (encode_c s)`