add escape_var hack

Makes it easy to find files with duplicate contents, anyway.. :)
This commit is contained in:
Joey Hess 2011-12-22 21:23:11 -04:00
parent 13a0c292b3
commit 7227dd8f21
4 changed files with 58 additions and 19 deletions

View file

@ -18,6 +18,8 @@ import Text.Printf (printf)
import Data.Char (isAlphaNum, isOctDigit, chr, ord) import Data.Char (isAlphaNum, isOctDigit, chr, ord)
import Data.Maybe (fromMaybe) import Data.Maybe (fromMaybe)
import Data.Word (Word8) import Data.Word (Word8)
import Data.List (isPrefixOf)
import Data.String.Utils (replace)
import qualified Codec.Binary.UTF8.String import qualified Codec.Binary.UTF8.String
import qualified Data.Map as M import qualified Data.Map as M
@ -42,8 +44,16 @@ format :: Format -> M.Map String String -> String
format f vars = concatMap expand f format f vars = concatMap expand f
where where
expand (Const s) = s expand (Const s) = s
expand (Var name padding) = justify padding $ expand (Var name padding) = justify padding $ getvar name
fromMaybe "" $ M.lookup name vars getvar name
| "escaped_" `isPrefixOf` name =
-- escape whitespace too
replace " " (e_asc ' ') $
replace "\t" (e_asc '\t') $
encode_c $
getvar' $ drop (length "escaped_") name
| otherwise = getvar' name
getvar' name = fromMaybe "" $ M.lookup name vars
justify p s justify p s
| p > 0 = take (p - length s) spaces ++ s | p > 0 = take (p - length s) spaces ++ s
| p < 0 = s ++ take (-1 * (length s + p)) spaces | p < 0 = s ++ take (-1 * (length s + p)) spaces
@ -73,9 +83,9 @@ gen = filter (not . empty) . fuse [] . scan [] . decode_c
invar f var [] = Const (novar var) : f invar f var [] = Const (novar var) : f
invar f var (c:cs) invar f var (c:cs)
| c == '}' = foundvar f var 0 cs | c == '}' = foundvar f var 0 cs
| isAlphaNum c = invar f (c:var) cs | isAlphaNum c || c == '_' = invar f (c:var) cs
| c == ';' = inpad "" f var cs | c == ';' = inpad "" f var cs
| otherwise = scan ((Const $ reverse $ novar $ c:var):f) cs | otherwise = scan ((Const $ novar $ c:var):f) cs
inpad p f var (c:cs) inpad p f var (c:cs)
| c == '}' = foundvar f var (readpad $ reverse p) cs | c == '}' = foundvar f var (readpad $ reverse p) cs
@ -127,7 +137,7 @@ decode_c s = unescape ("", s)
echar a = a echar a = a
handle n = ("", n) handle n = ("", n)
{- Should not need to use this, except for testing decode_c. -} {- Inverse of decode_c. -}
encode_c :: FormatString -> FormatString encode_c :: FormatString -> FormatString
encode_c s = concatMap echar s encode_c s = concatMap echar s
where where
@ -141,17 +151,23 @@ encode_c s = concatMap echar s
echar '\v' = e 'v' echar '\v' = e 'v'
echar '\\' = e '\\' echar '\\' = e '\\'
echar '"' = e '"' echar '"' = e '"'
echar x echar c
| ord x < 0x20 = e_num x -- low ascii | ord c < 0x20 = e_asc c -- low ascii
| ord x >= 256 = e_utf x | ord c >= 256 = e_utf c
| ord x > 0x7E = e_num x -- high ascii | ord c > 0x7E = e_asc c -- high ascii
| otherwise = [x] -- printable ascii | otherwise = [c] -- printable ascii
where
showoctal i = '\\' : printf "%03o" i -- unicode character is decomposed to individual Word8s,
e_num c = showoctal $ ord c -- and each is shown in octal
-- unicode character is decomposed to e_utf :: Char -> String
-- Word8s and each is shown in octal e_utf c = showoctal . toInteger =<<
e_utf c = showoctal =<< (Codec.Binary.UTF8.String.encode [c] :: [Word8]) (Codec.Binary.UTF8.String.encode [c] :: [Word8])
e_asc :: Char -> String
e_asc c = showoctal $ toInteger $ ord c
showoctal :: Integer -> String
showoctal i = '\\' : printf "%03o" i
{- for quickcheck -} {- for quickcheck -}
prop_idempotent_deencode :: String -> Bool prop_idempotent_deencode :: String -> Bool

View file

@ -437,8 +437,10 @@ subdirectories).
Specifies a custom output format. The value is a format string, Specifies a custom output format. The value is a format string,
in which '${var}' is expanded to the value of a variable. To right-justify in which '${var}' is expanded to the value of a variable. To right-justify
a variable with whitespace, use '${var;width}' ; to left-justify a variable with whitespace, use '${var;width}' ; to left-justify
a variable, use '${var;-width}'. Also, '\\n' is a newline, '\\000' is a NULL, a variable, use '${var;-width}'; to escape unusual characters in a variable,
etc. use '${escaped_var}'
Also, '\\n' is a newline, '\\000' is a NULL, etc.
* -c name=value * -c name=value

View file

@ -0,0 +1,21 @@
Maybe you had a lot of files scattered around on different drives, and you
added them all into a single git-annex repository. Some of the files are
surely duplicates of others.
While git-annex stores the file contents efficiently, it would still
help in cleaning up this mess if you could find, and perhaps remove,
the duplicate files.
Here's a command line that will show duplicate sets of files grouped together
(it splits fields on whitespace, so it assumes file names contain no spaces):

    git annex find --include '*' --format='${file} ${escaped_key}\n' | \
        sort -k2 | uniq --all-repeated=separate -f1 | \
        sed 's/ [^ ]*$//'

Here's a command line that will remove one of each duplicate set of files:

    git annex find --include '*' --format='${file} ${escaped_key}\n' | \
        sort -k2 | uniq --repeated -f1 | sed 's/ [^ ]*$//' | \
        xargs -d '\n' git rm

--[[Joey]]

View file

@ -25,4 +25,4 @@ I want this because I have copies of various of mine (photos, in particular) sca
(As I write this, I realize it's possible to parse the destination of the symlink in a way that does this..) (As I write this, I realize it's possible to parse the destination of the symlink in a way that does this..)
> > [[done]]; see [[tips/finding_duplicate_files]] --[[Joey]]