From 7227dd8f21f24c2ccadd38e1a3dec7b888a23e92 Mon Sep 17 00:00:00 2001
From: Joey Hess <joey@kitenet.net>
Date: Thu, 22 Dec 2011 21:23:11 -0400
Subject: [PATCH] add escape_var hack

Makes it easy to find files with duplicate contents, anyway.. :)
---
 Utility/Format.hs                             | 48 ++++++++++++-------
 doc/git-annex.mdwn                            |  6 ++-
 doc/tips/finding_duplicate_files.mdwn         | 21 ++++++++
 ...4___command_that_will_skip_duplicates.mdwn |  2 +-
 4 files changed, 58 insertions(+), 19 deletions(-)
 create mode 100644 doc/tips/finding_duplicate_files.mdwn

diff --git a/Utility/Format.hs b/Utility/Format.hs
index d167087011..c0ba465806 100644
--- a/Utility/Format.hs
+++ b/Utility/Format.hs
@@ -18,6 +18,8 @@ import Text.Printf (printf)
 import Data.Char (isAlphaNum, isOctDigit, chr, ord)
 import Data.Maybe (fromMaybe)
 import Data.Word (Word8)
+import Data.List (isPrefixOf)
+import Data.String.Utils (replace)
 import qualified Codec.Binary.UTF8.String
 import qualified Data.Map as M
 
@@ -42,8 +44,16 @@ format :: Format -> M.Map String String -> String
 format f vars = concatMap expand f
 	where
 		expand (Const s) = s
-		expand (Var name padding) = justify padding $
-			fromMaybe "" $ M.lookup name vars
+		expand (Var name padding) = justify padding $ getvar name
+		getvar name
+			| "escaped_" `isPrefixOf` name = 
+				-- escape whitespace too
+				replace " " (e_asc ' ') $
+				replace "\t" (e_asc '\t') $
+				encode_c $
+				getvar' $ drop (length "escaped_") name
+			| otherwise = getvar' name
+		getvar' name = fromMaybe "" $ M.lookup name vars
 		justify p s
 			| p > 0 = take (p - length s) spaces ++ s
 			| p < 0 = s ++ take (-1 * (length s + p)) spaces
@@ -73,9 +83,9 @@ gen = filter (not . empty) . fuse [] . scan [] . decode_c
 		invar f var [] = Const (novar var) : f
 		invar f var (c:cs)
 			| c == '}' = foundvar f var 0 cs
-			| isAlphaNum c = invar f (c:var) cs
+			| isAlphaNum c || c == '_' = invar f (c:var) cs
 			| c == ';' = inpad "" f var cs
-			| otherwise = scan ((Const $ reverse $ novar $ c:var):f) cs
+			| otherwise = scan ((Const $ novar $ c:var):f) cs
 
 		inpad p f var (c:cs)
 			| c == '}' = foundvar f var (readpad $ reverse p) cs
@@ -127,7 +137,7 @@ decode_c s = unescape ("", s)
 				echar a = a
 		handle n = ("", n)
 
-{- Should not need to use this, except for testing decode_c. -}
+{- Inverse of decode_c. -}
 encode_c :: FormatString -> FormatString
 encode_c s = concatMap echar s
 	where
@@ -141,17 +151,23 @@ encode_c s = concatMap echar s
 		echar '\v' = e 'v'
 		echar '\\' = e '\\'
 		echar '"'  = e '"'
-		echar x
-			| ord x < 0x20 = e_num x -- low ascii
-			| ord x >= 256 = e_utf x
-			| ord x > 0x7E = e_num x -- high ascii
-			| otherwise = [x]        -- printable ascii
-			where 
-				showoctal i = '\\' : printf "%03o" i
-				e_num c = showoctal $ ord c
-				-- unicode character is decomposed to
-				-- Word8s and each is shown in octal
-				e_utf c = showoctal =<< (Codec.Binary.UTF8.String.encode [c] :: [Word8])
+		echar c
+			| ord c < 0x20 = e_asc c -- low ascii
+			| ord c >= 256 = e_utf c
+			| ord c > 0x7E = e_asc c -- high ascii
+			| otherwise = [c]        -- printable ascii
+
+-- unicode character is decomposed to individual Word8s,
+-- and each is shown in octal
+e_utf :: Char -> String
+e_utf c = showoctal . toInteger =<<
+	(Codec.Binary.UTF8.String.encode [c] :: [Word8])
+		
+e_asc :: Char -> String
+e_asc c = showoctal $ toInteger $ ord c
+
+showoctal :: Integer -> String				
+showoctal i = '\\' : printf "%03o" i
 
 {- for quickcheck -}
 prop_idempotent_deencode :: String -> Bool
diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn
index 7ad3fac69e..2d0d2597ea 100644
--- a/doc/git-annex.mdwn
+++ b/doc/git-annex.mdwn
@@ -437,8 +437,10 @@ subdirectories).
   Specifies a custom output format. The value is a format string, 
   in which '${var}' is expanded to the value of a variable. To right-justify
   a variable with whitespace, use '${var;width}' ; to left-justify
-  a variable, use '${var;-width}'. Also, '\\n' is a newline, '\\000' is a NULL,
-  etc.
+  a variable, use '${var;-width}'; to escape unusual characters in a variable,
+  use '${escaped_var}'.
+
+  Also, '\\n' is a newline, '\\000' is a NUL, etc.
 
 * -c name=value
 
diff --git a/doc/tips/finding_duplicate_files.mdwn b/doc/tips/finding_duplicate_files.mdwn
new file mode 100644
index 0000000000..94fc85400e
--- /dev/null
+++ b/doc/tips/finding_duplicate_files.mdwn
@@ -0,0 +1,21 @@
+Maybe you had a lot of files scattered around on different drives, and you
+added them all into a single git-annex repository. Some of the files are
+surely duplicates of others.
+
+While git-annex stores the file contents efficiently, it would still
+help in cleaning up this mess if you could find, and perhaps remove,
+the duplicate files.
+
+Here's a command line that will show duplicate sets of files grouped together:
+
+	git annex find --include '*' --format='${file} ${escaped_key}\n' | \
+		sort -k2 | uniq --all-repeated=separate -f1 | \
+		sed 's/ [^ ]*$//'
+
+Here's a command line that will remove one of each duplicate set of files:
+
+	git annex find --include '*' --format='${file} ${escaped_key}\n' | \
+		sort -k2 | uniq --repeated -f1 | sed 's/ [^ ]*$//' | \
+		xargs -d '\n' git rm
+
+--[[Joey]] 
diff --git a/doc/todo/wishlist:_Provide_a___34__git_annex__34___command_that_will_skip_duplicates.mdwn b/doc/todo/wishlist:_Provide_a___34__git_annex__34___command_that_will_skip_duplicates.mdwn
index ca18afc578..9336535788 100644
--- a/doc/todo/wishlist:_Provide_a___34__git_annex__34___command_that_will_skip_duplicates.mdwn
+++ b/doc/todo/wishlist:_Provide_a___34__git_annex__34___command_that_will_skip_duplicates.mdwn
@@ -25,4 +25,4 @@ I want this because I have copies of various of mine (photos, in particular) sca
 
 (As I write this, I realize it's possible to parse the destination of the symlink in a way that does this..)
 
-> 
+> [[done]]; see [[tips/finding_duplicate_files]] --[[Joey]]