From 7227dd8f21f24c2ccadd38e1a3dec7b888a23e92 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 22 Dec 2011 21:23:11 -0400 Subject: [PATCH] add escape_var hack Makes it easy to find files with duplicate contents, anyway.. :) --- Utility/Format.hs | 48 ++++++++++++------- doc/git-annex.mdwn | 6 ++- doc/tips/finding_duplicate_files.mdwn | 21 ++++++++ ...4___command_that_will_skip_duplicates.mdwn | 2 +- 4 files changed, 58 insertions(+), 19 deletions(-) create mode 100644 doc/tips/finding_duplicate_files.mdwn diff --git a/Utility/Format.hs b/Utility/Format.hs index d167087011..c0ba465806 100644 --- a/Utility/Format.hs +++ b/Utility/Format.hs @@ -18,6 +18,8 @@ import Text.Printf (printf) import Data.Char (isAlphaNum, isOctDigit, chr, ord) import Data.Maybe (fromMaybe) import Data.Word (Word8) +import Data.List (isPrefixOf) +import Data.String.Utils (replace) import qualified Codec.Binary.UTF8.String import qualified Data.Map as M @@ -42,8 +44,16 @@ format :: Format -> M.Map String String -> String format f vars = concatMap expand f where expand (Const s) = s - expand (Var name padding) = justify padding $ - fromMaybe "" $ M.lookup name vars + expand (Var name padding) = justify padding $ getvar name + getvar name + | "escaped_" `isPrefixOf` name = + -- escape whitespace too + replace " " (e_asc ' ') $ + replace "\t" (e_asc '\t') $ + encode_c $ + getvar' $ drop (length "escaped_") name + | otherwise = getvar' name + getvar' name = fromMaybe "" $ M.lookup name vars justify p s | p > 0 = take (p - length s) spaces ++ s | p < 0 = s ++ take (-1 * (length s + p)) spaces @@ -73,9 +83,9 @@ gen = filter (not . empty) . fuse [] . scan [] . 
decode_c invar f var [] = Const (novar var) : f invar f var (c:cs) | c == '}' = foundvar f var 0 cs - | isAlphaNum c = invar f (c:var) cs + | isAlphaNum c || c == '_' = invar f (c:var) cs | c == ';' = inpad "" f var cs - | otherwise = scan ((Const $ reverse $ novar $ c:var):f) cs + | otherwise = scan ((Const $ novar $ c:var):f) cs inpad p f var (c:cs) | c == '}' = foundvar f var (readpad $ reverse p) cs @@ -127,7 +137,7 @@ decode_c s = unescape ("", s) echar a = a handle n = ("", n) -{- Should not need to use this, except for testing decode_c. -} +{- Inverse of decode_c. -} encode_c :: FormatString -> FormatString encode_c s = concatMap echar s where @@ -141,17 +151,23 @@ encode_c s = concatMap echar s echar '\v' = e 'v' echar '\\' = e '\\' echar '"' = e '"' - echar x - | ord x < 0x20 = e_num x -- low ascii - | ord x >= 256 = e_utf x - | ord x > 0x7E = e_num x -- high ascii - | otherwise = [x] -- printable ascii - where - showoctal i = '\\' : printf "%03o" i - e_num c = showoctal $ ord c - -- unicode character is decomposed to - -- Word8s and each is shown in octal - e_utf c = showoctal =<< (Codec.Binary.UTF8.String.encode [c] :: [Word8]) + echar c + | ord c < 0x20 = e_asc c -- low ascii + | ord c >= 256 = e_utf c + | ord c > 0x7E = e_asc c -- high ascii + | otherwise = [c] -- printable ascii + +-- unicode character is decomposed to individual Word8s, +-- and each is shown in octal +e_utf :: Char -> String +e_utf c = showoctal . toInteger =<< + (Codec.Binary.UTF8.String.encode [c] :: [Word8]) + +e_asc :: Char -> String +e_asc c = showoctal $ toInteger $ ord c + +showoctal :: Integer -> String +showoctal i = '\\' : printf "%03o" i {- for quickcheck -} prop_idempotent_deencode :: String -> Bool diff --git a/doc/git-annex.mdwn b/doc/git-annex.mdwn index 7ad3fac69e..2d0d2597ea 100644 --- a/doc/git-annex.mdwn +++ b/doc/git-annex.mdwn @@ -437,8 +437,10 @@ subdirectories). Specifies a custom output format. 
The value is a format string, in which '${var}' is expanded to the value of a variable. To right-justify a variable with whitespace, use '${var;width}' ; to left-justify - a variable, use '${var;-width}'. Also, '\\n' is a newline, '\\000' is a NULL, - etc. + a variable, use '${var;-width}'; to escape unusual characters in a variable, + use '${escaped_var}'. + + Also, '\\n' is a newline, '\\000' is a NULL, etc. * -c name=value diff --git a/doc/tips/finding_duplicate_files.mdwn b/doc/tips/finding_duplicate_files.mdwn new file mode 100644 index 0000000000..94fc85400e --- /dev/null +++ b/doc/tips/finding_duplicate_files.mdwn @@ -0,0 +1,21 @@ +Maybe you had a lot of files scattered around on different drives, and you +added them all into a single git-annex repository. Some of the files are +surely duplicates of others. + +While git-annex stores the file contents efficiently, it would still +help in cleaning up this mess if you could find, and perhaps remove, +the duplicate files. + +Here's a command line that will show duplicate sets of files grouped together: + + git annex find --include '*' --format='${file} ${escaped_key}\n' | \ + sort -k2 | uniq --all-repeated=separate -f1 | \ + sed 's/ [^ ]*$//' + +Here's a command line that will remove one of each duplicate set of files: + + git annex find --include '*' --format='${file} ${escaped_key}\n' | \ + sort -k2 | uniq --repeated -f1 | sed 's/ [^ ]*$//' | \ + xargs -d '\n' git rm + +--[[Joey]] diff --git a/doc/todo/wishlist:_Provide_a___34__git_annex__34___command_that_will_skip_duplicates.mdwn b/doc/todo/wishlist:_Provide_a___34__git_annex__34___command_that_will_skip_duplicates.mdwn index ca18afc578..9336535788 100644 --- a/doc/todo/wishlist:_Provide_a___34__git_annex__34___command_that_will_skip_duplicates.mdwn +++ b/doc/todo/wishlist:_Provide_a___34__git_annex__34___command_that_will_skip_duplicates.mdwn @@ -25,4 +25,4 @@ I want this because I have copies of various of mine (photos, in particular) sca (As I write this, I 
realize it's possible to parse the destination of the symlink in a way that does this..) -> +> [[done]]; see [[tips/finding_duplicate_files]] --[[Joey]]