add escape_var hack
Makes it easy to find files with duplicate contents, anyway.. :)
This commit is contained in:
parent
13a0c292b3
commit
7227dd8f21
4 changed files with 58 additions and 19 deletions
|
@ -18,6 +18,8 @@ import Text.Printf (printf)
|
||||||
import Data.Char (isAlphaNum, isOctDigit, chr, ord)
|
import Data.Char (isAlphaNum, isOctDigit, chr, ord)
|
||||||
import Data.Maybe (fromMaybe)
|
import Data.Maybe (fromMaybe)
|
||||||
import Data.Word (Word8)
|
import Data.Word (Word8)
|
||||||
|
import Data.List (isPrefixOf)
|
||||||
|
import Data.String.Utils (replace)
|
||||||
import qualified Codec.Binary.UTF8.String
|
import qualified Codec.Binary.UTF8.String
|
||||||
import qualified Data.Map as M
|
import qualified Data.Map as M
|
||||||
|
|
||||||
|
@ -42,8 +44,16 @@ format :: Format -> M.Map String String -> String
|
||||||
format f vars = concatMap expand f
|
format f vars = concatMap expand f
|
||||||
where
|
where
|
||||||
expand (Const s) = s
|
expand (Const s) = s
|
||||||
expand (Var name padding) = justify padding $
|
expand (Var name padding) = justify padding $ getvar name
|
||||||
fromMaybe "" $ M.lookup name vars
|
getvar name
|
||||||
|
| "escaped_" `isPrefixOf` name =
|
||||||
|
-- escape whitespace too
|
||||||
|
replace " " (e_asc ' ') $
|
||||||
|
replace "\t" (e_asc '\t') $
|
||||||
|
encode_c $
|
||||||
|
getvar' $ drop (length "escaped_") name
|
||||||
|
| otherwise = getvar' name
|
||||||
|
getvar' name = fromMaybe "" $ M.lookup name vars
|
||||||
justify p s
|
justify p s
|
||||||
| p > 0 = take (p - length s) spaces ++ s
|
| p > 0 = take (p - length s) spaces ++ s
|
||||||
| p < 0 = s ++ take (-1 * (length s + p)) spaces
|
| p < 0 = s ++ take (-1 * (length s + p)) spaces
|
||||||
|
@ -73,9 +83,9 @@ gen = filter (not . empty) . fuse [] . scan [] . decode_c
|
||||||
invar f var [] = Const (novar var) : f
|
invar f var [] = Const (novar var) : f
|
||||||
invar f var (c:cs)
|
invar f var (c:cs)
|
||||||
| c == '}' = foundvar f var 0 cs
|
| c == '}' = foundvar f var 0 cs
|
||||||
| isAlphaNum c = invar f (c:var) cs
|
| isAlphaNum c || c == '_' = invar f (c:var) cs
|
||||||
| c == ';' = inpad "" f var cs
|
| c == ';' = inpad "" f var cs
|
||||||
| otherwise = scan ((Const $ reverse $ novar $ c:var):f) cs
|
| otherwise = scan ((Const $ novar $ c:var):f) cs
|
||||||
|
|
||||||
inpad p f var (c:cs)
|
inpad p f var (c:cs)
|
||||||
| c == '}' = foundvar f var (readpad $ reverse p) cs
|
| c == '}' = foundvar f var (readpad $ reverse p) cs
|
||||||
|
@ -127,7 +137,7 @@ decode_c s = unescape ("", s)
|
||||||
echar a = a
|
echar a = a
|
||||||
handle n = ("", n)
|
handle n = ("", n)
|
||||||
|
|
||||||
{- Should not need to use this, except for testing decode_c. -}
|
{- Inverse of decode_c. -}
|
||||||
encode_c :: FormatString -> FormatString
|
encode_c :: FormatString -> FormatString
|
||||||
encode_c s = concatMap echar s
|
encode_c s = concatMap echar s
|
||||||
where
|
where
|
||||||
|
@ -141,17 +151,23 @@ encode_c s = concatMap echar s
|
||||||
echar '\v' = e 'v'
|
echar '\v' = e 'v'
|
||||||
echar '\\' = e '\\'
|
echar '\\' = e '\\'
|
||||||
echar '"' = e '"'
|
echar '"' = e '"'
|
||||||
echar x
|
echar c
|
||||||
| ord x < 0x20 = e_num x -- low ascii
|
| ord c < 0x20 = e_asc c -- low ascii
|
||||||
| ord x >= 256 = e_utf x
|
| ord c >= 256 = e_utf c
|
||||||
| ord x > 0x7E = e_num x -- high ascii
|
| ord c > 0x7E = e_asc c -- high ascii
|
||||||
| otherwise = [x] -- printable ascii
|
| otherwise = [c] -- printable ascii
|
||||||
where
|
|
||||||
showoctal i = '\\' : printf "%03o" i
|
-- unicode character is decomposed to individual Word8s,
|
||||||
e_num c = showoctal $ ord c
|
-- and each is shown in octal
|
||||||
-- unicode character is decomposed to
|
e_utf :: Char -> String
|
||||||
-- Word8s and each is shown in octal
|
e_utf c = showoctal . toInteger =<<
|
||||||
e_utf c = showoctal =<< (Codec.Binary.UTF8.String.encode [c] :: [Word8])
|
(Codec.Binary.UTF8.String.encode [c] :: [Word8])
|
||||||
|
|
||||||
|
e_asc :: Char -> String
|
||||||
|
e_asc c = showoctal $ toInteger $ ord c
|
||||||
|
|
||||||
|
showoctal :: Integer -> String
|
||||||
|
showoctal i = '\\' : printf "%03o" i
|
||||||
|
|
||||||
{- for quickcheck -}
|
{- for quickcheck -}
|
||||||
prop_idempotent_deencode :: String -> Bool
|
prop_idempotent_deencode :: String -> Bool
|
||||||
|
|
|
@ -437,8 +437,10 @@ subdirectories).
|
||||||
Specifies a custom output format. The value is a format string,
|
Specifies a custom output format. The value is a format string,
|
||||||
in which '${var}' is expanded to the value of a variable. To right-justify
|
in which '${var}' is expanded to the value of a variable. To right-justify
|
||||||
a variable with whitespace, use '${var;width}' ; to left-justify
|
a variable with whitespace, use '${var;width}' ; to left-justify
|
||||||
a variable, use '${var;-width}'. Also, '\\n' is a newline, '\\000' is a NULL,
|
a variable, use '${var;-width}'; to escape unusual characters in a variable,
|
||||||
etc.
|
use '${escaped_var}'.
|
||||||
|
|
||||||
|
Also, '\\n' is a newline, '\\000' is a NUL, etc.
|
||||||
|
|
||||||
* -c name=value
|
* -c name=value
|
||||||
|
|
||||||
|
|
21
doc/tips/finding_duplicate_files.mdwn
Normal file
21
doc/tips/finding_duplicate_files.mdwn
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
Maybe you had a lot of files scattered around on different drives, and you
|
||||||
|
added them all into a single git-annex repository. Some of the files are
|
||||||
|
surely duplicates of others.
|
||||||
|
|
||||||
|
While git-annex stores the file contents efficiently, it would still
|
||||||
|
help in cleaning up this mess if you could find, and perhaps remove,
|
||||||
|
the duplicate files.
|
||||||
|
|
||||||
|
Here's a command line that will show duplicate sets of files grouped together:
|
||||||
|
|
||||||
|
git annex find --include '*' --format='${file} ${escaped_key}\n' | \
|
||||||
|
sort -k2 | uniq --all-repeated=separate -f1 | \
|
||||||
|
sed 's/ [^ ]*$//'
|
||||||
|
|
||||||
|
Here's a command line that will remove one of each duplicate set of files:
|
||||||
|
|
||||||
|
git annex find --include '*' --format='${file} ${escaped_key}\n' | \
|
||||||
|
sort -k2 | uniq --repeated -f1 | sed 's/ [^ ]*$//' | \
|
||||||
|
xargs -d '\n' git rm
|
||||||
|
|
||||||
|
--[[Joey]]
|
|
@ -25,4 +25,4 @@ I want this because I have copies of various of mine (photos, in particular) sca
|
||||||
|
|
||||||
(As I write this, I realize it's possible to parse the destination of the symlink in a way that does this..)
|
(As I write this, I realize it's possible to parse the destination of the symlink in a way that does this..)
|
||||||
|
|
||||||
>
|
> [[done]]; see [[tips/finding_duplicate_files]] --[[Joey]]
|
||||||
|
|
Loading…
Reference in a new issue