Merge branch 'master' into v8

Joey Hess 2020-01-01 14:26:43 -04:00
commit 2cea674d1e
44 changed files with 665 additions and 140 deletions

View file

@ -1,6 +1,6 @@
{- git cat-file interface, with handle automatically stored in the Annex monad
-
- Copyright 2011-2018 Joey Hess <id@joeyh.name>
- Copyright 2011-2019 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -16,6 +16,7 @@ module Annex.CatFile (
catObjectMetaData,
catFileStop,
catKey,
catKey',
catSymLinkTarget,
catKeyFile,
catKeyFileHEAD,
@ -54,7 +55,7 @@ catObject ref = do
h <- catFileHandle
liftIO $ Git.CatFile.catObject h ref
catObjectMetaData :: Git.Ref -> Annex (Maybe (Integer, ObjectType))
catObjectMetaData :: Git.Ref -> Annex (Maybe (Sha, Integer, ObjectType))
catObjectMetaData ref = do
h <- catFileHandle
liftIO $ Git.CatFile.catObjectMetaData h ref
@ -99,14 +100,16 @@ catFileStop = do
{- From ref to a symlink or a pointer file, get the key. -}
catKey :: Ref -> Annex (Maybe Key)
catKey ref = go =<< catObjectMetaData ref
where
go (Just (sz, _))
catKey ref = catKey' ref =<< catObjectMetaData ref
catKey' :: Ref -> Maybe (Sha, Integer, ObjectType) -> Annex (Maybe Key)
catKey' ref (Just (_, sz, _))
-- Avoid catting large files, that cannot be symlinks or
-- pointer files, which would require buffering their
-- content in memory, as well as a lot of IO.
| sz <= maxPointerSz = parseLinkTargetOrPointer . L.toStrict <$> catObject ref
go _ = return Nothing
| sz <= maxPointerSz =
parseLinkTargetOrPointer . L.toStrict <$> catObject ref
catKey' _ _ = return Nothing
{- Gets a symlink target. -}
catSymLinkTarget :: Sha -> Annex RawFilePath
@ -151,7 +154,7 @@ catKeyFileHEAD f = catKey $ Git.Ref.fileFromRef Git.Ref.headRef f
catKeyFileHidden :: RawFilePath -> CurrBranch -> Annex (Maybe Key)
catKeyFileHidden = hiddenCat catKey
catObjectMetaDataHidden :: RawFilePath -> CurrBranch -> Annex (Maybe (Integer, ObjectType))
catObjectMetaDataHidden :: RawFilePath -> CurrBranch -> Annex (Maybe (Sha, Integer, ObjectType))
catObjectMetaDataHidden = hiddenCat catObjectMetaData
hiddenCat :: (Ref -> Annex (Maybe a)) -> RawFilePath -> CurrBranch -> Annex (Maybe a)
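The point of exporting catKey' is that a caller which already holds the object's metadata can reuse it instead of paying for a second cat-file round trip. A minimal sketch of the calling pattern under the new signatures above (keyAndSize is a hypothetical helper, not part of this commit):

```haskell
-- Hypothetical caller: a single metadata query feeds both the size
-- check inside catKey' and the caller's own use of the size.
keyAndSize :: Git.Ref -> Annex (Maybe (Key, Integer))
keyAndSize ref = do
	indexmeta <- catObjectMetaData ref
	mkey <- catKey' ref indexmeta
	return $ case (mkey, indexmeta) of
		(Just k, Just (_, sz, _)) -> Just (k, sz)
		_ -> Nothing
```

The smudge clean filter change below uses exactly this shape, passing the metadata on to shouldAnnex as well.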

View file

@ -21,7 +21,7 @@ hashObjectHandle :: Annex Git.HashObject.HashObjectHandle
hashObjectHandle = maybe startup return =<< Annex.getState Annex.hashobjecthandle
where
startup = do
h <- inRepo $ Git.HashObject.hashObjectStart
h <- inRepo $ Git.HashObject.hashObjectStart True
Annex.changeState $ \s -> s { Annex.hashobjecthandle = Just h }
return h

View file

@ -125,7 +125,7 @@ makeinfos updated version = do
now <- liftIO getCurrentTime
liftIO $ putStrLn $ "building info files"
forM_ updated $ \(f, bv) -> do
v <- lookupFile f
v <- lookupFile (toRawFilePath f)
case v of
Nothing -> noop
Just k -> whenM (inAnnex k) $ do
@ -153,6 +153,7 @@ makeinfos updated version = do
, Param "move"
, Param "--to"
, Param "website"
, Param "--force"
]
void $ inRepo $ runBool
[ Param "annex"

View file

@ -25,7 +25,15 @@ git-annex (8.20191107) UNRELEASED; urgency=medium
-- Joey Hess <id@joeyh.name> Tue, 29 Oct 2019 15:13:03 -0400
git-annex (7.20191219) UNRELEASED; urgency=medium
git-annex (7.20191231) UNRELEASED; urgency=medium
* add: --force-large/--force-small options make it easier to override
annex.largefiles configuration (and potentially safer as it avoids
bugs like the smudge bug fixed in the last release).
-- Joey Hess <id@joeyh.name> Wed, 01 Jan 2020 12:51:40 -0400
git-annex (7.20191230) upstream; urgency=medium
* Optimised processing of many files, especially by commands like find
and whereis that only report on the state of the repository. Commands
@ -46,8 +54,12 @@ git-annex (7.20191219) UNRELEASED; urgency=medium
setting, except for in the special case of annex.securehashesonly.
* Improve file ordering behavior when one parameter is "." and other
parameters are other directories.
* smudge bugfix: When annex.largefiles=anything, files that were already
stored in git, and have not been modified could sometimes be converted
to being stored in the annex. Changes in 7.20191024 made this more
of a problem. This case is now detected and prevented.
-- Joey Hess <id@joeyh.name> Wed, 18 Dec 2019 15:12:40 -0400
-- Joey Hess <id@joeyh.name> Mon, 30 Dec 2019 12:43:30 -0400
git-annex (7.20191218) upstream; urgency=medium

View file

@ -1,6 +1,6 @@
{- git-annex command
-
- Copyright 2010-2019 Joey Hess <id@joeyh.name>
- Copyright 2010-2020 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -17,9 +17,13 @@ import qualified Database.Keys
import Annex.FileMatcher
import Annex.Link
import Annex.Tmp
import Annex.HashObject
import Messages.Progress
import Git.Types
import Git.FilePath
import Config.GitConfig
import qualified Git.UpdateIndex
import Utility.FileMode
import qualified Utility.RawFilePath as R
cmd :: Command
@ -32,6 +36,7 @@ data AddOptions = AddOptions
{ addThese :: CmdParams
, batchOption :: BatchMode
, updateOnly :: Bool
, largeFilesOverride :: Maybe Bool
}
optParser :: CmdParamsDesc -> Parser AddOptions
@ -43,13 +48,24 @@ optParser desc = AddOptions
<> short 'u'
<> help "only update tracked files"
)
<*> (parseforcelarge <|> parseforcesmall)
where
parseforcelarge = flag Nothing (Just True)
( long "force-large"
<> help "add all files to annex, ignoring other configuration"
)
parseforcesmall = flag Nothing (Just False)
( long "force-small"
<> help "add all files to git, ignoring other configuration"
)
seek :: AddOptions -> CommandSeek
seek o = startConcurrency commandStages $ do
largematcher <- largeFilesMatcher
addunlockedmatcher <- addUnlockedMatcher
annexdotfiles <- getGitConfigVal annexDotFiles
let gofile file =
let gofile file = case largeFilesOverride o of
Nothing ->
let file' = fromRawFilePath file
in ifM (pure (annexdotfiles || not (dotfile file')) <&&> (checkFileMatcher largematcher file' <||> Annex.getState Annex.force))
( start file addunlockedmatcher
@ -58,6 +74,8 @@ seek o = startConcurrency commandStages $ do
, stop
)
)
Just True -> start file addunlockedmatcher
Just False -> startSmallOverridden file
case batchOption o of
Batch fmt
| updateOnly o ->
@ -81,6 +99,29 @@ addSmall file = do
showNote "non-large file; adding content to git repository"
addFile file
startSmallOverridden :: RawFilePath -> CommandStart
startSmallOverridden file = starting "add" (ActionItemWorkTreeFile file) $
next $ addSmallOverridden file
addSmallOverridden :: RawFilePath -> Annex Bool
addSmallOverridden file = do
showNote "adding content to git repository"
let file' = fromRawFilePath file
s <- liftIO $ getFileStatus file'
if isSymbolicLink s
then addFile file
else do
-- Can't use addFile because the clean filter will
-- honor annex.largefiles and it has been overridden.
-- Instead, hash the file and add to the index.
sha <- hashFile file'
let ty = if isExecutable (fileMode s)
then TreeExecutable
else TreeFile
Annex.Queue.addUpdateIndex =<<
inRepo (Git.UpdateIndex.stageFile sha ty file')
return True
addFile :: RawFilePath -> Annex Bool
addFile file = do
ps <- forceParams

View file

@ -18,7 +18,10 @@ import Logs.Location
import qualified Database.Keys
import qualified Git.BuildVersion
import Git.FilePath
import Git.Types
import Git.HashObject
import qualified Git
import qualified Git.Ref
import qualified Annex
import Backend
import Utility.Metered
@ -89,11 +92,14 @@ clean file = do
Just k -> do
getMoveRaceRecovery k (toRawFilePath file)
liftIO $ L.hPut stdout b
Nothing -> go b =<< catKeyFile (toRawFilePath file)
Nothing -> do
let fileref = Git.Ref.fileRef (toRawFilePath file)
indexmeta <- catObjectMetaData fileref
go b indexmeta =<< catKey' fileref indexmeta
)
stop
where
go b oldkey = ifM (shouldAnnex file oldkey)
go b indexmeta oldkey = ifM (shouldAnnex file indexmeta oldkey)
( do
-- Before git 2.5, failing to consume all stdin here
-- would cause a SIGPIPE and crash it.
@ -154,17 +160,17 @@ clean file = do
-- added to the annex, so will be added to git. But some heuristics
-- are used to avoid bad behavior:
--
-- If the index already contains the file, preserve its annexed/not annexed
-- state. This prevents accidental conversions.
-- If the file is annexed in the index, keep it annexed.
-- This prevents accidental conversions.
--
-- Otherwise, when the file's inode is the same as one that was used for
-- annexed content before, annex it. This handles cases such as renaming an
-- unlocked annexed file followed by git add, which the user naturally
-- expects to behave the same as git mv.
shouldAnnex :: FilePath -> Maybe Key -> Annex Bool
shouldAnnex file moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig)
( checkmatcher checkheuristics
, checkheuristics
shouldAnnex :: FilePath -> Maybe (Sha, FileSize, ObjectType) -> Maybe Key -> Annex Bool
shouldAnnex file indexmeta moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig)
( checkunchangedgitfile $ checkmatcher checkheuristics
, checkunchangedgitfile checkheuristics
)
where
checkmatcher d
@ -186,6 +192,31 @@ shouldAnnex file moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig)
Nothing -> pure False
Just ic -> Database.Keys.isInodeKnown ic =<< sentinalStatus
-- This checks for a case where the file had been added to git
-- previously, not to the annex before, and its content is not
-- changed, but git is running the clean filter again on it
-- (eg because its mtime or inode changed, or just because git feels
-- like it). Such a file should not be added to the annex, even if
-- annex.largefiles now matches it, because the content is not
-- changed.
checkunchangedgitfile cont = case (moldkey, indexmeta) of
(Nothing, Just (sha, sz, _)) -> liftIO (catchMaybeIO (getFileSize file)) >>= \case
Just sz' | sz' == sz -> do
-- The size is the same, so the file
-- is not much larger than what was stored
-- in git before, so it won't be out of
-- line to hash it. However, the content
-- is prevented from being stored in git
-- when hashing.
h <- inRepo $ hashObjectStart False
sha' <- liftIO $ hashFile h file
liftIO $ hashObjectStop h
if sha' == sha
then return False
else cont
_ -> cont
_ -> cont
emitPointer :: Key -> IO ()
emitPointer = S.putStr . formatPointer

View file

@ -1,6 +1,6 @@
{- git cat-file interface
-
- Copyright 2011-2018 Joey Hess <id@joeyh.name>
- Copyright 2011-2019 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -33,6 +33,7 @@ import Text.Read
import Common
import Git
import Git.Sha
import qualified Git.Ref
import Git.Command
import Git.Types
import Git.FilePath
@ -109,22 +110,23 @@ catObjectDetails h object = query (catFileProcess h) object newlinefallback $ \f
return (Just (content, sha, objtype))
{- Gets the size and type of an object, without reading its content. -}
catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Integer, ObjectType))
catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Sha, FileSize, ObjectType))
catObjectMetaData h object = query (checkFileProcess h) object newlinefallback $ \from -> do
resp <- hGetLine from
case parseResp object resp of
Just (ParsedResp _ size objtype) ->
return $ Just (size, objtype)
Just (ParsedResp sha size objtype) ->
return $ Just (sha, size, objtype)
Just DNE -> return Nothing
Nothing -> error $ "unknown response from git cat-file " ++ show (resp, object)
where
-- Slow fallback path for filenames containing newlines.
newlinefallback = do
sha <- Git.Ref.sha object (gitRepo h)
sz <- querySize object (gitRepo h)
objtype <- queryObjectType object (gitRepo h)
return $ (,) <$> sz <*> objtype
return $ (,,) <$> sha <*> sz <*> objtype
data ParsedResp = ParsedResp Sha Integer ObjectType | DNE
data ParsedResp = ParsedResp Sha FileSize ObjectType | DNE
query :: CoProcess.CoProcessHandle -> Ref -> IO a -> (Handle -> IO a) -> IO a
query hdl object newlinefallback receive
@ -180,7 +182,7 @@ querySingle o r repo reader = assertLocal repo $
, return Nothing
)
querySize :: Ref -> Repo -> IO (Maybe Integer)
querySize :: Ref -> Repo -> IO (Maybe FileSize)
querySize r repo = maybe Nothing (readMaybe . takeWhile (/= '\n'))
<$> querySingle (Param "-s") r repo hGetContentsStrict

View file

@ -46,8 +46,8 @@ prop_encode_decode_roundtrip s = s' ==
-- "\343\200\271".
--
-- This property papers over the problem, by only
-- testing chars < 256.
nohigh = filter (\c -> ord c < 256)
-- testing ascii
nohigh = filter isAscii
-- A String can contain a NUL, but toRawFilePath
-- truncates on the NUL, which is generally fine
-- because unix filenames cannot contain NUL.

View file

@ -23,12 +23,12 @@ import Data.ByteString.Builder
type HashObjectHandle = CoProcess.CoProcessHandle
hashObjectStart :: Repo -> IO HashObjectHandle
hashObjectStart = gitCoProcessStart True
[ Param "hash-object"
, Param "-w"
, Param "--stdin-paths"
, Param "--no-filters"
hashObjectStart :: Bool -> Repo -> IO HashObjectHandle
hashObjectStart writeobject = gitCoProcessStart True $ catMaybes
[ Just (Param "hash-object")
, if writeobject then Just (Param "-w") else Nothing
, Just (Param "--stdin-paths")
, Just (Param "--no-filters")
]
hashObjectStop :: HashObjectHandle -> IO ()

View file

@ -117,5 +117,5 @@ parseRemoteLocation s repo = ret $ calcloc s
-- git on Windows will write a path to .git/config with "drive:",
-- which is not to be confused with a "host:"
dosstyle = hasDrive
dospath = fromInternalGitPath
dospath = fromRawFilePath . fromInternalGitPath . toRawFilePath
#endif

View file

@ -31,7 +31,7 @@ import Git.FilePath
-}
merge :: Ref -> Ref -> Repo -> IO ()
merge x y repo = do
hashhandle <- hashObjectStart repo
hashhandle <- hashObjectStart True repo
ch <- catFileStart repo
streamUpdateIndex repo
[ lsTree x repo

Key.hs
View file

@ -81,10 +81,9 @@ instance Arbitrary KeyData where
-- AssociatedFile cannot be empty, and cannot contain a NUL
-- (but can be Nothing)
instance Arbitrary AssociatedFile where
arbitrary = (AssociatedFile . fmap mk <$> arbitrary)
arbitrary = (AssociatedFile . fmap toRawFilePath <$> arbitrary)
`suchThat` (/= AssociatedFile (Just S.empty))
where
mk = toRawFilePath . filter (/= '\NUL')
`suchThat` (\(AssociatedFile f) -> maybe True (S.notElem 0) f)
instance Arbitrary Key where
arbitrary = mkKey . const <$> arbitrary

View file

@ -253,7 +253,7 @@ osxapp:
sleep 60; \
fi \
fi \
done
done; if [ $$ok = 0 ]; then exit 1; fi
# Bypass cabal, and only run the main ghc --make command for a
# faster development build.

View file

@ -8,7 +8,9 @@
{-# LANGUAGE CPP #-}
module Utility.Daemon (
#ifndef mingw32_HOST_OS
daemonize,
#endif
foreground,
checkDaemon,
stopDaemon,

View file

@ -43,7 +43,6 @@ import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.UTF8 as S8
import qualified Data.ByteString.Lazy.UTF8 as L8
#endif
import System.FilePath.ByteString (RawFilePath, encodeFilePath, decodeFilePath)
import Utility.Exception
import Utility.Split
@ -172,11 +171,21 @@ encodeBL' = L.pack . decodeW8
encodeBL' = L8.fromString
#endif
fromRawFilePath :: RawFilePath -> FilePath
fromRawFilePath = decodeFilePath
{- Recent versions of the unix package have this alias; defined here
- for backwards compatibility. -}
type RawFilePath = S.ByteString
{- Note that the RawFilePath is assumed to never contain NUL,
- since filenames don't. This should only be used with actual
- RawFilePaths not arbitrary ByteString that may contain NUL. -}
fromRawFilePath :: RawFilePath -> FilePath
fromRawFilePath = decodeBS'
{- Note that the FilePath is assumed to never contain NUL,
- since filenames don't. This should only be used with actual FilePaths
- not arbitrary String that may contain NUL. -}
toRawFilePath :: FilePath -> RawFilePath
toRawFilePath = encodeFilePath
toRawFilePath = encodeBS'
{- Converts a [Word8] to a FilePath, encoding using the filesystem encoding.
-

View file

@ -15,7 +15,7 @@ module Utility.Format (
) where
import Text.Printf (printf)
import Data.Char (isAlphaNum, isOctDigit, isHexDigit, isSpace, chr, ord)
import Data.Char (isAlphaNum, isOctDigit, isHexDigit, isSpace, chr, ord, isAscii)
import Data.Maybe (fromMaybe)
import Data.Word (Word8)
import Data.List (isPrefixOf)
@ -176,12 +176,12 @@ encode_c' p = concatMap echar
{- For quickcheck.
-
- Encoding and then decoding roundtrips only when
- the string does not contain high unicode, because eg,
- both "\12345" and "\227\128\185" are encoded to "\343\200\271".
- the string is ascii because eg, both "\12345" and
- "\227\128\185" are encoded to "\343\200\271".
-
- This property papers over the problem, by only testing chars < 256.
- This property papers over the problem, by only testing ascii.
-}
prop_encode_c_decode_c_roundtrip :: String -> Bool
prop_encode_c_decode_c_roundtrip s = s' == decode_c (encode_c s')
where
s' = filter (\c -> ord c < 256) s
s' = filter isAscii s

View file

@ -31,6 +31,7 @@ doesPathExist = fileExist
#else
import qualified Data.ByteString as B
import System.PosixCompat (FileStatus)
import qualified System.PosixCompat as P
import qualified System.Directory as D
import Utility.FileSystemEncoding

debian/control
View file

@ -78,6 +78,7 @@ Build-Depends:
libghc-magic-dev,
libghc-socks-dev,
libghc-vector-dev,
libghc-filepath-bytestring-dev,
lsof [linux-any],
ikiwiki,
libimage-magick-perl,

View file

@ -75,3 +75,5 @@ Thanks for having a look.
[[!meta author=kyle]]
[[!tag projects/datalad]]
> [[fixed|done]] --[[Joey]]

View file

@ -0,0 +1,42 @@
[[!comment format=mdwn
username="joey"
subject="""comment 2"""
date="2019-12-27T06:22:23Z"
content="""
On second thought, making the clean filter check for non-annexed files
would prevent use cases like annex.largefiles=largerthan(100kb)
from working as the user intended and letting a small file start out
non-annexed and get annexed once it gets too large. Users certainly rely on
that, and this bug, which only affects an edge case, does not justify breaking
it.
What would work is to make the clean filter detect when a file's content
has not changed, though its mtime (or inode) has changed. In that case,
it's reasonable for the clean filter to ignore annex.largefiles and keep
the content represented in git however it already was (non-annexed or
annexed).
To detect that, in the case where the file in the index is not annexed:
First check if the file size is the same as the
size in the index. If it is, run git hash-object on the file, and see if
the sha1 is the same as in the index. This avoids hashing any unusually
large files, so the clean filter only gets a bit slower.
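A standalone sketch of that check for the non-annexed case, shelling out to `git hash-object` rather than using git-annex's coprocess handle (unchangedInGit is a hypothetical name; the real implementation is the checkunchangedgitfile change in the diff above):

```haskell
import System.Process (readProcess)
import System.Posix.Files (getFileStatus, fileSize)

-- Is the work tree file byte-identical to what the index records
-- for it? Compare sizes first so unusually large files are never
-- hashed; only hash when the sizes match.
unchangedInGit :: FilePath -> String -> Integer -> IO Bool
unchangedInGit file indexsha indexsize = do
	sz <- fromIntegral . fileSize <$> getFileStatus file
	if sz /= indexsize
		then return False  -- size differs, so content changed
		else do
			-- --no-filters hashes the file content as-is;
			-- nothing is written to the object store since
			-- -w is omitted.
			out <- readProcess "git" ["hash-object", "--no-filters", file] ""
			return (takeWhile (/= '\n') out == indexsha)
```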
And when the file in the index is annexed, check if the file size is the
same as the size of the annexed key. If it is, verify that the file content
matches the key (typically by hashing). Cases where keys lack a size or
don't use a checksum could lead to false positives or negatives though.
Although, I've not managed to find a version of this bug that makes an
annexed file get converted to git unintentionally, so maybe this part does
not need to be done?
----
Or.. Since the root of the problem is temporarily overriding annex.largefiles,
it could just be documented that it's not a good idea to use
-c annex.largefiles=anything/nothing, because such broad overrides
can affect other files than the ones you intended.
(And since the documented methods of converting files from annexed to git and
git to annexed use such overrides, that documentation would need to be
changed.)
"""]]

View file

@ -0,0 +1,16 @@
[[!comment format=mdwn
username="joey"
subject="""comment 3"""
date="2019-12-27T17:11:42Z"
content="""
A variant of this where an annexed unlocked file is added first,
then the file is touched, and then some other file is added
with -c annex.largefiles=nothing does result in the clean filter sending
the whole annexed file content back to git, rather than keeping it annexed.
For whatever reason, git does not store that content in .git/objects or
update the index for that file though, so it doesn't show up as a change.
So *apparently* that variant is only potentially an expensive cat of a
large annexed file, and does not need to be dealt with. Unless git
sometimes behaves otherwise.
"""]]

View file

@ -0,0 +1,45 @@
[[!comment format=mdwn
username="joey"
subject="""comment 4"""
date="2019-12-27T18:41:12Z"
content="""
It's almost possible to get the same unwanted conversion without any git
races:
echo content-git > file-git
sleep 2
git add file-git
git commit -m add
echo foo > file-git
echo content-annex > file-annex
git -c annex.largefiles=anything annex add file-annex
In this case, git currently does not run the modified file-git through the
clean filter in the last line, so the annex.largefiles=anything doesn't
affect it.
But, as far as I can see, there's nothing preventing a future version
of git from deciding it does want to run file-git through the clean filter
in this case.
I am not going to try to prevent such a thing from happening.
As far as I can see, anything that the clean filter can possibly do to
avoid such a situation will cripple existing use cases of
annex.largefiles, like largerthan() as mentioned above.
The user has told git-annex to annex "anything", and if git
decides to run the clean filter while that is in effect, caveat emptor.
Which is not to say I'm not going to fix the specific case this bug was
filed about. I actually have a fix developed now. But just to say that
setting annex.largefiles=anything/nothing temporarily is a blunt instrument,
and you risk accidental conversion when using it, and so it would be a good
idea to not do that.
One idea: Make `git-annex add --annex` and `git-annex add --git`
add a specific file to annex or git, bypassing annex.largefiles and all
other configuration and state. This could also be used to easily switch
a file from one storage to the other. I'd hope the existence of that
would prevent one-off setting of annex.largefiles=anything/nothing.
[[todo/git_annex_add_option_to_control_to_where]]
"""]]

View file

@ -0,0 +1,58 @@
[[!comment format=mdwn
username="kyle"
avatar="http://cdn.libravatar.org/avatar/7d6e85cde1422ad60607c87fa87c63f3"
subject="comment 5"
date="2019-12-28T21:06:46Z"
content="""
Thanks for the explanation and the fix.
> For whatever reason, git becomes confused about whether this file is
> modified. I seem to recall that git distrusts information it recorded in
> its own index if the mtime of the index file is too close to the
> mtime recorded inside it, or something like that.
I see. I think the problem and associated workaround you're referring
to is described in git's Documentation/technical/racy-git.txt.
> Note that, you can accomplish the same thing without setting
> annex.largefiles, assuming a current version of git-annex:
>
> git add file-git
> git annex add file-annex
>
> I think the only reason for setting annex.largefiles in either of the two
> places you did is if there's a default value that you want to
> temporarily override?
Right. DataLad's methods that are responsible for calling out to `git
annex add` have a `git={None,False,True}` parameter. By default
(`None`), DataLad just calls `git annex add ...` and lets any
configuration in the repo control whether the file goes to git or is
annexed. But with `git=True` or `git=False`, the `annex add` call
includes a `-c annex.largefiles=` argument with a value of `nothing`
or `anything`, respectively.
> But just to say that setting annex.largefiles=anything/nothing
> temporarily is a blunt instrument, and you risk accidental
> conversion when using it, and so it would be a good idea to not do
> that.
Noted. As mentioned above, DataLad's default behavior is to honor the
repo's `annex.largefiles` configuration. And the documentation for
`datalad save`, DataLad's main user-facing entry point for `annex
add`, recommends that the user configure .gitattributes rather than
using the option that leads to calling `annex add` with `-c
annex.largefiles=nothing`.
> One idea: Make `git-annex add --annex` and `git-annex add --git`
> add a specific file to annex or git, bypassing annex.largefiles and all
> other configuration and state. This could also be used to easily switch
> a file from one storage to the other. I'd hope the existence of that
> would prevent one-off setting of annex.largefiles=anything/nothing.
As far as I can see, those flags would completely cover DataLad's
one-off setting of `annex.largefiles=anything/nothing`. They map
directly to DataLad's `git=False/True` option described above. So,
from DataLad's perspective, they'd be very useful and welcome.
"""]]

View file

@ -0,0 +1,34 @@
[[!comment format=mdwn
username="sirio@84e81889437b3f6208201a26e428197c6045c337"
nickname="sirio"
avatar="http://cdn.libravatar.org/avatar/9f3a0cfaf4825081710b652cc0b438a4"
subject="Duplicate 'gcrypt-id' may be the issue?"
date="2019-12-29T22:10:26Z"
content="""
Had a repo exhibit this behavior just now:
- commit graph `XX -> YY`
- host `A` @ commit `YY`
- host `B` @ commit `XX` (1 behind)
- remotes `hub` and `lab` both @ commit `XX`
- `B` pushes and pulls from both `hub` and `lab`: OK
- `A` pushes to `hub` (updates to commit `YY`): OK
- `B` pulls from `hub`: FAIL with *Packfile does not match digest*
- `B` pulls from `lab`: OK
- `B` pushes to `hub`: FAIL with *Packfile does not match digest*
- `A` pulls from `hub`: OK
- `A` pulls from `lab`: OK
When looking in `.git/config` I noticed that `remote.hub.gcrypt-id` and `remote.lab.gcrypt-id` were identical.
To fix, I:
- removed `remote.hub.gcrypt-id` from `.git/config` on both `A` and `B`
- deleted and re-created a blank repo on `hub`
- `git push hub` on `B`
- `git pull hub master` on `A`
This resulted in a new and unique value for `remote.hub.gcrypt-id`, which is the same on both `A` and `B`.
Have not had time to dig into why, but this is the only thread I can find about this problem, so I figured I would log this somewhere for posterity.
"""]]

View file

@ -0,0 +1,97 @@
### Please describe the problem.
I have the following repos
a - group manual - all content currently originates on this repo (OS X 10.14.4)
b - group backup - this is a rclone special backed by google drive
c - this is the underlying git repo on gitlab.com
d - group backup - a server that is supposed to backup everything (OS X 10.14.4)
Assistant is running on a and d
It is not guaranteed that a and d will be able to directly connect; however, they both have very good connectivity to b and c.
When I add a set of files into a (using git-annex add) the non-annex files get checked into the git repo and pushed to c. Similarly, the content (annex files) gets pushed to b. This is confirmed by git-annex list --allrepos.
Within an hour or so, d will know about the files that were added (git-annex list) and the git log shows that it is on the same commit as a and c.
However, the assistant on d never does the git-annex sync --content
If I manually run git-annex sync --content on d, all is updated as expected.
I've made no changes to the groupwants, group, etc. settings
### What steps will reproduce the problem?
create a repo with a central git upstream and a special via rclone on gdrive. Clone that repo in another machine that can also see the upstream and special, but isn't directly connected to the originator of the repo
Add annex-handled files to the original repo.
Check the status of the git upstream, special, and the clone.
After failure is acknowledged, run git annex sync --content to confirm that the mechanics still work
### What version of git-annex are you using? On what operating system?
Both hosts are OSX 10.14.4 and are running 7.20191218
### Please provide any additional information below.
This is from the assistant on the clone. It is running in debug mode.
[[!format sh """
[2019-12-30 17:44:09.362492] main: starting assistant version 7.20191114
[2019-12-30 17:44:14.532638] TransferScanner: Syncing with origin
(scanning...) [2019-12-30 17:44:14.590159] Watcher: Performing startup scan
ControlSocket .git/annex/ssh/git@gitlab already exists, disabling multiplexing
Disallowed command
Everything up-to-date
Disallowed command
Disallowed command
Disallowed command
fatal: Pathspec 'workflow/cc-archive-exif/LICENSE' is in submodule 'workflow/cc-archive-exif'
fatal: Pathspec 'workflow/cc-archive-exif/LICENSE' is in submodule 'workflow/cc-archive-exif'
fatal: Pathspec 'workflow/cc-archive-exif/LICENSE' is in submodule 'workflow/cc-archive-exif'
fatal: Pathspec 'workflow/cc-archive-exif/LICENSE' is in submodule 'workflow/cc-archive-exif'
fatal: Pathspec 'workflow/cc-archive-exif/LICENSE' is in submodule 'workflow/cc-archive-exif'
fatal: Pathspec 'workflow/cc-archive-exif/LICENSE' is in submodule 'workflow/cc-archive-exif'
fatal: Pathspec 'workflow/cc-archive-exif/LICENSE' is in submodule 'workflow/cc-archive-exif'
fatal: Pathspec 'workflow/cc-archive-exif/LICENSE' is in submodule 'workflow/cc-archive-exif'
fatal: Pathspec 'workflow/cc-archive-exif/LICENSE' is in submodule 'workflow/cc-archive-exif'
fatal: Pathspec 'workflow/cc-archive-exif/LICENSE' is in submodule 'workflow/cc-archive-exif'
fatal: Pathspec 'workflow/cc-archive-exif/LICENSE' is in submodule 'workflow/cc-archive-exif'
git cat-file EOF: user error
fd:38: hFlush: resource vanished (Broken pipe)
fd:38: hFlush: resource vanished (Broken pipe)
Disallowed command
(started...)
[2019-12-30 17:44:33.097035] Committer: Committing changes to git
(recording state in git...)
[2019-12-30 17:44:33.176213] Pusher: Syncing with origin
Everything up-to-date
Disallowed command
<<A bunch of white space lines removed for brevity>>
Disallowed command
Disallowed command
Disallowed command
Disallowed command
Disallowed command
# End of transcript or log.
"""]]
### Have you had any luck using git-annex before? (Sometimes we get tired of reading bug reports all day and a lil' positive end note does wonders)
Yes - I can run this manually, and overall this is great - I would just love to get this automated....

View file

@ -20,3 +20,5 @@ If strict matching (not sure yet about a use case where it would really be neede
[[!meta author=yoh]]
[[!tag projects/datalad]]
> Looks like we're agreed this is not necessary, so [[done]] --[[Joey]]

View file

@ -0,0 +1,30 @@
### Please describe the problem.
enable-tor on an OSX box (with magic-wormhole and tor installed via brew) fails miserably.
### What steps will reproduce the problem?
run git-annex enable-tor - multiple fails, see details.
### What version of git-annex are you using? On what operating system?
7.20191106
OSX 10.14.5
### Please provide any additional information below.
The first failure is that enable-tor can't run as root. Instead, I call it with sudo git-annex enable-tor <UID>
The second failure is that you try and write into /etc/tor/torrc - which is not where torrc is located on a brew installed tor - it's in /usr/local/etc/tor/torrc. I made a symlink to get around that problem.
The third failure is a complaint about systemctl not being present. I looked in Utility/Tor.hs and saw you were trying to call for a reload of tor. To hack around that, I wrote a script called systemctl that simply called 'brew services' with the args passed in ( brew services $1 $2 ).
After that, I still get the error: git-annex: tor failed to create hidden service, perhaps the tor service is not running
I have restarted tor manually, and it is indeed running. It looks like something is failing in setting up the Onion socket, but I can't see what
### Have you had any luck using git-annex before? (Sometimes we get tired of reading bug reports all day and a lil' positive end note does wonders)
I love it - using it to protect my photo archive now using a central special repo (rclone) for the data, and a gitlab repo for the base.

View file

@ -0,0 +1,41 @@
I am interested in using or extending git-annex to manage storage redundancy more efficiently than [[git-annex-numcopies]] allows. In particular, I wish to be able to tolerate the loss of a repository when I don't have enough disk space for `numcopies 2` -- when my disk capacity is less than double the data size.
For example, suppose I wish to store N = 1000 GB on k = 10 servers, each with 150 GB storage (so total storage is 1500 GB, which is less than the 2000 GB that would be required for `numcopies 2`). For simplicity here, we'll presume that our data is in 1000 1GB files.
We'll set the goal of being able to lose r = 3 servers without losing any data (which would reduce our total storage capacity to 7 * 150 GB = 1050 GB).
This can be done by thinking of our files in groups of seven (k - r) and using parchive2 or similar to create 3 (r) parity files for each set:
```
Parity group 000: D000 D001 D002 D003 D004 D005 D006 P000.0 P000.1 P000.2
Parity group 001: D007 D008 D009 D010 D011 D012 D013 P001.0 P001.1 P001.2
...
Parity group 141: D987 D988 D989 D990 D991 D992 D993 P141.0 P141.1 P141.2
Parity group 142: D994 D995 D996 D997 D998 D999 P142.0 P142.1 P142.2
```
All files are still 1G -- we have used 1429 GB of storage, far less than the 4000 GB that would be required to achieve the same durability with `numcopies 4`.
The trouble is that we now need to ensure that each parity group's files are scattered across all the repositories. D001 must go in a different repository than D000. D002 must go in a different repository than the first two, etc. git-annex's [[required_content]] mechanism only allows allocating files to repositories based on properties of that file, not on the presence or absence of other files.
Generalizing to differently-sized files, differently-sized repositories, parity group sizes other than k, k changing over time, per-file r values, and trying to minimize churn, it seems to me that the simplest way to do this would be to run a solver that finds a 'best' allocation plan and then writes a metadata tag on every file with its intended home. Each repository can then have a simple [[required_content]] rule with its own name and [[--auto|walkthrough/automatically_managing_content]] or the assistant can move the files around as necessary. I'm imagining running the solver periodically, from a cron job or in the assistant. The other moving part is launching `par2` to create parity files and do recovery, which happens after the solver has commanded that all the relevant inputs be brought together on one machine and after `get --auto` invocations or the assistant have done the transfers.
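For the simple case sketched above (equal-sized files, equal-sized repositories, groups no larger than the number of repositories), the solver's output could be as little as a round-robin with a per-group offset. A hedged Haskell sketch (placeGroups is a hypothetical name, not an existing git-annex function):

```haskell
-- Assign each member of each parity group to a distinct repository.
-- Offsetting by the group index spreads load evenly; since a group
-- has at most (length repos) members, no repository holds two files
-- of one group, so losing r repositories destroys at most r members
-- per group, which the r parity files cover.
placeGroups :: [String] -> [[FilePath]] -> [(FilePath, String)]
placeGroups repos groups =
	[ (f, repos !! ((gi + fi) `mod` length repos))
	| (gi, grp) <- zip [0 ..] groups
	, (fi, f) <- zip [0 ..] grp
	]
```

Group 000 above has ten members (seven data plus three parity), so with k = 10 repositories each group maps onto all ten with no repeats. A real solver would weight by file and repository size instead.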
User interface points:
1. User specifies how much disk space the solver should use in each repository.
2. User specifies the desired durability of each file. Probably with match expressions?
3. User declares a repository lost with [[git-annex-dead]]. The solver will notice many degraded parity groups on the next run & emit a plan to start bringing together parity groups' files for `par2 r` runs. Ideally, running `git-annex-dead` would stop any existing solver run & start a fresh solver run immediately.
4. Report to user parity placement health (are all the files where they should be? Is recovery finished? About how much I/O needs to be done to finish?)
5. Report to user the available storage space for additional content at each durability level (this is non-trivial given differently-sized repositories)
6. Report to user the storage efficiency of the current arrangement, how much storage efficiency could be improved at various I/O costs, and allow the user to select one (eg: re-grouping old and recently-written files into new parity groups would allow closer file-size matching, allowing smaller parity files for the smaller file set.)
7. User specifies parity group size?? This is a trade-off between storage efficiency (larger groups → more efficient) and high-durability storage capacity when repository sizes differ (smaller groups AND repository size diversity → more high-durability storage capacity). It's possible that there's some principled way to save the user the trouble of having to understand the mechanics of parity groups & choose a parity group size, but I'm unclear on how that would work, so my initial plan is to have the user specify this by the same mechanism that they specify desired durability. A good UI could support this choice by showing what the effect of each option would be on storage capacity at each durability level.
See also
* [[todo/Wishlist__58___Parity_files_on_all_files]]
* [[todo/wishlist__58___Parity_files_for_encrypted_remotes]]
* [[design/assistant/partial_content]]
* [[forum/sparse_git_checkouts_with_annex]]

View file

@ -0,0 +1,8 @@
Suppose there are three repos: A, B and transfer; transfer is a webdav remote.
1. I add file-1 to A, do a sync content, file-1 is sent to transfer.
2. I add file-2 to A, do a sync content, I expect only file-2 to be sent to transfer, but file-1 is sent another time.
How can I avoid that?
Do other special remotes, like the rsync special remote, have this issue?

View file

@ -36,6 +36,17 @@ annexed content, and other symlinks.
Add gitignored files.
* `--force-large`
Treat all files as large files, ignoring annex.largefiles and annex.dotfiles
configuration, and add to the annex.
* `--force-small`
Treat all files as small files, ignoring annex.largefiles and annex.dotfiles
configuration, and add to git, also ignoring annex.addsmallfiles
configuration.
* `--backend`
Specifies which key-value backend to use.

View file

@ -1,12 +0,0 @@
[[!comment format=mdwn
username="rrnewton@63c9faa1997c908b1dc04dfdca33c809660cd158"
nickname="rrnewton"
avatar="http://cdn.libravatar.org/avatar/638acc3e55c2bb09aa0dcca5b5c8acb6"
subject="Flag to force same behavior as annex.largefiles attribute?"
date="2018-05-21T05:29:06Z"
content="""
When in [direct mode](https://git-annex.branchable.com/direct_mode), the \"add the non-large file directly to the git repository\" behavior described above is very useful, because the option of typing simply `git add foo` does not exist as it does in [indirect mode](https://git-annex.branchable.com/git-annex-indirect/).
However, I can't see any combination of flags that trigger this behavior. I suppose it can be accomplished by temporarily setting [annex.largefiles](https://git-annex.branchable.com/tips/largefiles/) to a huge value before executing `git annex add` (i.e. creating a `.gitattributes` and then deleting it). I think I'll try that as a work-around, but it would be great to have a flag that accomplishes this.
"""]]

View file

@ -1,12 +0,0 @@
[[!comment format=mdwn
username="joey"
subject="""comment 2"""
date="2018-05-21T16:36:51Z"
content="""
@rrnewton I know people do commonly accomplish this
by something like `git -c annex.largefiles='exclude(*)' annex add`
A shorter way to write that would only be useful for direct mode,
so I'm inclined not to add it, but open a todo item if you want to discuss
that.
"""]]

View file

@ -1,14 +0,0 @@
[[!comment format=mdwn
username="rrnewton@63c9faa1997c908b1dc04dfdca33c809660cd158"
nickname="rrnewton"
avatar="http://cdn.libravatar.org/avatar/638acc3e55c2bb09aa0dcca5b5c8acb6"
subject="Sounds great!"
date="2018-05-21T18:09:35Z"
content="""
That's fabulous. A Bash alias around that command is really all I need when working in direct mode. (And the archive's too damn big to switch back and forth between direct/indirect.)
I was just too much a newb with git attributes to know it could be done that way. For discoverability, maybe that command could be placed in an \"examples\" section in the primary documentation above?
"""]]

View file

@ -1,8 +0,0 @@
[[!comment format=mdwn
username="timeless-ventricle"
avatar="http://cdn.libravatar.org/avatar/0b220fa4c0b59e883f360979ee745d63"
subject="comment 4"
date="2019-01-06T12:24:49Z"
content="""
@joey I'm obviously missing something here, why would a shorter way to write that only be useful for direct mode? I don't understand what the connection is between direct mode and wanting to specify whether this is a \"regular git\" file or an annexed file (except that direct mode is not supported in v7)? I thought it was considered supported to have a mix of both large binary files and text files? Even if some text files are large, I think I want to add them as files whose content is tracked by git, so I think I want to choose 'by hand' -- is that not really supported / considered a bad idea for some reason?
"""]]

View file

@ -1,10 +0,0 @@
[[!comment format=mdwn
username="joey"
subject="""comment 5"""
date="2019-01-22T21:10:37Z"
content="""
Because "git add foo" does not work in direct mode.
This is really not the place to be having a conversation about this. If you
want something changed in git-annex, open a bug report or todo item.
"""]]

View file

@ -1,16 +0,0 @@
[[!comment format=mdwn
username="johnmario.itec19@69a7b742534851b36216e0f951f1a00dbb9067cd"
nickname="johnmario.itec19"
avatar="http://cdn.libravatar.org/avatar/2f07ffce1656bdcd6aa19aaab7517975"
subject="commenting on git-annex-add"
date="2019-09-02T06:21:27Z"
content="""
Yes you can do that. Simplest way is to git add the files you want to directly be in the git repo (e.g. the source code) and git annex add the large files.
You can then check in any changes to the source code files (or anything else you added with git add) to github as normal.
You can manage the storage and versioning of the large files using git annex commands. Git annex supports using AWS S3 and/or glacier for backing up the files. It can also back them up to a server you control over ssh or to an external drive (or any combination of the above). http://git-annex.branchable.com/special_remotes/
With the latest version of git annex, you can also set up automatic filters that decide which types/sizes of files to check in directly to git vs which ones to store as links in the annex. https://git-annex.branchable.com/tips/largefiles/
"""]]

View file

@ -0,0 +1,25 @@
git-annex 7.20191230 released with [[!toggle text="these changes"]]
[[!toggleable text="""
* Optimised processing of many files, especially by commands like find
and whereis that only report on the state of the repository. Commands
like get also sped up in cases where they have to check a lot of
files but only transfer a few files. Speedups range from 30-100%.
* Added build dependency on the filepath-bytestring library.
* Fixed an oversight that had always prevented annex.resolvemerge
from being honored, when it was configured by git-annex config.
* annex.largefiles can be configured by git-annex config,
to more easily set a default that will also be used by clones,
without needing to shoehorn the expression into the gitattributes file.
The git config and gitattributes override that.
* annex.addunlocked can be set to an expression with the same format used by
annex.largefiles, when you want to default to unlocking some files but
not others.
* annex.addunlocked can be configured by git-annex config.
* git-annex-config --set/--unset: No longer change the local git config
setting, except for in the special case of annex.securehashesonly.
* Improve file ordering behavior when one parameter is "." and other
parameters are other directories.
* smudge bugfix: When annex.largefiles=anything, files that were already
stored in git, and have not been modified could sometimes be converted
to being stored in the annex. Changes in 7.20191024 made this more
of a problem. This case is now detected and prevented."""]]

View file

@ -89,7 +89,7 @@ If you've set up an annex.largefiles configuration but want to force a file to
be stored in the annex, you can temporarily override the configuration like
this:
git annex add -c annex.largefiles=anything smallfile
git annex add --force-large smallfile
## converting git to annexed
@ -97,7 +97,7 @@ When you have a file that is currently stored in git, and you want to
convert that to be stored in the annex, here's how to accomplish that:
git rm --cached file
git annex add -c annex.largefiles=anything file
git annex add --force-large file
git commit file
This first removes the file from git's index cache, and then adds it back
@ -111,7 +111,7 @@ convert that to be stored in git, here's how to accomplish that:
git annex unlock file
git rm --cached file
git -c annex.largefiles=nothing add file
git annex add --force-small file
git commit file
You can modify the file after unlocking it and before adding it to

View file

@ -0,0 +1,41 @@
[[!comment format=mdwn
username="joey"
subject="""comment 4"""
date="2019-12-30T18:15:40Z"
content="""
When -J is used, recent versions use a separate pool of worker threads for
the checksumming, distinct from the downloading pool. So even with -J1 the
checksum of the previous download will not block the next download.
I've thought about making this the default without -J.. It relies on
concurrent-output working well, which it sometimes may not, eg when
filenames are not valid unicode, or perhaps on a non-ANSI terminal, and so
far it's been worth not defaulting to -J1 to avoid breaking in such edge
cases.
Anyway, it seems to me using -J should avoid most of the overhead, except
of course for the remaining checksumming after all downloads finish.
Incremental checksumming could be done for some of the built-in remotes,
but not others like bittorrent which write out of order. Some transfers
can resume, and the checksumming would have to somehow catch
up to the resume point, which adds significant complexity.
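For the in-order case the mechanics are straightforward; a minimal sketch of hashing while writing, assuming the cryptonite package (sinkWithChecksum is a hypothetical name, not git-annex's actual verification code):

```haskell
import Crypto.Hash (Context, Digest, SHA256, hashInit, hashUpdate, hashFinalize)
import qualified Data.ByteString as B
import System.IO

-- Hash a transfer incrementally while writing it to disk, so no
-- separate checksum pass is needed after the download finishes.
-- Only valid for remotes that deliver content in order.
sinkWithChecksum :: Handle -> FilePath -> IO (Digest SHA256)
sinkWithChecksum from dest = withFile dest WriteMode (go hashInit)
  where
	go ctx out = do
		chunk <- B.hGetSome from 65536
		if B.null chunk
			then return (hashFinalize ctx)
			else do
				B.hPut out chunk
				go (hashUpdate ctx chunk) out
```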
External remotes would need to send the content over a pipe for incremental
checksumming, so it would need a protocol extension.
git-annex's remote API does have the concept that a remote can sufficiently
verify the content of a file during transfer that additional checksumming
is not necessary. Currently only used for git remotes when hard linking an
object from a sibling remote. I don't think it actually matters what
checksum a remote uses to do such verification, as long as it's
cryptographically secure and runs on the local machine.
A protocol extension that let an external remote communicate to git-annex
that it had done such verification at the end of transfer is worth thinking
about.
Re Ilya's security concerns, as long as the external remote runs the
verification on the local machine, it seems there is no added security
impact.
"""]]

View file

@ -0,0 +1,8 @@
[[!comment format=mdwn
username="Ilya_Shlyakhter"
avatar="http://cdn.libravatar.org/avatar/1647044369aa7747829c38b9dcc84df0"
subject="named pipes and external remotes"
date="2019-12-31T01:38:03Z"
content="""
\"External remotes would need to send the content over a pipe for incremental checksumming, so it would need a protocol extension\" -- in the current protocol, if you pass to the TRANSFER request, as the FILE parameter, a named pipe, would something break?
"""]]

View file

@ -0,0 +1,16 @@
Make `git-annex add --force-large` and `git-annex add --force-small`
add a specific file to annex or git, bypassing annex.largefiles
and all other configuration and state.
One reason to want this is that it avoids users doing stuff like this:
git -c annex.largefiles=anything annex add foo.c
Such a temporary setting of annex.largefiles can be problematic, as explored in
<https://git-annex.branchable.com/bugs/A_case_where_file_tracked_by_git_unexpectedly_becomes_annex_pointer_file/>
This could also be used to easily switch a file from one storage to
the other. I suppose the file would have to be touched first to make git-annex
add process it?
> [[done]] --[[Joey]]

View file

@ -0,0 +1,18 @@
[[!comment format=mdwn
username="joey"
subject="""comment 5"""
date="2019-12-26T20:34:02Z"
content="""
I've implemented annex.dotfiles in the `v8` branch.
I did not tie it to annex.largefiles in the end, spwhitton is right.
`git-annex add` behavior around dotfiles did change in a potentially
surprising way: since dotfiles are assumed to be non-large, they get added
to git. Users who have dotfiles that are large (or dotdirs containing large
files) will need to configure annex.largefiles and annex.dotfiles to avoid
those files being added to git. But, I don't think that will affect many
users, and it avoided a lot of complexity. At least such users can use this
and other semi-recent git-annex configs to avoid `git add` adding their
large dotfiles directly to git.
"""]]

View file

@ -1,5 +1,5 @@
Name: git-annex
Version: 8.20191219
Version: 8.20200101
Cabal-Version: >= 1.8
License: AGPL-3
Maintainer: Joey Hess <id@joeyh.name>
@ -296,7 +296,7 @@ source-repository head
custom-setup
Setup-Depends: base (>= 4.11.1.0), hslogger, split, unix-compat, process,
filepath, exceptions, bytestring, directory, IfElse, data-default,
filepath-bytestring (>= 1.4.2.1.1),
filepath-bytestring (>= 1.4.2.1.0),
utf8-string, transformers, Cabal
Executable git-annex
@ -321,7 +321,7 @@ Executable git-annex
directory (>= 1.2),
disk-free-space,
filepath,
filepath-bytestring (>= 1.4.2.1.1),
filepath-bytestring (>= 1.4.2.1.0),
IfElse,
hslogger,
monad-logger,

View file

@ -23,6 +23,7 @@ extra-deps:
- torrent-10000.1.1
- sandi-0.5
- http-client-0.5.14
- filepath-bytestring-1.4.2.1.1
explicit-setup-deps:
git-annex: true
resolver: lts-12.14