Merge branch 'master' into v8

Joey Hess 2020-01-01 14:26:43 -04:00
commit 2cea674d1e
44 changed files with 665 additions and 140 deletions

View file

@ -1,6 +1,6 @@
{- git cat-file interface, with handle automatically stored in the Annex monad
-
- Copyright 2011-2018 Joey Hess <id@joeyh.name>
- Copyright 2011-2019 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -16,6 +16,7 @@ module Annex.CatFile (
catObjectMetaData,
catFileStop,
catKey,
catKey',
catSymLinkTarget,
catKeyFile,
catKeyFileHEAD,
@ -54,7 +55,7 @@ catObject ref = do
h <- catFileHandle
liftIO $ Git.CatFile.catObject h ref
catObjectMetaData :: Git.Ref -> Annex (Maybe (Integer, ObjectType))
catObjectMetaData :: Git.Ref -> Annex (Maybe (Sha, Integer, ObjectType))
catObjectMetaData ref = do
h <- catFileHandle
liftIO $ Git.CatFile.catObjectMetaData h ref
@ -99,14 +100,16 @@ catFileStop = do
{- From ref to a symlink or a pointer file, get the key. -}
catKey :: Ref -> Annex (Maybe Key)
catKey ref = go =<< catObjectMetaData ref
where
go (Just (sz, _))
catKey ref = catKey' ref =<< catObjectMetaData ref
catKey' :: Ref -> Maybe (Sha, Integer, ObjectType) -> Annex (Maybe Key)
catKey' ref (Just (_, sz, _))
-- Avoid catting large files, that cannot be symlinks or
-- pointer files, which would require buffering their
-- content in memory, as well as a lot of IO.
| sz <= maxPointerSz = parseLinkTargetOrPointer . L.toStrict <$> catObject ref
go _ = return Nothing
| sz <= maxPointerSz =
parseLinkTargetOrPointer . L.toStrict <$> catObject ref
catKey' _ _ = return Nothing
{- Gets a symlink target. -}
catSymLinkTarget :: Sha -> Annex RawFilePath
@ -151,7 +154,7 @@ catKeyFileHEAD f = catKey $ Git.Ref.fileFromRef Git.Ref.headRef f
catKeyFileHidden :: RawFilePath -> CurrBranch -> Annex (Maybe Key)
catKeyFileHidden = hiddenCat catKey
catObjectMetaDataHidden :: RawFilePath -> CurrBranch -> Annex (Maybe (Integer, ObjectType))
catObjectMetaDataHidden :: RawFilePath -> CurrBranch -> Annex (Maybe (Sha, Integer, ObjectType))
catObjectMetaDataHidden = hiddenCat catObjectMetaData
hiddenCat :: (Ref -> Annex (Maybe a)) -> RawFilePath -> CurrBranch -> Annex (Maybe a)
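The point of exporting catKey' is that a caller which already holds the object's metadata can reuse it instead of paying for a second cat-file round trip. A minimal sketch of the calling pattern under the new signatures above (keyAndSize is a hypothetical helper, not part of this commit):

```haskell
-- Hypothetical caller: a single metadata query feeds both the size
-- check inside catKey' and the caller's own use of the size.
keyAndSize :: Git.Ref -> Annex (Maybe (Key, Integer))
keyAndSize ref = do
	indexmeta <- catObjectMetaData ref
	mkey <- catKey' ref indexmeta
	return $ case (mkey, indexmeta) of
		(Just k, Just (_, sz, _)) -> Just (k, sz)
		_ -> Nothing
```

The smudge clean filter change below uses exactly this shape, passing the metadata on to shouldAnnex as well.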

View file

@ -21,7 +21,7 @@ hashObjectHandle :: Annex Git.HashObject.HashObjectHandle
hashObjectHandle = maybe startup return =<< Annex.getState Annex.hashobjecthandle
where
startup = do
h <- inRepo $ Git.HashObject.hashObjectStart
h <- inRepo $ Git.HashObject.hashObjectStart True
Annex.changeState $ \s -> s { Annex.hashobjecthandle = Just h }
return h

View file

@ -125,7 +125,7 @@ makeinfos updated version = do
now <- liftIO getCurrentTime
liftIO $ putStrLn $ "building info files"
forM_ updated $ \(f, bv) -> do
v <- lookupFile f
v <- lookupFile (toRawFilePath f)
case v of
Nothing -> noop
Just k -> whenM (inAnnex k) $ do
@ -153,6 +153,7 @@ makeinfos updated version = do
, Param "move"
, Param "--to"
, Param "website"
, Param "--force"
]
void $ inRepo $ runBool
[ Param "annex"

View file

@ -25,7 +25,15 @@ git-annex (8.20191107) UNRELEASED; urgency=medium
-- Joey Hess <id@joeyh.name> Tue, 29 Oct 2019 15:13:03 -0400
git-annex (7.20191219) UNRELEASED; urgency=medium
git-annex (7.20191231) UNRELEASED; urgency=medium
* add: --force-large/--force-small options make it easier to override
annex.largefiles configuration (and potentially safer as it avoids
bugs like the smudge bug fixed in the last release).
-- Joey Hess <id@joeyh.name> Wed, 01 Jan 2020 12:51:40 -0400
git-annex (7.20191230) upstream; urgency=medium
* Optimised processing of many files, especially by commands like find
and whereis that only report on the state of the repository. Commands
@ -46,8 +54,12 @@ git-annex (7.20191219) UNRELEASED; urgency=medium
setting, except for in the special case of annex.securehashesonly.
* Improve file ordering behavior when one parameter is "." and other
parameters are other directories.
* smudge bugfix: When annex.largefiles=anything, files that were already
stored in git, and have not been modified could sometimes be converted
to being stored in the annex. Changes in 7.20191024 made this more
of a problem. This case is now detected and prevented.
-- Joey Hess <id@joeyh.name> Wed, 18 Dec 2019 15:12:40 -0400
-- Joey Hess <id@joeyh.name> Mon, 30 Dec 2019 12:43:30 -0400
git-annex (7.20191218) upstream; urgency=medium

View file

@ -1,6 +1,6 @@
{- git-annex command
-
- Copyright 2010-2019 Joey Hess <id@joeyh.name>
- Copyright 2010-2020 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -17,9 +17,13 @@ import qualified Database.Keys
import Annex.FileMatcher
import Annex.Link
import Annex.Tmp
import Annex.HashObject
import Messages.Progress
import Git.Types
import Git.FilePath
import Config.GitConfig
import qualified Git.UpdateIndex
import Utility.FileMode
import qualified Utility.RawFilePath as R
cmd :: Command
@ -32,6 +36,7 @@ data AddOptions = AddOptions
{ addThese :: CmdParams
, batchOption :: BatchMode
, updateOnly :: Bool
, largeFilesOverride :: Maybe Bool
}
optParser :: CmdParamsDesc -> Parser AddOptions
@ -43,13 +48,24 @@ optParser desc = AddOptions
<> short 'u'
<> help "only update tracked files"
)
<*> (parseforcelarge <|> parseforcesmall)
where
parseforcelarge = flag Nothing (Just True)
( long "force-large"
<> help "add all files to annex, ignoring other configuration"
)
parseforcesmall = flag Nothing (Just False)
( long "force-small"
<> help "add all files to git, ignoring other configuration"
)
seek :: AddOptions -> CommandSeek
seek o = startConcurrency commandStages $ do
largematcher <- largeFilesMatcher
addunlockedmatcher <- addUnlockedMatcher
annexdotfiles <- getGitConfigVal annexDotFiles
let gofile file =
let gofile file = case largeFilesOverride o of
Nothing ->
let file' = fromRawFilePath file
in ifM (pure (annexdotfiles || not (dotfile file')) <&&> (checkFileMatcher largematcher file' <||> Annex.getState Annex.force))
( start file addunlockedmatcher
@ -58,6 +74,8 @@ seek o = startConcurrency commandStages $ do
, stop
)
)
Just True -> start file addunlockedmatcher
Just False -> startSmallOverridden file
case batchOption o of
Batch fmt
| updateOnly o ->
@ -81,6 +99,29 @@ addSmall file = do
showNote "non-large file; adding content to git repository"
addFile file
startSmallOverridden :: RawFilePath -> CommandStart
startSmallOverridden file = starting "add" (ActionItemWorkTreeFile file) $
next $ addSmallOverridden file
addSmallOverridden :: RawFilePath -> Annex Bool
addSmallOverridden file = do
showNote "adding content to git repository"
let file' = fromRawFilePath file
s <- liftIO $ getFileStatus file'
if isSymbolicLink s
then addFile file
else do
-- Can't use addFile because the clean filter will
-- honor annex.largefiles and it has been overridden.
-- Instead, hash the file and add to the index.
sha <- hashFile file'
let ty = if isExecutable (fileMode s)
then TreeExecutable
else TreeFile
Annex.Queue.addUpdateIndex =<<
inRepo (Git.UpdateIndex.stageFile sha ty file')
return True
addFile :: RawFilePath -> Annex Bool
addFile file = do
ps <- forceParams

View file

@ -18,7 +18,10 @@ import Logs.Location
import qualified Database.Keys
import qualified Git.BuildVersion
import Git.FilePath
import Git.Types
import Git.HashObject
import qualified Git
import qualified Git.Ref
import qualified Annex
import Backend
import Utility.Metered
@ -89,11 +92,14 @@ clean file = do
Just k -> do
getMoveRaceRecovery k (toRawFilePath file)
liftIO $ L.hPut stdout b
Nothing -> go b =<< catKeyFile (toRawFilePath file)
Nothing -> do
let fileref = Git.Ref.fileRef (toRawFilePath file)
indexmeta <- catObjectMetaData fileref
go b indexmeta =<< catKey' fileref indexmeta
)
stop
where
go b oldkey = ifM (shouldAnnex file oldkey)
go b indexmeta oldkey = ifM (shouldAnnex file indexmeta oldkey)
( do
-- Before git 2.5, failing to consume all stdin here
-- would cause a SIGPIPE and crash it.
@ -154,17 +160,17 @@ clean file = do
-- added to the annex, so will be added to git. But some heuristics
-- are used to avoid bad behavior:
--
-- If the index already contains the file, preserve its annexed/not annexed
-- state. This prevents accidental conversions.
-- If the file is annexed in the index, keep it annexed.
-- This prevents accidental conversions.
--
-- Otherwise, when the file's inode is the same as one that was used for
-- annexed content before, annex it. This handles cases such as renaming an
-- unlocked annexed file followed by git add, which the user naturally
-- expects to behave the same as git mv.
shouldAnnex :: FilePath -> Maybe Key -> Annex Bool
shouldAnnex file moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig)
( checkmatcher checkheuristics
, checkheuristics
shouldAnnex :: FilePath -> Maybe (Sha, FileSize, ObjectType) -> Maybe Key -> Annex Bool
shouldAnnex file indexmeta moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig)
( checkunchangedgitfile $ checkmatcher checkheuristics
, checkunchangedgitfile checkheuristics
)
where
checkmatcher d
@ -186,6 +192,31 @@ shouldAnnex file moldkey = ifM (annexGitAddToAnnex <$> Annex.getGitConfig)
Nothing -> pure False
Just ic -> Database.Keys.isInodeKnown ic =<< sentinalStatus
-- This checks for a case where the file had been added to git
-- previously, not to the annex before, and its content is not
-- changed, but git is running the clean filter again on it
-- (eg because its mtime or inode changed, or just because git feels
-- like it). Such a file should not be added to the annex, even if
-- annex.largefiles now matches it, because the content is not
-- changed.
checkunchangedgitfile cont = case (moldkey, indexmeta) of
(Nothing, Just (sha, sz, _)) -> liftIO (catchMaybeIO (getFileSize file)) >>= \case
Just sz' | sz' == sz -> do
-- The size is the same, so the file
-- is not much larger than what was stored
-- in git before, so it won't be out of
-- line to hash it. However, the content
-- is prevented from being stored in git
-- when hashing.
h <- inRepo $ hashObjectStart False
sha' <- liftIO $ hashFile h file
liftIO $ hashObjectStop h
if sha' == sha
then return False
else cont
_ -> cont
_ -> cont
emitPointer :: Key -> IO ()
emitPointer = S.putStr . formatPointer

View file

@ -1,6 +1,6 @@
{- git cat-file interface
-
- Copyright 2011-2018 Joey Hess <id@joeyh.name>
- Copyright 2011-2019 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
@ -33,6 +33,7 @@ import Text.Read
import Common
import Git
import Git.Sha
import qualified Git.Ref
import Git.Command
import Git.Types
import Git.FilePath
@ -109,22 +110,23 @@ catObjectDetails h object = query (catFileProcess h) object newlinefallback $ \f
return (Just (content, sha, objtype))
{- Gets the size and type of an object, without reading its content. -}
catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Integer, ObjectType))
catObjectMetaData :: CatFileHandle -> Ref -> IO (Maybe (Sha, FileSize, ObjectType))
catObjectMetaData h object = query (checkFileProcess h) object newlinefallback $ \from -> do
resp <- hGetLine from
case parseResp object resp of
Just (ParsedResp _ size objtype) ->
return $ Just (size, objtype)
Just (ParsedResp sha size objtype) ->
return $ Just (sha, size, objtype)
Just DNE -> return Nothing
Nothing -> error $ "unknown response from git cat-file " ++ show (resp, object)
where
-- Slow fallback path for filenames containing newlines.
newlinefallback = do
sha <- Git.Ref.sha object (gitRepo h)
sz <- querySize object (gitRepo h)
objtype <- queryObjectType object (gitRepo h)
return $ (,) <$> sz <*> objtype
return $ (,,) <$> sha <*> sz <*> objtype
data ParsedResp = ParsedResp Sha Integer ObjectType | DNE
data ParsedResp = ParsedResp Sha FileSize ObjectType | DNE
query :: CoProcess.CoProcessHandle -> Ref -> IO a -> (Handle -> IO a) -> IO a
query hdl object newlinefallback receive
@ -180,7 +182,7 @@ querySingle o r repo reader = assertLocal repo $
, return Nothing
)
querySize :: Ref -> Repo -> IO (Maybe Integer)
querySize :: Ref -> Repo -> IO (Maybe FileSize)
querySize r repo = maybe Nothing (readMaybe . takeWhile (/= '\n'))
<$> querySingle (Param "-s") r repo hGetContentsStrict

View file

@ -46,8 +46,8 @@ prop_encode_decode_roundtrip s = s' ==
-- "\343\200\271".
--
-- This property papers over the problem, by only
-- testing chars < 256.
nohigh = filter (\c -> ord c < 256)
-- testing ascii
nohigh = filter isAscii
-- A String can contain a NUL, but toRawFilePath
-- truncates on the NUL, which is generally fine
-- because unix filenames cannot contain NUL.

View file

@ -23,12 +23,12 @@ import Data.ByteString.Builder
type HashObjectHandle = CoProcess.CoProcessHandle
hashObjectStart :: Repo -> IO HashObjectHandle
hashObjectStart = gitCoProcessStart True
[ Param "hash-object"
, Param "-w"
, Param "--stdin-paths"
, Param "--no-filters"
hashObjectStart :: Bool -> Repo -> IO HashObjectHandle
hashObjectStart writeobject = gitCoProcessStart True $ catMaybes
[ Just (Param "hash-object")
, if writeobject then Just (Param "-w") else Nothing
, Just (Param "--stdin-paths")
, Just (Param "--no-filters")
]
hashObjectStop :: HashObjectHandle -> IO ()

View file

@ -117,5 +117,5 @@ parseRemoteLocation s repo = ret $ calcloc s
-- git on Windows will write a path to .git/config with "drive:",
-- which is not to be confused with a "host:"
dosstyle = hasDrive
dospath = fromInternalGitPath
dospath = fromRawFilePath . fromInternalGitPath . toRawFilePath
#endif

View file

@ -31,7 +31,7 @@ import Git.FilePath
-}
merge :: Ref -> Ref -> Repo -> IO ()
merge x y repo = do
hashhandle <- hashObjectStart repo
hashhandle <- hashObjectStart True repo
ch <- catFileStart repo
streamUpdateIndex repo
[ lsTree x repo

Key.hs
View file

@ -81,10 +81,9 @@ instance Arbitrary KeyData where
-- AssociatedFile cannot be empty, and cannot contain a NUL
-- (but can be Nothing)
instance Arbitrary AssociatedFile where
arbitrary = (AssociatedFile . fmap mk <$> arbitrary)
arbitrary = (AssociatedFile . fmap toRawFilePath <$> arbitrary)
`suchThat` (/= AssociatedFile (Just S.empty))
where
mk = toRawFilePath . filter (/= '\NUL')
`suchThat` (\(AssociatedFile f) -> maybe True (S.notElem 0) f)
instance Arbitrary Key where
arbitrary = mkKey . const <$> arbitrary

View file

@ -253,7 +253,7 @@ osxapp:
sleep 60; \
fi \
fi \
done
done; if [ $$ok = 0 ]; then exit 1; fi
# Bypass cabal, and only run the main ghc --make command for a
# faster development build.

View file

@ -8,7 +8,9 @@
{-# LANGUAGE CPP #-}
module Utility.Daemon (
#ifndef mingw32_HOST_OS
daemonize,
#endif
foreground,
checkDaemon,
stopDaemon,

View file

@ -43,7 +43,6 @@ import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.UTF8 as S8
import qualified Data.ByteString.Lazy.UTF8 as L8
#endif
import System.FilePath.ByteString (RawFilePath, encodeFilePath, decodeFilePath)
import Utility.Exception
import Utility.Split
@ -172,11 +171,21 @@ encodeBL' = L.pack . decodeW8
encodeBL' = L8.fromString
#endif
fromRawFilePath :: RawFilePath -> FilePath
fromRawFilePath = decodeFilePath
{- Recent versions of the unix package have this alias; defined here
- for backwards compatibility. -}
type RawFilePath = S.ByteString
{- Note that the RawFilePath is assumed to never contain NUL,
- since filenames don't. This should only be used with actual
- RawFilePaths not arbitrary ByteString that may contain NUL. -}
fromRawFilePath :: RawFilePath -> FilePath
fromRawFilePath = decodeBS'
{- Note that the FilePath is assumed to never contain NUL,
- since filenames don't. This should only be used with actual FilePaths
- not arbitrary String that may contain NUL. -}
toRawFilePath :: FilePath -> RawFilePath
toRawFilePath = encodeFilePath
toRawFilePath = encodeBS'
{- Converts a [Word8] to a FilePath, encoding using the filesystem encoding.
-

View file

@ -15,7 +15,7 @@ module Utility.Format (
) where
import Text.Printf (printf)
import Data.Char (isAlphaNum, isOctDigit, isHexDigit, isSpace, chr, ord)
import Data.Char (isAlphaNum, isOctDigit, isHexDigit, isSpace, chr, ord, isAscii)
import Data.Maybe (fromMaybe)
import Data.Word (Word8)
import Data.List (isPrefixOf)
@ -176,12 +176,12 @@ encode_c' p = concatMap echar
{- For quickcheck.
-
- Encoding and then decoding roundtrips only when
- the string does not contain high unicode, because eg,
- both "\12345" and "\227\128\185" are encoded to "\343\200\271".
- the string is ascii because eg, both "\12345" and
- "\227\128\185" are encoded to "\343\200\271".
-
- This property papers over the problem, by only testing chars < 256.
- This property papers over the problem, by only testing ascii.
-}
prop_encode_c_decode_c_roundtrip :: String -> Bool
prop_encode_c_decode_c_roundtrip s = s' == decode_c (encode_c s')
where
s' = filter (\c -> ord c < 256) s
s' = filter isAscii s

View file

@ -31,6 +31,7 @@ doesPathExist = fileExist
#else
import qualified Data.ByteString as B
import System.PosixCompat (FileStatus)
import qualified System.PosixCompat as P
import qualified System.Directory as D
import Utility.FileSystemEncoding

debian/control
View file

@ -78,6 +78,7 @@ Build-Depends:
libghc-magic-dev,
libghc-socks-dev,
libghc-vector-dev,
libghc-filepath-bytestring-dev,
lsof [linux-any],
ikiwiki,
libimage-magick-perl,

View file

@ -75,3 +75,5 @@ Thanks for having a look.
[[!meta author=kyle]]
[[!tag projects/datalad]]
> [[fixed|done]] --[[Joey]]

View file

@ -0,0 +1,42 @@
[[!comment format=mdwn
username="joey"
subject="""comment 2"""
date="2019-12-27T06:22:23Z"
content="""
On second thought, making the clean filter check for non-annexed files
would prevent use cases like annex.largefiles=largerthan(100kb)
from working as the user intended and letting a small file start out
non-annexed and get annexed once it gets too large. Users certainly rely on
that, and this bug, which only affects an edge case, does not justify breaking
it.
What would work is to make the clean filter detect when a file's content
has not changed, though its mtime (or inode) has changed. In that case,
it's reasonable for the clean filter to ignore annex.largefiles and keep
the content represented in git however it already was (non-annexed or
annexed).
To detect that, in the case where the file in the index is not annexed:
First check if the file size is the same as the
size in the index. If it is, run git hash-object on the file, and see if
the sha1 is the same as in the index. This avoids hashing any unusually
large files, so the clean filter only gets a bit slower.
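A standalone sketch of that check for the non-annexed case, shelling out to `git hash-object` rather than using git-annex's coprocess handle (unchangedInGit is a hypothetical name; the real implementation is the checkunchangedgitfile change in the diff above):

```haskell
import System.Process (readProcess)
import System.Posix.Files (getFileStatus, fileSize)

-- Is the work tree file byte-identical to what the index records
-- for it? Compare sizes first so unusually large files are never
-- hashed; only hash when the sizes match.
unchangedInGit :: FilePath -> String -> Integer -> IO Bool
unchangedInGit file indexsha indexsize = do
	sz <- fromIntegral . fileSize <$> getFileStatus file
	if sz /= indexsize
		then return False  -- size differs, so content changed
		else do
			-- --no-filters hashes the file content as-is;
			-- nothing is written to the object store since
			-- -w is omitted.
			out <- readProcess "git" ["hash-object", "--no-filters", file] ""
			return (takeWhile (/= '\n') out == indexsha)
```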
And when the file in the index is annexed, check if the file size is the
same as the size of the annexed key. If it is, verify that the file content
matches the key (typically by hashing). Cases where keys lack a size or
don't use a checksum could lead to false positives or negatives though.
Although, I've not managed to find a version of this bug that makes an
annexed file get converted to git unintentionally, so maybe this part does
not need to be done?
----
Or.. Since the root of the problem is temporarily overriding annex.largefiles,
it could just be documented that it's not a good idea to use
-c annex.largefiles=anything/nothing, because such broad overrides
can affect other files than the ones you intended.
(And since the documented methods of converting files from annexed to git and
git to annexed use such overrides, that documentation would need to be
changed.)
"""]]

View file

@ -0,0 +1,16 @@
[[!comment format=mdwn
username="joey"
subject="""comment 3"""
date="2019-12-27T17:11:42Z"
content="""
A variant of this where an annexed unlocked file is added first,
then the file is touched, and then some other file is added
with -c annex.largefiles=nothing does result in the clean filter sending
the whole annexed file content back to git, rather than keeping it annexed.
For whatever reason, git does not store that content in .git/objects or
update the index for that file though, so it doesn't show up as a change.
So *apparently* that variant is only potentially an expensive cat of a
large annexed file, and does not need to be dealt with. Unless git
sometimes behaves otherwise.
"""]]

View file

@ -0,0 +1,45 @@
[[!comment format=mdwn
username="joey"
subject="""comment 4"""
date="2019-12-27T18:41:12Z"
content="""
It's almost possible to get the same unwanted conversion without any git
races:
echo content-git > file-git
sleep 2
git add file-git
git commit -m add
echo foo > file-git
echo content-annex > file-annex
git -c annex.largefiles=anything annex add file-annex
In this case, git currently does not run the modified file-git through the
clean filter in the last line, so the annex.largefiles=anything doesn't
affect it.
But, as far as I can see, there's nothing preventing a future version
of git from deciding it does want to run file-git through the clean filter
in this case.
I am not going to try to prevent such a thing from happening.
As far as I can see, anything that the clean filter can possibly do to
avoid such a situation will cripple existing use cases of
annex.largefiles, like largerthan() as mentioned above.
The user has told git-annex to annex "anything", and if git
decides to run the clean filter while that is in effect, caveat emptor.
Which is not to say I'm not going to fix the specific case this bug was
filed about. I actually have a fix developed now. But just to say that
setting annex.largefiles=anything/nothing temporarily is a blunt instrument,
and you risk accidental conversion when using it, and so it would be a good
idea to not do that.
One idea: Make `git-annex add --annex` and `git-annex add --git`
add a specific file to annex or git, bypassing annex.largefiles and all
other configuration and state. This could also be used to easily switch
a file from one storage to the other. I'd hope the existence of that
would prevent one-off setting of annex.largefiles=anything/nothing.
[[todo/git_annex_add_option_to_control_to_where]]
"""]]

View file

@ -0,0 +1,58 @@
[[!comment format=mdwn
username="kyle"
avatar="http://cdn.libravatar.org/avatar/7d6e85cde1422ad60607c87fa87c63f3"
subject="comment 5"
date="2019-12-28T21:06:46Z"
content="""
Thanks for the explanation and the fix.
> For whatever reason, git becomes confused about whether this file is
> modified. I seem to recall that git distrusts information it recorded in
> its own index if the mtime of the index file is too close to the
> mtime recorded inside it, or something like that.
I see. I think the problem and associated workaround you're referring
to is described in git's Documentation/technical/racy-git.txt.
> Note that, you can accomplish the same thing without setting
> annex.largefiles, assuming a current version of git-annex:
>
> git add file-git
> git annex add file-annex
>
> I think the only reason for setting annex.largefiles in either of the two
> places you did is if there's a default value that you want to
> temporarily override?
Right. DataLad's methods that are responsible for calling out to `git
annex add` have a `git={None,False,True}` parameter. By default
(`None`), DataLad just calls `git annex add ...` and lets any
configuration in the repo control whether the file goes to git or is
annexed. But with `git=True` or `git=False`, the `annex add` call
includes a `-c annex.largefiles=` argument with a value of `nothing`
or `anything`, respectively.
> But just to say that setting annex.largefiles=anything/nothing
> temporarily is a blunt instrument, and you risk accidental
> conversion when using it, and so it would be a good idea to not do
> that.
Noted. As mentioned above, DataLad's default behavior is to honor the
repo's `annex.largefiles` configuration. And the documentation for
`datalad save`, DataLad's main user-facing entry point for `annex
add`, recommends that the user configure .gitattributes rather than
using the option that leads to calling `annex add` with `-c
annex.largefiles=nothing`.
> One idea: Make `git-annex add --annex` and `git-annex add --git`
> add a specific file to annex or git, bypassing annex.largefiles and all
> other configuration and state. This could also be used to easily switch
> a file from one storage to the other. I'd hope the existence of that
> would prevent one-off setting of annex.largefiles=anything/nothing.
As far as I can see, those flags would completely cover DataLad's
one-off setting of `annex.largefiles=anything/nothing`. They map
directly to DataLad's `git=False/True` option described above. So,
from DataLad's perspective, they'd be very useful and welcome.
"""]]

View file

@ -0,0 +1,34 @@
[[!comment format=mdwn
username="sirio@84e81889437b3f6208201a26e428197c6045c337"
nickname="sirio"
avatar="http://cdn.libravatar.org/avatar/9f3a0cfaf4825081710b652cc0b438a4"
subject="Duplicate 'gcrypt-id' may be the issue?"
date="2019-12-29T22:10:26Z"
content="""
Had a repo exhibit this behavior just now:
- commit graph `XX -> YY`
- host `A` @ commit `YY`
- host `B` @ commit `XX` (1 behind)
- remotes `hub` and `lab` both @ commit `XX`
- `B` pushes and pulls from both `hub` and `lab`: OK
- `A` pushes to `hub` (updates to commit `YY`): OK
- `B` pulls from `hub`: FAIL with *Packfile does not match digest*
- `B` pulls from `lab`: OK
- `B` pushes to `hub`: FAIL with *Packfile does not match digest*
- `A` pulls from `hub`: OK
- `A` pulls from `lab`: OK
When looking in `.git/config` I noticed that `remote.hub.gcrypt-id` and `remote.lab.gcrypt-id` were identical.
To fix, I:
- removed `remote.hub.gcrypt-id` from `.git/config` on both `A` and `B`
- deleted and re-created a blank repo on `hub`
- `git push hub` on `B`
- `git pull hub master` on `A`
This resulted in a new and unique value for `remote.hub.gcrypt-id`, which is the same on both `A` and `B`.
Have not had time to dig into why, but this is the only thread I can find about this problem, so I figured I would log this somewhere for posterity.
"""]]

View file

@ -0,0 +1,97 @@
### Please describe the problem.
I have the following repos
a - group manual - all content currently originates on this repo (OS X 10.14.4)
b - group backup - this is a rclone special backed by google drive
c - this is the underlying git repo on gitlab.com
d - group backup - a server that is supposed to backup everything (OS X 10.14.4)
Assistant is running on a and d
It is not guaranteed that a and d will be able to directly connect; however, they both have very good connectivity to b and c.
When I add a set of files into a (using git-annex add) the non-annex files get checked into the git repo and pushed to c. Similarly, the content (annex files) gets pushed to b. This is confirmed by git-annex list --allrepos.
Within an hour or so, d will know about the files that were added (git-annex list) and the git log shows that it is on the same commit as a and c.
However, the assistant on d never does the git-annex sync --content
If I manually run git-annex sync --content on d, all is updated as expected.
I've made no changes to the groupwants, group, etc. settings
### What steps will reproduce the problem?
create a repo with a central git upstream and a special via rclone on gdrive. Clone that repo in another machine that can also see the upstream and special, but isn't directly connected to the originator of the repo
Add annex-handled files to the original repo.
Check the status of the git upstream, special, and the clone.
After failure is acknowledged, run git annex sync --content to confirm that the mechanics still work
### What version of git-annex are you using? On what operating system?
Both hosts are OSX 10.14.4 and are running 7.20191218
### Please provide any additional information below.
This is from the assistant on the clone. It is running in debug mode.
[[!format sh """
[2019-12-30 17:44:09.362492] main: starting assistant version 7.20191114
[2019-12-30 17:44:14.532638] TransferScanner: Syncing with origin
(scanning...) [2019-12-30 17:44:14.590159] Watcher: Performing startup scan
ControlSocket .git/annex/ssh/git@gitlab already exists, disabling multiplexing
Disallowed command
Everything up-to-date
Disallowed command
Disallowed command
Disallowed command
fatal: Pathspec 'workflow/cc-archive-exif/LICENSE' is in submodule 'workflow/cc-archive-exif'
fatal: Pathspec 'workflow/cc-archive-exif/LICENSE' is in submodule 'workflow/cc-archive-exif'
fatal: Pathspec 'workflow/cc-archive-exif/LICENSE' is in submodule 'workflow/cc-archive-exif'
fatal: Pathspec 'workflow/cc-archive-exif/LICENSE' is in submodule 'workflow/cc-archive-exif'
fatal: Pathspec 'workflow/cc-archive-exif/LICENSE' is in submodule 'workflow/cc-archive-exif'
fatal: Pathspec 'workflow/cc-archive-exif/LICENSE' is in submodule 'workflow/cc-archive-exif'
fatal: Pathspec 'workflow/cc-archive-exif/LICENSE' is in submodule 'workflow/cc-archive-exif'
fatal: Pathspec 'workflow/cc-archive-exif/LICENSE' is in submodule 'workflow/cc-archive-exif'
fatal: Pathspec 'workflow/cc-archive-exif/LICENSE' is in submodule 'workflow/cc-archive-exif'
fatal: Pathspec 'workflow/cc-archive-exif/LICENSE' is in submodule 'workflow/cc-archive-exif'
fatal: Pathspec 'workflow/cc-archive-exif/LICENSE' is in submodule 'workflow/cc-archive-exif'
git cat-file EOF: user error
fd:38: hFlush: resource vanished (Broken pipe)
fd:38: hFlush: resource vanished (Broken pipe)
Disallowed command
(started...)
[2019-12-30 17:44:33.097035] Committer: Committing changes to git
(recording state in git...)
[2019-12-30 17:44:33.176213] Pusher: Syncing with origin
Everything up-to-date
Disallowed command
<<A bunch of white space lines removed for brevity>>
Disallowed command
Disallowed command
Disallowed command
Disallowed command
Disallowed command
# End of transcript or log.
"""]]
### Have you had any luck using git-annex before? (Sometimes we get tired of reading bug reports all day and a lil' positive end note does wonders)
Yes - I can run this manually, and overall this is great - I would just love to get this automated....

View file

@ -20,3 +20,5 @@ If strict matching (not sure yet about a use case where it would really be neede
[[!meta author=yoh]]
[[!tag projects/datalad]]
> Looks like we're agreed this is not necessary, so [[done]] --[[Joey]]

View file

@ -0,0 +1,30 @@
### Please describe the problem.
enable-tor on an OSX box (with magic-wormhole and tor installed via brew) fails miserably.
### What steps will reproduce the problem?
run git-annex enable-tor - multiple fails, see details.
### What version of git-annex are you using? On what operating system?
7.20191106
OSX 10.14.5
### Please provide any additional information below.
The first failure is that enable-tor can't run as root. Instead, I call it with sudo git-annex enable-tor <UID>
The second failure is that you try and write into /etc/tor/torrc - which is not where torrc is located on a brew installed tor - it's in /usr/local/etc/tor/torrc. I made a symlink to get around that problem.
The third failure is a complaint about systemctl not being present. I looked in Utility/Tor.hs and saw you were trying to call for a reload of tor. To hack around that, I wrote a script called systemctl that simply called 'brew services' with the args passed in ( brew services $1 $2 ).
After that, I still get the error: git-annex: tor failed to create hidden service, perhaps the tor service is not running
I have restarted tor manually, and it is indeed running. It looks like something is failing in setting up the Onion socket, but I can't see what
### Have you had any luck using git-annex before? (Sometimes we get tired of reading bug reports all day and a lil' positive end note does wonders)
I love it - using it to protect my photo archive now using a central special repo (rclone) for the data, and a gitlab repo for the base.

View file

@ -0,0 +1,41 @@
I am interested in using or extending git-annex to manage storage redundancy more efficiently than [[git-annex-numcopies]] allows. In particular, I wish to be able to tolerate the loss of a repository when I don't have enough disk space for `numcopies 2` -- when my disk capacity is less than double the data size.
For example, suppose I wish to store N = 1000 GB on k = 10 servers, each with 150 GB storage (so total storage is 1500 GB, which is less than the 2000 GB that would be required for `numcopies 2`). For simplicity here, we'll presume that our data is in 1000 1GB files.
We'll set the goal of being able to lose r = 3 servers without losing any data (which would reduce our total storage capacity to 7 * 150 GB = 1050 GB).
This can be done by thinking of our files in groups of seven (k - r) and using parchive2 or similar to create 3 (r) parity files for each set:
```
Parity group 000: D000 D001 D002 D003 D004 D005 D006 P000.0 P000.1 P000.2
Parity group 001: D007 D008 D009 D010 D011 D012 D013 P001.0 P001.1 P001.2
...
Parity group 141: D987 D988 D989 D990 D991 D992 D993 P141.0 P141.1 P141.2
Parity group 142: D994 D995 D996 D997 D998 D999 P142.0 P142.1 P142.2
```
All files are still 1G -- we have used 1429 GB of storage, far less than the 4000 GB that would be required to achieve the same durability with `numcopies 4`.
The trouble is that we now need to ensure that each parity group's files are scattered across all the repositories. D001 must go in a different repository than D000. D002 must go in a different repository than the first two, etc. git-annex's [[required_content]] mechanism only allows allocating files to repositories based on properties of that file, not on the presence or absence of other files.
Generalizing to differently-sized files, differently-sized repositories, parity group sizes other than k, k changing over time, per-file r values, and trying to minimize churn, it seems to me that the simplest way to do this would be to run a solver that finds a 'best' allocation plan and then writes a metadata tag on every file with its intended home. Each repository can then have a simple [[required_content]] rule with its own name and [[--auto|walkthrough/automatically_managing_content]] or the assistant can move the files around as necessary. I'm imagining running the solver periodically, from a cron job or in the assistant. The other moving part is launching `par2` to create parity files and do recovery, which happens after the solver has commanded that all the relevant inputs be brought together on one machine and after `get --auto` invocations or the assistant have done the transfers.
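For the simple case sketched above (equal-sized files, equal-sized repositories, groups no larger than the number of repositories), the solver's output could be as little as a round-robin with a per-group offset. A hedged Haskell sketch (placeGroups is a hypothetical name, not an existing git-annex function):

```haskell
-- Assign each member of each parity group to a distinct repository.
-- Offsetting by the group index spreads load evenly; since a group
-- has at most (length repos) members, no repository holds two files
-- of one group, so losing r repositories destroys at most r members
-- per group, which the r parity files cover.
placeGroups :: [String] -> [[FilePath]] -> [(FilePath, String)]
placeGroups repos groups =
	[ (f, repos !! ((gi + fi) `mod` length repos))
	| (gi, grp) <- zip [0 ..] groups
	, (fi, f) <- zip [0 ..] grp
	]
```

Group 000 above has ten members (seven data plus three parity), so with k = 10 repositories each group maps onto all ten with no repeats. A real solver would weight by file and repository size instead.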
User interface points:
1. User specifies how much disk space the solver should use in each repository.
2. User specifies the desired durability of each file. Probably with match expressions?
3. User declares a repository lost with [[git-annex-dead]]. The solver will notice many degraded parity groups on the next run & emit a plan to start bringing together parity groups' files for `par2 r` runs. Ideally, running `git-annex-dead` would stop any existing solver run & start a fresh solver run immediately.
4. Report to user parity placement health (are all the files where they should be? Is recovery finished? About how much I/O needs to be done to finish?)
5. Report to user the available storage space for additional content at each durability level (this is non-trivial given differently-sized repositories)
6. Report to user the storage efficiency of the current arrangement, how much storage efficiency could be improved at various I/O costs, and allow the user to select one (eg: re-grouping old and recently-written files into new parity groups would allow closer file-size matching, allowing smaller parity files for the smaller file set.)
7. User specifies parity group size?? This is a trade-off between storage efficiency (larger groups → more efficient) and high-durability storage capacity when repository sizes differ (smaller groups AND repository size diversity → more high-durability storage capacity). It's possible that there's some principled way to save the user the trouble of having to understand the mechanics of parity groups & choose a parity group size, but I'm unclear on how that would work, so my initial plan is to have the user specify this by the same mechanism that they specify desired durability. A good UI could support this choice by showing what the effect of each option would be on storage capacity at each durability level.
See also
* [[todo/Wishlist__58___Parity_files_on_all_files]]
* [[todo/wishlist__58___Parity_files_for_encrypted_remotes]]
* [[design/assistant/partial_content]]
* [[forum/sparse_git_checkouts_with_annex]]

View file

@ -0,0 +1,8 @@
Suppose there are three repos: A, B and transfer; transfer is a webdav remote.
1. I add file-1 to A, do a sync content, file-1 is sent to transfer.
2. I add file-2 to A, do a sync content, I expect only file-2 to be sent to transfer, but file-1 is sent another time.
How can I avoid that?
Do other special remotes, like the rsync special remote, have this issue?

View file

@ -36,6 +36,17 @@ annexed content, and other symlinks.
Add gitignored files.
* `--force-large`
Treat all files as large files, ignoring annex.largefiles and annex.dotfiles
configuration, and add to the annex.
* `--force-small`
Treat all files as small files, ignoring annex.largefiles and annex.dotfiles
configuration, and add to git, also ignoring annex.addsmallfiles
configuration.
* `--backend`
Specifies which key-value backend to use.

View file

@ -1,12 +0,0 @@
[[!comment format=mdwn
username="rrnewton@63c9faa1997c908b1dc04dfdca33c809660cd158"
nickname="rrnewton"
avatar="http://cdn.libravatar.org/avatar/638acc3e55c2bb09aa0dcca5b5c8acb6"
subject="Flag to force same behavior as annex.largefiles attribute?"
date="2018-05-21T05:29:06Z"
content="""
When in [direct mode](https://git-annex.branchable.com/direct_mode), the \"add the non-large file directly to the git repository\" behavior described above is very useful, because the option of typing simply `git add foo` does not exist as it does in [indirect mode](https://git-annex.branchable.com/git-annex-indirect/).
However, I can't see any combination of flags that trigger this behavior. I suppose it can be accomplished by temporarily setting [annex.largefiles](https://git-annex.branchable.com/tips/largefiles/) to a huge value before executing `git annex add` (i.e. creating a `.gitattributes` and then deleting it). I think I'll try that as a work-around, but it would be great to have a flag that accomplishes this.
"""]]

View file

@ -1,12 +0,0 @@
[[!comment format=mdwn
username="joey"
subject="""comment 2"""
date="2018-05-21T16:36:51Z"
content="""
@rrnewton I know people do commonly accomplish this
by something like `git -c annex.largefiles='exclude(*)' annex add`
A shorter way to write that would only be useful for direct mode,
so I'm inclined not to add it, but open a todo item if you want to discuss
that.
"""]]

View file

@ -1,14 +0,0 @@
[[!comment format=mdwn
username="rrnewton@63c9faa1997c908b1dc04dfdca33c809660cd158"
nickname="rrnewton"
avatar="http://cdn.libravatar.org/avatar/638acc3e55c2bb09aa0dcca5b5c8acb6"
subject="Sounds great!"
date="2018-05-21T18:09:35Z"
content="""
That's fabulous. A Bash alias around that command is really all I need when working in direct mode. (And the archive's too damn big to switch back and forth between direct/indirect.)
I was just too much a newb with git attributes to know it could be done that way. For discoverability, maybe that command could be placed in an \"examples\" section in the primary documentation above?
"""]]

View file

@ -1,8 +0,0 @@
[[!comment format=mdwn
username="timeless-ventricle"
avatar="http://cdn.libravatar.org/avatar/0b220fa4c0b59e883f360979ee745d63"
subject="comment 4"
date="2019-01-06T12:24:49Z"
content="""
@joey I'm obviously missing something here, why would a shorter way to write that only be useful for direct mode? I don't understand what the connection is between direct mode and wanting to specify whether this is a \"regular git\" file or an annexed file (except that direct mode is not supported in v7)? I thought it was considered supported to have a mix of both large binary files and text files? Even if some text files are large, I think I want to add them as files whose content is tracked by git, so I think I want to choose 'by hand' -- is that not really supported / considered a bad idea for some reason?
"""]]

View file

@ -1,10 +0,0 @@
[[!comment format=mdwn
username="joey"
subject="""comment 5"""
date="2019-01-22T21:10:37Z"
content="""
Because "git add foo" does not work in direct mode.
This is really not the place to be having a conversation about this. If you
want something changed in git-annex, open a bug report or todo item.
"""]]

View file

@ -1,16 +0,0 @@
[[!comment format=mdwn
username="johnmario.itec19@69a7b742534851b36216e0f951f1a00dbb9067cd"
nickname="johnmario.itec19"
avatar="http://cdn.libravatar.org/avatar/2f07ffce1656bdcd6aa19aaab7517975"
subject="commenting on git-annex-add"
date="2019-09-02T06:21:27Z"
content="""
Yes you can do that. Simplest way is to git add the files you want to directly be in the git repo (e.g. the source code) and git annex add the large files.
You can then check in any changes to the source code files (or anything else you added with git add) to github as normal.
You can manage the storage and versioning of the large files using git annex commands. Git annex supports using AWS S3 and/or glacier for backing up the files. It can also back them up to a server you control over ssh or to an external drive (or any combination of the above). http://git-annex.branchable.com/special_remotes/
With the latest version of git annex, you can also set up automatic filters that decide which types/sizes of files to check in directly to git vs which ones to store as links in the annex. https://git-annex.branchable.com/tips/largefiles/
"""]]

View file

@ -0,0 +1,25 @@
git-annex 7.20191230 released with [[!toggle text="these changes"]]
[[!toggleable text="""
* Optimised processing of many files, especially by commands like find
and whereis that only report on the state of the repository. Commands
like get also sped up in cases where they have to check a lot of
files but only transfer a few files. Speedups range from 30-100%.
* Added build dependency on the filepath-bytestring library.
* Fixed an oversight that had always prevented annex.resolvemerge
from being honored, when it was configured by git-annex config.
* annex.largefiles can be configured by git-annex config,
to more easily set a default that will also be used by clones,
without needing to shoehorn the expression into the gitattributes file.
The git config and gitattributes override that.
* annex.addunlocked can be set to an expression with the same format used by
annex.largefiles, when you want to default to unlocking some files but
not others.
* annex.addunlocked can be configured by git-annex config.
* git-annex-config --set/--unset: No longer change the local git config
setting, except for in the special case of annex.securehashesonly.
* Improve file ordering behavior when one parameter is "." and other
parameters are other directories.
* smudge bugfix: When annex.largefiles=anything, files that were already
stored in git, and have not been modified could sometimes be converted
to being stored in the annex. Changes in 7.20191024 made this more
of a problem. This case is now detected and prevented."""]]

View file

@ -89,7 +89,7 @@ If you've set up an annex.largefiles configuration but want to force a file to
be stored in the annex, you can temporarily override the configuration like
this:
git annex add -c annex.largefiles=anything smallfile
git annex add --force-large smallfile
## converting git to annexed
@ -97,7 +97,7 @@ When you have a file that is currently stored in git, and you want to
convert that to be stored in the annex, here's how to accomplish that:
git rm --cached file
git annex add -c annex.largefiles=anything file
git annex add --force-large file
git commit file
This first removes the file from git's index cache, and then adds it back
@ -111,7 +111,7 @@ convert that to be stored in git, here's how to accomplish that:
git annex unlock file
git rm --cached file
git -c annex.largefiles=nothing add file
git annex add --force-small file
git commit file
You can modify the file after unlocking it and before adding it to

View file

@ -0,0 +1,41 @@
[[!comment format=mdwn
username="joey"
subject="""comment 4"""
date="2019-12-30T18:15:40Z"
content="""
When -J is used, recent versions use a separate pool of worker threads for
the checksumming, distinct from the downloading pool. So even with -J1 the
checksum of the previous download will not block the next download.
I've thought about making this the default without -J.. It relies on
concurrent-output working well, which it sometimes may not, eg when
filenames are not valid unicode, or perhaps on a non-ANSI terminal, and so
far it's been worth not defaulting to -J1 to avoid breaking in such edge
cases.
Anyway, it seems to me using -J should avoid most of the overhead, except
of course for the remaining checksumming after all downloads finish.
Incremental checksumming could be done for some of the built-in remotes,
but not others like bittorrent which write out of order. Some transfers
can resume, and the checksumming would have to somehow catch
up to the resume point, which adds significant complexity.
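For the in-order case the mechanics are straightforward; a minimal sketch of hashing while writing, assuming the cryptonite package (sinkWithChecksum is a hypothetical name, not git-annex's actual verification code):

```haskell
import Crypto.Hash (Context, Digest, SHA256, hashInit, hashUpdate, hashFinalize)
import qualified Data.ByteString as B
import System.IO

-- Hash a transfer incrementally while writing it to disk, so no
-- separate checksum pass is needed after the download finishes.
-- Only valid for remotes that deliver content in order.
sinkWithChecksum :: Handle -> FilePath -> IO (Digest SHA256)
sinkWithChecksum from dest = withFile dest WriteMode (go hashInit)
  where
	go ctx out = do
		chunk <- B.hGetSome from 65536
		if B.null chunk
			then return (hashFinalize ctx)
			else do
				B.hPut out chunk
				go (hashUpdate ctx chunk) out
```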
External remotes would need to send the content over a pipe for incremental
checksumming, so it would need a protocol extension.
git-annex's remote API does have the concept that a remote can sufficiently
verify the content of a file during transfer that additional checksumming
is not necessary. Currently only used for git remotes when hard linking an
object from a sibling remote. I don't think it actually matters what
checksum a remote uses to do such verification, as long as it's
cryptographically secure and runs on the local machine.
A protocol extension that let an external remote communicate to git-annex
that it had done such verification at the end of transfer is worth thinking
about.
Re Ilya's security concerns, as long as the external remote runs the
verification on the local machine, it seems there is no added security
impact.
"""]]

View file

@ -0,0 +1,8 @@
[[!comment format=mdwn
username="Ilya_Shlyakhter"
avatar="http://cdn.libravatar.org/avatar/1647044369aa7747829c38b9dcc84df0"
subject="named pipes and external remotes"
date="2019-12-31T01:38:03Z"
content="""
\"External remotes would need to send the content over a pipe for incremental checksumming, so it would need a protocol extension\" -- in the current protocol, if you pass to the TRANSFER request, as the FILE parameter, a named pipe, would something break?
"""]]

View file

@ -0,0 +1,16 @@
Make `git-annex add --force-large` and `git-annex add --force-small`
add a specific file to annex or git, bypassing annex.largefiles
and all other configuration and state.
One reason to want this is that it avoids users doing stuff like this:
git -c annex.largefiles=anything annex add foo.c
Such a temporary setting of annex.largefiles can be problematic, as explored in
<https://git-annex.branchable.com/bugs/A_case_where_file_tracked_by_git_unexpectedly_becomes_annex_pointer_file/>
This could also be used to easily switch a file from one storage to
the other. I suppose the file would have to be touched first to make git-annex
add process it?
> [[done]] --[[Joey]]

View file

@ -0,0 +1,18 @@
[[!comment format=mdwn
username="joey"
subject="""comment 5"""
date="2019-12-26T20:34:02Z"
content="""
I've implemented annex.dotfiles in the `v8` branch.
I did not tie it to annex.largefiles in the end, spwhitton is right.
`git-annex add` behavior around dotfiles did change in a potentially
surprising way: since dotfiles are assumed to be non-large, they get added
to git. Users who have dotfiles that are large (or dotdirs containing large
files) will need to configure annex.largefiles and annex.dotfiles to avoid
those files being added to git. But, I don't think that will affect many
users, and it avoided a lot of complexity. At least such users can use this
and other semi-recent git-annex configs to avoid `git add` adding their
large dotfiles directly to git.
"""]]

View file

@ -1,5 +1,5 @@
Name: git-annex
Version: 8.20191219
Version: 8.20200101
Cabal-Version: >= 1.8
License: AGPL-3
Maintainer: Joey Hess <id@joeyh.name>
@ -296,7 +296,7 @@ source-repository head
custom-setup
Setup-Depends: base (>= 4.11.1.0), hslogger, split, unix-compat, process,
filepath, exceptions, bytestring, directory, IfElse, data-default,
filepath-bytestring (>= 1.4.2.1.1),
filepath-bytestring (>= 1.4.2.1.0),
utf8-string, transformers, Cabal
Executable git-annex
@ -321,7 +321,7 @@ Executable git-annex
directory (>= 1.2),
disk-free-space,
filepath,
filepath-bytestring (>= 1.4.2.1.1),
filepath-bytestring (>= 1.4.2.1.0),
IfElse,
hslogger,
monad-logger,

View file

@ -23,6 +23,7 @@ extra-deps:
- torrent-10000.1.1
- sandi-0.5
- http-client-0.5.14
- filepath-bytestring-1.4.2.1.1
explicit-setup-deps:
git-annex: true
resolver: lts-12.14