Merge branch 'master' into watch

This commit is contained in:
Joey Hess 2012-06-20 13:15:59 -04:00
commit 483b1b08c6
24 changed files with 225 additions and 48 deletions

View file

@ -12,7 +12,7 @@ module Annex.CatFile (
catFileHandle
) where
import qualified Data.ByteString.Lazy.Char8 as L
import qualified Data.ByteString.Lazy as L
import Common.Annex
import qualified Git

View file

@ -10,8 +10,7 @@
module Command.Unused where
import qualified Data.Set as S
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.Encoding as L
import qualified Data.ByteString.Lazy as L
import Data.BloomFilter
import Data.BloomFilter.Easy
import Data.BloomFilter.Hash
@ -265,8 +264,9 @@ withKeysReferencedInGitRef a ref = do
go [] = noop
go (l:ls)
| isSymLink (LsTree.mode l) = do
content <- L.decodeUtf8 <$> catFile ref (LsTree.file l)
case fileKey (takeFileName $ L.unpack content) of
content <- encodeW8 . L.unpack
<$> catFile ref (LsTree.file l)
case fileKey (takeFileName content) of
Nothing -> go ls
Just k -> do
a k

View file

@ -26,7 +26,7 @@ module Crypto (
prop_hmacWithCipher_sane
) where
import qualified Data.ByteString.Lazy.Char8 as L
import qualified Data.ByteString.Lazy as L
import Data.ByteString.Lazy.UTF8 (fromString)
import Data.Digest.Pure.SHA
import Control.Applicative

View file

@ -15,8 +15,8 @@ module Git.CatFile (
) where
import System.IO
import qualified Data.ByteString.Char8 as S
import qualified Data.ByteString.Lazy.Char8 as L
import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
import Common
import Git

View file

@ -7,7 +7,7 @@
module Remote.Bup (remote) where
import qualified Data.ByteString.Lazy.Char8 as L
import qualified Data.ByteString.Lazy as L
import qualified Data.Map as M
import System.Process

View file

@ -7,8 +7,8 @@
module Remote.Directory (remote) where
import qualified Data.ByteString.Lazy.Char8 as L
import qualified Data.ByteString.Char8 as S
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString as S
import qualified Data.Map as M
import qualified Control.Exception as E

View file

@ -7,7 +7,7 @@
module Remote.Hook (remote) where
import qualified Data.ByteString.Lazy.Char8 as L
import qualified Data.ByteString.Lazy as L
import qualified Data.Map as M
import System.Exit
import System.Environment

View file

@ -7,7 +7,7 @@
module Remote.Rsync (remote) where
import qualified Data.ByteString.Lazy.Char8 as L
import qualified Data.ByteString.Lazy as L
import qualified Data.Map as M
import Common.Annex

View file

@ -13,6 +13,8 @@ import Foreign.C
import System.IO
import System.IO.Unsafe
import qualified Data.Hash.MD5 as MD5
import Data.Word
import Data.Bits.Utils
{- Sets a Handle to use the filesystem encoding. This causes data
- written or read from it to be encoded/decoded the same
@ -29,7 +31,7 @@ withFilePath :: FilePath -> (CString -> IO a) -> IO a
withFilePath fp f = Encoding.getFileSystemEncoding
>>= \enc -> GHC.withCString enc fp f
{- Encodes a FilePath into a Str, applying the filesystem encoding.
{- Encodes a FilePath into a Md5.Str, applying the filesystem encoding.
-
- This use of unsafePerformIO is believed to be safe; GHC's interface
- only allows doing this conversion with CStrings, and the CString buffer
@ -41,3 +43,15 @@ encodeFilePath :: FilePath -> MD5.Str
encodeFilePath fp = MD5.Str $ unsafePerformIO $ do
enc <- Encoding.getFileSystemEncoding
GHC.withCString enc fp $ GHC.peekCString Encoding.char8
{- Converts a [Word8] to a FilePath, encoding using the filesystem encoding.
-
- w82s produces a String, which may contain Chars that are invalid
- unicode. From there, this is really a simple matter of applying the
- file system encoding, only complicated by GHC's interface to doing so.
-
- The conversion is done in two steps: the String is first marshalled to
- a C string using the char8 encoding, and that buffer is then read back
- with peekCString using the filesystem encoding.
-
- NOTE(review): unsafePerformIO gives this function a pure type;
- presumably it is safe for the same reasons described in the comment on
- encodeFilePath. Confirm before relying on it elsewhere.
-}
{-# NOINLINE encodeW8 #-}
encodeW8 :: [Word8] -> FilePath
encodeW8 w8 = unsafePerformIO $ do
enc <- Encoding.getFileSystemEncoding
GHC.withCString Encoding.char8 (w82s w8) $ GHC.peekCString enc

View file

@ -7,7 +7,7 @@
module Utility.Gpg where
import qualified Data.ByteString.Lazy.Char8 as L
import qualified Data.ByteString.Lazy as L
import System.Posix.Types
import Control.Applicative
import Control.Concurrent

1
debian/changelog vendored
View file

@ -5,6 +5,7 @@ git-annex (3.20120616) UNRELEASED; urgency=low
need to manually run git commands when manipulating files.
Available on Linux, BSDs, and OSX!
* Enable diskfree on kfreebsd, using statvfs.
* unused: Fix crash when key names contain invalid utf8.
-- Joey Hess <joeyh@debian.org> Tue, 12 Jun 2012 11:35:59 -0400

View file

@ -0,0 +1,15 @@
What steps will reproduce the problem?
I don't know exactly when it started
What is the expected output? What do you see instead?
When I run git annex unused I get
unused . (checking for unused data...) (checking master...) git-annex: Cannot decode byte '\xb4': Data.Text.Encoding.decodeUtf8: Invalid UTF-8 stream
Most likely I have added some file with a strange encoding that git-annex can't decode. The problem is that the unused process aborts because of this.
What version of git-annex are you using? On what operating system?
3.20120522, Debian testing
> I've just fixed this bug in git, will be in the next release. --[[Joey]]
> [[done]]

View file

@ -0,0 +1,8 @@
[[!comment format=mdwn
username="http://joeyh.name/"
ip="4.154.2.6"
subject="comment 1"
date="2012-06-20T14:30:27Z"
content="""
Try running `git annex unused --debug`; this will tell us the git command that's outputting the data it cannot process. Then you can try running that git command and see what the problem filename is.
"""]]

View file

@ -0,0 +1,8 @@
[[!comment format=mdwn
username="http://joeyh.name/"
ip="4.154.2.6"
subject="comment 2"
date="2012-06-20T14:34:23Z"
content="""
Your `locale` setting may also be relevant. FWIW, I've tried to create a file with `\xb4` in its name and have not gotten git-annex unused to crash on it.
"""]]

View file

@ -0,0 +1,17 @@
[[!comment format=mdwn
username="https://www.google.com/accounts/o8/id?id=AItOawnXgp-iIaBK5pnk22xqMVERQb97VyXaejs"
nickname="Kristian"
subject="comment 3"
date="2012-06-20T14:37:09Z"
content="""
This is what happens when I add the debug parameter
git annex unused --debug
unused . (checking for unused data...) git [\"--git-dir=/home/kristian/AnnexMedia/.git\",\"--work-tree=/home/kristian/AnnexMedia\",\"ls-files\",\"--cached\",\"-z\",\"--\",\"/home/kristian/AnnexMedia\"]
git [\"--git-dir=/home/kristian/AnnexMedia/.git\",\"--work-tree=/home/kristian/AnnexMedia\",\"show-ref\"]
(checking master...) git [\"--git-dir=/home/kristian/AnnexMedia/.git\",\"--work-tree=/home/kristian/AnnexMedia\",\"ls-tree\",\"--full-tree\",\"-z\",\"-r\",\"--\",\"refs/heads/master\"]
git [\"--git-dir=/home/kristian/AnnexMedia/.git\",\"--work-tree=/home/kristian/AnnexMedia\",\"cat-file\",\"--batch\"]
git-annex: Cannot decode byte '\xb4': Data.Text.Encoding.decodeUtf8: Invalid UTF-8 stream
"""]]

View file

@ -0,0 +1,10 @@
[[!comment format=mdwn
username="http://joeyh.name/"
ip="4.154.2.6"
subject="comment 4"
date="2012-06-20T14:49:09Z"
content="""
Ah, reproduced it; need to use the WORM backend and have the file present in another branch..
"""]]

View file

@ -0,0 +1,19 @@
[[!comment format=mdwn
username="https://www.google.com/accounts/o8/id?id=AItOawnXgp-iIaBK5pnk22xqMVERQb97VyXaejs"
nickname="Kristian"
subject="comment 5"
date="2012-06-20T14:55:33Z"
content="""
I checked out the git-annex branch and using
find * | grep -P \"[\xb4]\"
I found a file
43e/b16/WORM-s4118528-m1245167306--Jerry Lee Lewis - Whole Lotta Shakin\302\264 Going\302\264 On.mp3.log
The corresponding file also existed in the master branch (as a link).
I moved both these files to a folder outside my repository and synched my git-annex branch with my master server. I still get the same error. Is there any other place where information about this file is stored?
"""]]

View file

@ -0,0 +1,10 @@
[[!comment format=mdwn
username="http://joeyh.name/"
ip="4.154.2.6"
subject="comment 6"
date="2012-06-20T16:59:53Z"
content="""
git-annex was not crashing due to content in the git-annex branch, but due to a symlink in one of your regular git branches, probably master and origin/master.
This bug is fixed in git master, if you need the fix before the next release.
"""]]

View file

@ -0,0 +1,37 @@
Running the tip of the watch branch on OSX in an annex'ed directory.
The watch command detects the changes, does _something_, see the output below.
Output from watch command
<pre>
(Recording state in git...)
Added "./KeePass2.18.dmg"
Added "./KeePassX-0.4.3.dmg"
add ./KeePass2.18.dmg (checksum...) ok
add ./KeePassX-0.4.3.dmg (checksum...) ok
</pre>
State of the annex
<pre>
laplace:annex jtang$ git status
# On branch master
# Untracked files:
# (use "git add <file>..." to include in what will be committed)
#
# KeePass2.18.dmg
# KeePassX-0.4.3.dmg
nothing added to commit but untracked files present (use "git add" to track)
</pre>
It seems to not do a git add and commit after the creation of the symlinks, manually doing this makes it all happy again till more files are added.
Note: I had posted a comment in the blog post, but posting the issue here is probably more appropriate.
> Yeah, this is the issue I was struggling with last night.
> I think it's fixed in 57cf65eb6d811ba7fd19eb62a54e3b83a0c2dfa7,
> but the kqueue watch still needs a lot of work. --[[Joey]]
>> Confirmed this is fixed, but do note the known kqueue bugs in
>> [[design/assistant/inotify]]! [[done]] --[[Joey]]

View file

@ -0,0 +1,34 @@
Good news! My beta testers report that the new kqueue code works on OSX.
At least "works" as well as it does on Debian kFreeBSD. My crazy
development strategy of developing on Debian kFreeBSD while targeting Mac
OSX is vindicated. ;-)
So, I've been beating the kqueue code into shape for the last 12 hours,
minus a few hours sleep.
First, I noticed it was seeming to starve the other threads. I'm using
Haskell's non-threaded runtime, which does cooperative multitasking between
threads, and my C code was never returning to let the other threads run.
Changed that around, so the C code runs until SIGALARMed, and then that
thread calls `yield` before looping back into the C code. Wow, cooperative
multitasking.. I last dealt with that when programming for Windows 3.1!
(Should try to use Haskell's -threaded runtime sometime, but git-annex
doesn't work under it, and I have not tried to figure out why not.)
Then I made a [single commit](http://source.git-annex.branchable.com/?p=source.git;a=commitdiff;h=2bfcc0b09c5dd37c5e0ab65cb089232bfcc31934),
with no testing, in which I made the kqueue code maintain a cache of what
it expects in the directory tree, and use that to determine what files
changed how when a change is detected. Serious code. It worked on the
first go. If you were wondering why I'm writing in Haskell ... yeah,
that's why.
And I've continued to hammer on the kqueue code, making lots of little
fixes, and at this point it seems *almost* able to handle the changes I
throw at it. It does have one big remaining problem; kqueue doesn't tell me
when a writer closes a file, so it will sometimes miss adding files. To fix
this, I'm going to need to make it maintain a queue of new files, and
periodically check them, with `lsof`, to see when they're done being
written to, and add them to the annex. So while a file is being written
to, `git annex watch` will have to wake up every second or so, and run
`lsof` ... and it'll take it at least 1 second to notice a file's complete.
Not ideal, but the best that can be managed with kqueue.

View file

@ -13,6 +13,14 @@ There is a `watch` branch in git that adds the command.
* When you `git annex unlock` a file, it will immediately be re-locked.
* With kqueue, if a file is created and still has a writer, it'll
give up adding it, and it will never get added. This is because kqueue
cannot track file closes. Need to go back and check these files every
second or something.
* Kqueue has to open every directory it watches, so too many directories
will run it out of the max number of open files (typically 1024), and fail.
## beyond Linux
I'd also like to support OSX and if possible the BSDs.
@ -58,40 +66,8 @@ I'd also like to support OSX and if possible the BSDs.
* Windows has a Win32 ReadDirectoryChangesW, and perhaps other things.
## beyond Linux
I'd also like to support OSX and if possible the BSDs.
* kqueue ([haskell bindings](http://hackage.haskell.org/package/kqueue))
is supported by FreeBSD, OSX, and other BSDs.
From what I can find, kqueue does not provide full directory watching
capabilities. To watch a file, you have to have an open file descriptor
to the file. This wouldn't scale.
Gamin does the best it can with just kqueue, supplemented by polling.
The source file `server/gam_kqueue.c` makes for interesting reading.
Using gamin to do the heavy lifting is one option.
([haskell bindings](http://hackage.haskell.org/package/hlibfam) for FAM;
gamin shares the API)
* hfsevents ([haskell bindings](http://hackage.haskell.org/package/hfsevents))
is OSX specific.
Originally it was only directory level, and you were only told a
directory had changed and not which file. Based on the haskell
binding's code, from OSX 10.7.0, file level events were added.
This will be harder for me to develop for, since I don't have access to
OSX machines..
* Windows has a Win32 ReadDirectoryChangesW, and perhaps other things.
## todo
- Support OSes other than Linux; it only uses inotify currently.
OSX and FreeBSD use the same mechanism, and there is a Haskell interface
for it.
- Run niced and ioniced? Seems to make sense, this is a background job.
- configurable option to only annex files meeting certain size or
filename criteria

View file

@ -0,0 +1,14 @@
I think it would be useful to supplement the `reinject` command with an automatic
mode which calculates the checksum of the source file and injects the file if it
is known to the repository (without the need to provide a destination filename).
In addition, this could be done recursively if the user provides a directory to
inject. All this can probably be done already with some plumbing, but a simple
`reinject --auto` (or `scour`, or `scavenge`, if you like) would be a nice addition.
Of course this would only work for the checksum backends.
Example use cases would be:
* Recovering data from lost+found easily
* Making use of old (pre-git-annex) archival volumes with useful files
scattered among non-useful files
* Sneaker-netting files between disconnected git-annex repositories

View file

@ -31,4 +31,4 @@ if [ "$?" = 1 ]; then
fi
</pre>
It's also using the branches-local script for sorting and prioritising the branches to build, this branches-local script can be found at the [autobuild-ceph](https://github.com/ceph/autobuild-ceph/blob/master/branches-local) repository. If there are other people interested in setting up their own instances of gitbuilder for git-annex, please let me know and I will setup an aggregator page to collect status of the builds. The builder runs and updates the webpage every 30mins.
It's also using the branches-local script for sorting and prioritising the branches to build, this branches-local script can be found at the [autobuild-ceph](https://github.com/ceph/autobuild-ceph/blob/master/branches-local) repository. If there are other people interested in setting up their own instances of gitbuilder for git-annex, please let me know and I will set up an aggregator page to collect status of the builds. The builder runs and updates on a very regular basis.

View file

@ -0,0 +1,14 @@
[[!comment format=mdwn
username="http://www.davidhaslem.com/"
nickname="David"
subject="comment 7"
date="2012-06-19T04:41:27Z"
content="""
$(brew --prefix) should, in most cases, be /usr/local. That's the recommended install location for homebrew.
I already had git installed and homebrew as my package manager - my install steps were as follows:
1. brew install haskell-platform ossp-uuid md5sha1sum coreutils pcre
2. PATH=\"$(brew --prefix coreutils)/libexec/gnubin:$PATH\" cabal install git-annex
"""]]