From df3be7308093f33fa96c7d0c8d4f03d0499b7e17 Mon Sep 17 00:00:00 2001 From: "http://peter-simons.myopenid.com/" Date: Thu, 9 Feb 2012 18:31:03 +0000 Subject: [PATCH 1/8] How to expire old versions of files that have been edited? --- ..._old_versions_of_files_that_have_been_edited__63__.mdwn | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 doc/forum/How_to_expire_old_versions_of_files_that_have_been_edited__63__.mdwn diff --git a/doc/forum/How_to_expire_old_versions_of_files_that_have_been_edited__63__.mdwn b/doc/forum/How_to_expire_old_versions_of_files_that_have_been_edited__63__.mdwn new file mode 100644 index 0000000000..f06135c24e --- /dev/null +++ b/doc/forum/How_to_expire_old_versions_of_files_that_have_been_edited__63__.mdwn @@ -0,0 +1,7 @@ +My annex contains several large files that I have unlocked, edited, and committed again, i.e. the annex contains the version history of those files. However, I don't want the history -- keeping the latest version is good enough for me. Running `git annex unused` won't detect those old versions, though, because they aren't unused as old Git revisions still refer to them. So I wonder: + +1. What is the best way to get rid of the old versions of files in the annex? + +2. What is the best way to detect old versions of files in the annex? + +I guess, I could run `git rebase -i` to squash commits to those files into one commit, thereby getting rid of the references to the old copies, but that approach feels awkward and error prone. Is anyone aware of a better way? From 4b4b887d8dcc997d0d93835f37ab48276d037da6 Mon Sep 17 00:00:00 2001 From: "http://peter-simons.myopenid.com/" Date: Thu, 9 Feb 2012 18:53:04 +0000 Subject: [PATCH 2/8] Added a comment --- .../comment_1_dccf4dc4483d08e5e2936b2cadeafeaf._comment | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 doc/forum/How_to_expire_old_versions_of_files_that_have_been_edited__63__/comment_1_dccf4dc4483d08e5e2936b2cadeafeaf._comment diff --git a/doc/forum/How_to_expire_old_versions_of_files_that_have_been_edited__63__/comment_1_dccf4dc4483d08e5e2936b2cadeafeaf._comment b/doc/forum/How_to_expire_old_versions_of_files_that_have_been_edited__63__/comment_1_dccf4dc4483d08e5e2936b2cadeafeaf._comment new file mode 100644 index 0000000000..ee4fe2e6ce --- /dev/null +++ b/doc/forum/How_to_expire_old_versions_of_files_that_have_been_edited__63__/comment_1_dccf4dc4483d08e5e2936b2cadeafeaf._comment @@ -0,0 +1,8 @@ +[[!comment format=mdwn + username="http://peter-simons.myopenid.com/" + ip="77.186.179.173" + subject="comment 1" + date="2012-02-09T18:53:00Z" + content=""" +Sorry for commmenting on my own question ... I think I just figured out that `git annex unused` *does* in fact do what I want. When I tried it, it just didn't show the obsolete versions of the files I edited because I hadn't yet synchronized all repositories, so that was why the obsolete versions were still considered used. +"""]] From 4ccc01922fa9cbddeb56e2bfa40ddfd76fafee4b Mon Sep 17 00:00:00 2001 From: "http://joey.kitenet.net/" Date: Thu, 9 Feb 2012 19:42:28 +0000 Subject: [PATCH 3/8] Added a comment --- .../comment_2_5710294c1c8652c12b6df2233255a45e._comment | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 doc/forum/How_to_expire_old_versions_of_files_that_have_been_edited__63__/comment_2_5710294c1c8652c12b6df2233255a45e._comment diff --git a/doc/forum/How_to_expire_old_versions_of_files_that_have_been_edited__63__/comment_2_5710294c1c8652c12b6df2233255a45e._comment b/doc/forum/How_to_expire_old_versions_of_files_that_have_been_edited__63__/comment_2_5710294c1c8652c12b6df2233255a45e._comment new file mode 100644 index 0000000000..576093a87f --- /dev/null +++ b/doc/forum/How_to_expire_old_versions_of_files_that_have_been_edited__63__/comment_2_5710294c1c8652c12b6df2233255a45e._comment @@ -0,0 +1,8 @@ +[[!comment format=mdwn + username="http://joey.kitenet.net/" + nickname="joey" + subject="comment 2" + date="2012-02-09T19:42:28Z" + content=""" +Yes, contents are still considered used while tags or refs refer to them. Including remote tracking branches like `remotes/origin/master` +"""]] From 9030f684521ce8db3e9cd6a4e2a10f4edce7bfee Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 10 Feb 2012 19:17:41 -0400 Subject: [PATCH 4/8] When checking that an url has a key, verify that the Content-Length, if available, matches the size of the key. If there's no Content-Length, or the key has no size, this check is not done, but it should happen most of the time, and protect against web content that has changed. --- Remote/Git.hs | 4 +++- Remote/Web.hs | 9 +++++---- Utility/Url.hs | 24 +++++++++++++++++++----- debian/changelog | 2 ++ git-annex.cabal | 2 +- 5 files changed, 30 insertions(+), 11 deletions(-) diff --git a/Remote/Git.hs b/Remote/Git.hs index 829ad1ccba..3905247755 100644 --- a/Remote/Git.hs +++ b/Remote/Git.hs @@ -28,6 +28,7 @@ import qualified Utility.Url as Url import Utility.TempFile import Config import Init +import Types.Key remote :: RemoteType remote = RemoteType { @@ -143,7 +144,8 @@ inAnnex r key where go e [] = return $ Left e go _ (u:us) = do - res <- catchMsgIO $ Url.exists u + res <- catchMsgIO $ + Url.check u (keySize key) case res of Left e -> go e us v -> return v diff --git a/Remote/Web.hs b/Remote/Web.hs index 49c3f43f3a..6bd04d4b15 100644 --- a/Remote/Web.hs +++ b/Remote/Web.hs @@ -15,6 +15,7 @@ import Annex.Content import Config import Logs.Web import qualified Utility.Url as Url +import Types.Key remote :: RemoteType remote = RemoteType { @@ -77,8 +78,8 @@ checkKey key = do us <- getUrls key if null us then return $ Right False - else return . Right =<< checkKey' us -checkKey' :: [URLString] -> Annex Bool -checkKey' us = untilTrue us $ \u -> do + else return . Right =<< checkKey' key us +checkKey' :: Key -> [URLString] -> Annex Bool +checkKey' key us = untilTrue us $ \u -> do showAction $ "checking " ++ u - liftIO $ Url.exists u + liftIO $ Url.check u (keySize key) diff --git a/Utility/Url.hs b/Utility/Url.hs index e10b8a92a4..efd6ad16dd 100644 --- a/Utility/Url.hs +++ b/Utility/Url.hs @@ -7,6 +7,7 @@ module Utility.Url ( URLString, + check, exists, canDownload, download, @@ -14,6 +15,7 @@ module Utility.Url ( ) where import Control.Applicative +import Control.Monad import qualified Network.Browser as Browser import Network.HTTP import Network.URI @@ -23,16 +25,28 @@ import Utility.Path type URLString = String -{- Checks that an url exists and could be successfully downloaded. -} -exists :: URLString -> IO Bool +{- Checks that an url exists and could be successfully downloaded, + - also checking that its size, if available, matches a specified size. -} +check :: URLString -> Maybe Integer -> IO Bool +check url expected_size = handle <$> exists url + where + handle (False, _) = False + handle (True, Nothing) = True + handle (True, s) = expected_size == s + +{- Checks that an url exists and could be successfully downloaded, + - also returning its size if available. -} +exists :: URLString -> IO (Bool, Maybe Integer) exists url = case parseURI url of - Nothing -> return False + Nothing -> return (False, Nothing) Just u -> do r <- request u HEAD case rspCode r of - (2,_,_) -> return True - _ -> return False + (2,_,_) -> return (True, size r) + _ -> return (False, Nothing) + where + size = liftM read . lookupHeader HdrContentLength . rspHeaders canDownload :: IO Bool canDownload = (||) <$> inPath "wget" <*> inPath "curl" diff --git a/debian/changelog b/debian/changelog index fdc909e3e1..36034f2ae6 100644 --- a/debian/changelog +++ b/debian/changelog @@ -18,6 +18,8 @@ git-annex (3.20120124) UNRELEASED; urgency=low location of the file. * addurl: Normalize badly encoded urls. * Fix teardown of stale cached ssh connections. + * When checking that an url has a key, verify that the Content-Length, + if available, matches the size of the key. -- Joey Hess Tue, 24 Jan 2012 16:21:55 -0400 diff --git a/git-annex.cabal b/git-annex.cabal index 3f152ea4b8..0c343e42c9 100644 --- a/git-annex.cabal +++ b/git-annex.cabal @@ -1,5 +1,5 @@ Name: git-annex -Version: 3.20120123 +Version: 3.20120124 Cabal-Version: >= 1.6 License: GPL Maintainer: Joey Hess From 17fed709c83de69c5bdf190b80eaa875fe6c9c7e Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 10 Feb 2012 19:23:23 -0400 Subject: [PATCH 5/8] addurl --fast: Verifies that the url can be downloaded (only getting its head), and records the size in the key. --- Backend/URL.hs | 8 ++++++-- Command/AddUrl.hs | 8 ++++++-- debian/changelog | 2 ++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/Backend/URL.hs b/Backend/URL.hs index 6406095ca1..b3411bac5b 100644 --- a/Backend/URL.hs +++ b/Backend/URL.hs @@ -24,5 +24,9 @@ backend = Backend { fsckKey = Nothing } -fromUrl :: String -> Key -fromUrl url = stubKey { keyName = url, keyBackendName = "URL" } +fromUrl :: String -> Maybe Integer -> Key +fromUrl url size = stubKey + { keyName = url + , keyBackendName = "URL" + , keySize = size + } diff --git a/Command/AddUrl.hs b/Command/AddUrl.hs index 496b9f2e8b..40e3a0e985 100644 --- a/Command/AddUrl.hs +++ b/Command/AddUrl.hs @@ -15,6 +15,7 @@ import qualified Backend import qualified Command.Add import qualified Annex import qualified Backend.URL +import qualified Utility.Url as Url import Annex.Content import Logs.Web import qualified Option @@ -55,7 +56,7 @@ perform url file = ifAnnexed file addurl geturl download :: String -> FilePath -> CommandPerform download url file = do showAction $ "downloading " ++ url ++ " " - let dummykey = Backend.URL.fromUrl url + let dummykey = Backend.URL.fromUrl url Nothing tmp <- fromRepo $ gitAnnexTmpLocation dummykey liftIO $ createDirectoryIfMissing True (parentDir tmp) stopUnless (downloadUrl [url] tmp) $ do @@ -70,7 +71,10 @@ download url file = do nodownload :: String -> FilePath -> CommandPerform nodownload url file = do - let key = Backend.URL.fromUrl url + (exists, size) <- liftIO $ Url.exists url + unless exists $ + error $ "unable to access url: " ++ url + let key = Backend.URL.fromUrl url size setUrlPresent key url next $ Command.Add.cleanup file key False diff --git a/debian/changelog b/debian/changelog index 36034f2ae6..f137972728 100644 --- a/debian/changelog +++ b/debian/changelog @@ -20,6 +20,8 @@ git-annex (3.20120124) UNRELEASED; urgency=low * Fix teardown of stale cached ssh connections. * When checking that an url has a key, verify that the Content-Length, if available, matches the size of the key. + * addurl --fast: Verifies that the url can be downloaded (only getting + its head), and records the size in the key. -- Joey Hess Tue, 24 Jan 2012 16:21:55 -0400 From a3ebf16e62e4499401165eebc8cf3d7123dc4fe7 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 10 Feb 2012 19:40:36 -0400 Subject: [PATCH 6/8] also verify new urls when adding them to existing files --- Command/AddUrl.hs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Command/AddUrl.hs b/Command/AddUrl.hs index 40e3a0e985..db73f14e93 100644 --- a/Command/AddUrl.hs +++ b/Command/AddUrl.hs @@ -19,6 +19,7 @@ import qualified Utility.Url as Url import Annex.Content import Logs.Web import qualified Option +import Types.Key def :: [Command] def = [withOptions [fileOption] $ @@ -50,6 +51,8 @@ perform url file = ifAnnexed file addurl geturl fast <- Annex.getState Annex.fast if fast then nodownload url file else download url file addurl (key, _backend) = do + unlessM (liftIO $ Url.check url (keySize key)) $ + error $ "failed to verify url: " ++ url setUrlPresent key url next $ return True From 6335abcab2c0b48132b04011acbd01fb99bc5b53 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 10 Feb 2012 20:40:18 -0400 Subject: [PATCH 7/8] doc update --- doc/tips/using_the_web_as_a_special_remote.mdwn | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/tips/using_the_web_as_a_special_remote.mdwn b/doc/tips/using_the_web_as_a_special_remote.mdwn index 8009927a49..a151f99332 100644 --- a/doc/tips/using_the_web_as_a_special_remote.mdwn +++ b/doc/tips/using_the_web_as_a_special_remote.mdwn @@ -8,10 +8,10 @@ The web can be used as a [[special_remote|special_remotes]] too. Now the file is downloaded, and has been added to the annex like any other file. So it can be renamed, copied to other repositories, and so on. -Note that git-annex assumes that, if the web site does not 404, the file is -still present on the web, and this counts as one [[copy|copies]] of the -file. So it will let you remove your last copy, trusting it can be -downloaded again: +Note that git-annex assumes that, if the web site does not 404, and has the +right file size, the file is still present on the web, and this counts as +one [[copy|copies]] of the file. So it will let you remove your last copy, +trusting it can be downloaded again: # git annex drop example.com_video.mpeg drop example.com_video.mpeg (checking http://example.com/video.mpeg) ok From ecfcb41abe5c2903ca80d26365afdf20faaf9989 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Fri, 10 Feb 2012 21:42:46 -0400 Subject: [PATCH 8/8] work around Network.Browser bug that converts a HEAD to a GET when following a redirect The code explicitly switches from HEAD to GET for most redirects. Possibly because someone misread a spec (which does require switching from POST to GET for 303 redirects). Or possibly because the spec really is that bad. Upstream bug: https://github.com/haskell/HTTP/issues/24 Since we absolutely don't want to download entire (large) files from the web when checking that they exist with HEAD, I wrote my own redirect follower, based closely on the one used by Network.Browser, but without this misfeature. Note that Network.Browser checks that the redirect url is a http url and fails if not. I don't, because I want to not need to change this code when it gets https support (related: I'm surprised to see it doesn't support https yet..). The check does not seem security significant; it doesn't support file:// urls for example. If a http url is redirected to https, the Network.Browser will actually make a http connection again. This could loop, but only up to 5 times. --- Utility/Url.hs | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/Utility/Url.hs b/Utility/Url.hs index efd6ad16dd..dfdebaf06a 100644 --- a/Utility/Url.hs +++ b/Utility/Url.hs @@ -19,6 +19,7 @@ import Control.Monad import qualified Network.Browser as Browser import Network.HTTP import Network.URI +import Data.Maybe import Utility.SafeCommand import Utility.Path @@ -87,12 +88,32 @@ get url = {- Makes a http request of an url. For example, HEAD can be used to - check if the url exists, or GET used to get the url content (best for - - small urls). -} + - small urls). + - + - This does its own redirect following because Browser's is buggy for HEAD + - requests. + -} request :: URI -> RequestMethod -> IO (Response String) -request url requesttype = Browser.browse $ do - Browser.setErrHandler ignore - Browser.setOutHandler ignore - Browser.setAllowRedirects True - snd <$> Browser.request (mkRequest requesttype url :: Request_String) +request url requesttype = go 5 url where + go :: Int -> URI -> IO (Response String) + go 0 _ = error "Too many redirects " + go n u = do + rsp <- Browser.browse $ do + Browser.setErrHandler ignore + Browser.setOutHandler ignore + Browser.setAllowRedirects False + snd <$> Browser.request (mkRequest requesttype u :: Request_String) + case rspCode rsp of + (3,0,x) | x /= 5 -> redir (n - 1) u rsp + _ -> return rsp ignore = const $ return () + redir n u rsp = do + case retrieveHeaders HdrLocation rsp of + [] -> return rsp + (Header _ newu:_) -> + case parseURIReference newu of + Nothing -> return rsp + Just newURI -> go n newURI_abs + where + newURI_abs = fromMaybe newURI (newURI `relativeTo` u)