From d5ee5fef658c73ca3c9efc9e89618f0b5220f378 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Mon, 18 Mar 2019 15:53:54 -0400 Subject: [PATCH] fsck: Detect situations where annex.thin has caused data loss to the content of locked files. In particular, when two files had the same content, and one was unlocked and modified, with annex.thin that can corrupt the content of the annex object, and so fsck on the other file should detect that. getKeyStatus was relying on Database.Keys.getAssociatedFiles to tell when a file is unlocked, but that can false positive because the database can list old associated files. Instead, separate out the case of unlocked object which has multiple hardlinks when annex.thin is in use. --- CHANGELOG | 2 + Command/Fsck.hs | 63 +++++++++++-------- Command/Migrate.hs | 2 +- ...ked_file_is_duplicated__47__truncated.mdwn | 2 + ..._82afe6abb793aef90e3ddc42c2e5c3f2._comment | 30 +++++++++ 5 files changed, 73 insertions(+), 26 deletions(-) create mode 100644 doc/bugs/data_loss_from_dedup_when_v7_unlocked_file_is_duplicated__47__truncated/comment_3_82afe6abb793aef90e3ddc42c2e5c3f2._comment diff --git a/CHANGELOG b/CHANGELOG index a1f7d67604..a0726c378e 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -29,6 +29,8 @@ git-annex (7.20190220) UNRELEASED; urgency=medium to support filenames starting with dashes. (To update the config of existing repositories, you can re-run git-annex init.) + * fsck: Detect situations where annex.thin has caused data loss + to the content of locked files. -- Joey Hess Wed, 20 Feb 2019 14:20:59 -0400 diff --git a/Command/Fsck.hs b/Command/Fsck.hs index 43a892cf25..11f9863312 100644 --- a/Command/Fsck.hs +++ b/Command/Fsck.hs @@ -1,6 +1,6 @@ {- git-annex command - - - Copyright 2010-2018 Joey Hess + - Copyright 2010-2019 Joey Hess - - Licensed under the GNU AGPL version 3 or higher. -} @@ -227,7 +227,7 @@ verifyLocationLog :: Key -> KeyStatus -> ActionItem -> Annex Bool verifyLocationLog key keystatus ai = do direct <- isDirect obj <- calcRepo $ gitAnnexLocation key - present <- if not direct && isKeyUnlocked keystatus + present <- if not direct && isKeyUnlockedThin keystatus then liftIO (doesFileExist obj) else inAnnex key u <- getUUID @@ -235,9 +235,10 @@ verifyLocationLog key keystatus ai = do {- Since we're checking that a key's object file is present, throw - in a permission fixup here too. -} when (present && not direct) $ do - void $ tryIO $ if isKeyUnlocked keystatus - then thawContent obj - else freezeContent obj + void $ tryIO $ case keystatus of + KeyUnlockedThin -> thawContent obj + KeyLockedThin -> thawContent obj + _ -> freezeContent obj unlessM (isContentWritePermOk obj) $ warning $ "** Unable to set correct write mode for " ++ obj ++ " ; perhaps you don't own that file" whenM (liftIO $ doesDirectoryExist $ parentDir obj) $ @@ -326,13 +327,11 @@ verifyAssociatedFiles key keystatus file = do forM_ fs $ \f -> unlessM (liftIO $ doesFileExist f) $ void $ Direct.removeAssociatedFile key f - goindirect = case keystatus of - KeyUnlocked -> do - f <- inRepo $ toTopFilePath file - afs <- Database.Keys.getAssociatedFiles key - unless (getTopFilePath f `elem` map getTopFilePath afs) $ - Database.Keys.addAssociatedFile key f - _ -> return () + goindirect = when (isKeyUnlockedThin keystatus) $ do + f <- inRepo $ toTopFilePath file + afs <- Database.Keys.getAssociatedFiles key + unless (getTopFilePath f `elem` map getTopFilePath afs) $ + Database.Keys.addAssociatedFile key f verifyWorkTree :: Key -> FilePath -> Annex Bool verifyWorkTree key file = do @@ -372,7 +371,7 @@ verifyWorkTree key file = do - Not checked when a file is unlocked, or in direct mode. -} checkKeySize :: Key -> KeyStatus -> ActionItem -> Annex Bool -checkKeySize _ KeyUnlocked _ = return True +checkKeySize _ KeyUnlockedThin _ = return True checkKeySize key _ ai = do file <- calcRepo $ gitAnnexLocation key ifM (liftIO $ doesFileExist file) @@ -450,7 +449,7 @@ checkBackend backend key keystatus afile = go =<< isDirect where go False = do content <- calcRepo $ gitAnnexLocation key - ifM (pure (isKeyUnlocked keystatus) <&&> (not <$> isUnmodified key content)) + ifM (pure (isKeyUnlockedThin keystatus) <&&> (not <$> isUnmodified key content)) ( nocheck , checkBackendOr badContent backend key content (mkActionItem afile) ) @@ -700,28 +699,42 @@ withFsckDb (StartIncremental h) a = a h withFsckDb NonIncremental _ = noop withFsckDb (ScheduleIncremental _ _ i) a = withFsckDb i a -data KeyStatus = KeyLocked | KeyUnlocked | KeyMissing +data KeyStatus + = KeyMissing + | KeyPresent + | KeyUnlockedThin + -- ^ An annex.thin worktree file is hard linked to the object. + | KeyLockedThin + -- ^ The object has hard links, but the file being fscked + -- is not the one that hard links to it. + deriving (Show) -isKeyUnlocked :: KeyStatus -> Bool -isKeyUnlocked KeyUnlocked = True -isKeyUnlocked KeyLocked = False -isKeyUnlocked KeyMissing = False +isKeyUnlockedThin :: KeyStatus -> Bool +isKeyUnlockedThin KeyUnlockedThin = True +isKeyUnlockedThin KeyLockedThin = False +isKeyUnlockedThin KeyPresent = False +isKeyUnlockedThin KeyMissing = False getKeyStatus :: Key -> Annex KeyStatus getKeyStatus key = ifM isDirect - ( return KeyUnlocked + ( return KeyUnlockedThin , catchDefaultIO KeyMissing $ do - unlocked <- not . null <$> Database.Keys.getAssociatedFiles key - return $ if unlocked then KeyUnlocked else KeyLocked + afs <- not . null <$> Database.Keys.getAssociatedFiles key + obj <- calcRepo $ gitAnnexLocation key + multilink <- ((> 1) . linkCount <$> liftIO (getFileStatus obj)) + return $ if multilink && afs + then KeyUnlockedThin + else KeyPresent ) getKeyFileStatus :: Key -> FilePath -> Annex KeyStatus getKeyFileStatus key file = do s <- getKeyStatus key case s of - KeyLocked -> catchDefaultIO KeyLocked $ + KeyUnlockedThin -> catchDefaultIO KeyUnlockedThin $ ifM (isJust <$> isAnnexLink file) - ( return KeyLocked - , return KeyUnlocked + ( return KeyLockedThin + , return KeyUnlockedThin ) _ -> return s + diff --git a/Command/Migrate.hs b/Command/Migrate.hs index dc0e32b932..0f9471b41d 100644 --- a/Command/Migrate.hs +++ b/Command/Migrate.hs @@ -70,7 +70,7 @@ perform file oldkey oldbackend newbackend = go =<< genkey (fastMigrate oldbacken go (Just (newkey, knowngoodcontent)) | knowngoodcontent = finish newkey | otherwise = stopUnless checkcontent $ finish newkey - checkcontent = Command.Fsck.checkBackend oldbackend oldkey Command.Fsck.KeyLocked afile + checkcontent = Command.Fsck.checkBackend oldbackend oldkey Command.Fsck.KeyPresent afile finish newkey = ifM (Command.ReKey.linkKey file oldkey newkey) ( do _ <- copyMetaData oldkey newkey diff --git a/doc/bugs/data_loss_from_dedup_when_v7_unlocked_file_is_duplicated__47__truncated.mdwn b/doc/bugs/data_loss_from_dedup_when_v7_unlocked_file_is_duplicated__47__truncated.mdwn index 663acc53cf..b989fee7dc 100644 --- a/doc/bugs/data_loss_from_dedup_when_v7_unlocked_file_is_duplicated__47__truncated.mdwn +++ b/doc/bugs/data_loss_from_dedup_when_v7_unlocked_file_is_duplicated__47__truncated.mdwn @@ -110,3 +110,5 @@ fsck ffoo ok ### Have you had any luck using git-annex before? (Sometimes we get tired of reading bug reports all day and a lil' positive end note does wonders) I'm hopeful! But just getting started :( +> git-annex fsck made to detect and clean up after this, which I consider +> sufficient, so [[done]] --[[Joey]] diff --git a/doc/bugs/data_loss_from_dedup_when_v7_unlocked_file_is_duplicated__47__truncated/comment_3_82afe6abb793aef90e3ddc42c2e5c3f2._comment b/doc/bugs/data_loss_from_dedup_when_v7_unlocked_file_is_duplicated__47__truncated/comment_3_82afe6abb793aef90e3ddc42c2e5c3f2._comment new file mode 100644 index 0000000000..e14adcc5ba --- /dev/null +++ b/doc/bugs/data_loss_from_dedup_when_v7_unlocked_file_is_duplicated__47__truncated/comment_3_82afe6abb793aef90e3ddc42c2e5c3f2._comment @@ -0,0 +1,30 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 3""" + date="2019-03-18T18:16:53Z" + content=""" +annex.thin does allow data loss to happen, and just because there's +another annex link in the repository that points at the same object +file does not mean data loss is not acceptable. + +While it might seem that, when adding the file with dup content, git-annex +could notice that the object file has been modified, and so replace it with +the new copy from ffoo, that would leave the same problem in +this equivilant situation: + + # echo hi > 1 + # echo hi > 2 + # git annex add 1 2 + # git annex unlock 1 + # echo bye > 1 + # cat 2 + bye + +The surprising thing to me is that `git annex fsck 2` (or ffoo in your +example) doesn't find any problem with it, despite it pointing at a +changed object file that doesn't have the right hash. + +Fsck doesn't want to complain about *expected* data loss when an unlocked +file has been modified and annex.thin caused the old version to be lost. +But, when fscking an annex symlink, that shouldn't apply. +"""]]