From dad4be97c2057db1ef3a13bb983d1701a90c9069 Mon Sep 17 00:00:00 2001 From: Joey Hess Date: Thu, 22 Oct 2020 12:57:58 -0400 Subject: [PATCH] speculatively use remote's configured chunk size as a fallback When a special remote has chunking enabled, but no chunk sizes are recorded (or the recorded ones are not found), speculatively try chunks using the configured chunk size. This makes eg, git-annex fsck --from remote be able to fix up the location log of a file that the git-annex branch does not indicate is stored on the remote. Note that fsck does *not* fix up the chunk log to indicate the chunk size. So, changing the chunk config of the remote after that will still prevent accessing the chunks stored on it. Maybe fsck should, but I wanted to start with this and see if it's needed. --- CHANGELOG | 3 ++ Remote/Helper/Chunked.hs | 46 +++++++++++++++---- ..._5d11bb90105b4d0cf14f85e6d8795023._comment | 26 +++++++++++ ..._f48e4d3571a8fc2b45fbaad7a9f8fb60._comment | 29 ++++++++++++ 4 files changed, 94 insertions(+), 10 deletions(-) create mode 100644 doc/bugs/fsck_--key_without___34__chunking__34___information_in_git-annex_does_not_try_chunks/comment_1_5d11bb90105b4d0cf14f85e6d8795023._comment create mode 100644 doc/bugs/fsck_--key_without___34__chunking__34___information_in_git-annex_does_not_try_chunks/comment_2_f48e4d3571a8fc2b45fbaad7a9f8fb60._comment diff --git a/CHANGELOG b/CHANGELOG index 28c787cc16..0d585c1ae5 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -7,6 +7,9 @@ git-annex (8.20201008) UNRELEASED; urgency=medium * move: Improve resuming a move that was interrupted after the object was transferred, in cases where numcopies checks prevented the resumed move from dropping the object from the source repository. + * When a special remote has chunking enabled, but no chunk sizes are + recorded (or the recorded ones are not found), speculatively try + chunks using the configured chunk size. -- Joey Hess Thu, 08 Oct 2020 10:48:17 -0400 diff --git a/Remote/Helper/Chunked.hs b/Remote/Helper/Chunked.hs index b90e69f820..22030fb39f 100644 --- a/Remote/Helper/Chunked.hs +++ b/Remote/Helper/Chunked.hs @@ -242,7 +242,7 @@ retrieveChunks retriever u chunkconfig encryptor basek dest basep sink -- looking in the git-annex branch for chunk counts -- that are likely not there. getunchunked `catchNonAsync` - (\e -> go (Just e) =<< chunkKeysOnly u basek) + (\e -> go (Just e) =<< chunkKeysOnly u chunkconfig basek) | otherwise = go Nothing =<< chunkKeys u chunkconfig basek where go pe ls = do @@ -350,10 +350,11 @@ checkPresentChunks checker u chunkconfig encryptor basek -- looking in the git-annex branch for chunk counts -- that are likely not there. v <- check basek + let getchunkkeys = chunkKeysOnly u chunkconfig basek case v of Right True -> return True - Left e -> checklists (Just e) =<< chunkKeysOnly u basek - _ -> checklists Nothing =<< chunkKeysOnly u basek + Left e -> checklists (Just e) =<< getchunkkeys + _ -> checklists Nothing =<< getchunkkeys | otherwise = checklists Nothing =<< chunkKeys u chunkconfig basek where checklists Nothing [] = return False @@ -388,16 +389,41 @@ checkPresentChunks checker u chunkconfig encryptor basek - This finds all possible lists of keys that might be on the remote that - can be combined to get back the requested key, in order from most to - least likely to exist. + - + - Speculatively tries chunks using the ChunkConfig last of all + - (when that's not the same as the recorded chunks). This can help + - recover from data loss, where the chunk log didn't make it out, + - though only as long as the ChunkConfig is unchanged. -} chunkKeys :: UUID -> ChunkConfig -> Key -> Annex [[Key]] -chunkKeys u chunkconfig k = do - l <- chunkKeysOnly u k - return $ if noChunks chunkconfig - then [k] : l - else l ++ [[k]] +chunkKeys = chunkKeys' False -chunkKeysOnly :: UUID -> Key -> Annex [[Key]] -chunkKeysOnly u k = map (toChunkList k) <$> getCurrentChunks u k +{- Same as chunkKeys, but excluding the unchunked key. -} +chunkKeysOnly :: UUID -> ChunkConfig -> Key -> Annex [[Key]] +chunkKeysOnly = chunkKeys' True + +chunkKeys' :: Bool -> UUID -> ChunkConfig -> Key -> Annex [[Key]] +chunkKeys' onlychunks u chunkconfig k = do + recorded <- getCurrentChunks u k + let recordedl = map (toChunkList k) recorded + return $ addspeculative recorded $ if onlychunks + then recordedl + else if noChunks chunkconfig + then [k] : recordedl + else recordedl ++ [[k]] + where + addspeculative recorded l = case chunkconfig of + NoChunks -> l + UnpaddedChunks chunksz -> case fromKey keySize k of + Nothing -> l + Just keysz -> + let (d, m) = keysz `divMod` fromIntegral chunksz + chunkcount = d + if m == 0 then 0 else 1 + v = (FixedSizeChunks chunksz, chunkcount) + in if v `elem` recorded + then l + else l ++ [toChunkList k v] + LegacyChunks _ -> l toChunkList :: Key -> (ChunkMethod, ChunkCount) -> [Key] toChunkList k (FixedSizeChunks chunksize, chunkcount) = diff --git a/doc/bugs/fsck_--key_without___34__chunking__34___information_in_git-annex_does_not_try_chunks/comment_1_5d11bb90105b4d0cf14f85e6d8795023._comment b/doc/bugs/fsck_--key_without___34__chunking__34___information_in_git-annex_does_not_try_chunks/comment_1_5d11bb90105b4d0cf14f85e6d8795023._comment new file mode 100644 index 0000000000..5fc15354d9 --- /dev/null +++ b/doc/bugs/fsck_--key_without___34__chunking__34___information_in_git-annex_does_not_try_chunks/comment_1_5d11bb90105b4d0cf14f85e6d8795023._comment @@ -0,0 +1,26 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 1""" + date="2020-10-22T16:09:17Z" + content=""" +Note that what you are trying to do will only work if the special remote +is not encrypted. + +As well as your use case, which seems very unusual, I think one other use +case would be if a clone uploaded to the special remote, but never synced +out its git-annex branch before being lost, and fsck --from +remote is being run in another clone to reconstruct it. Currently it +won't try chunks as none are recorded. + +Speculatively trying the current remote's chunk config would handle the +majority of cases, though wouldn't help if the other clone had adjusted the +special remote's chunk size too. + +There's some overhead, but it can check it last, and not check it if +it's in the list of known chunks, so the overhead would only usually +be paid if the content git-annex expected to be present had gone missing, +which I think is rare enough to not care about. + +(Also, this can only be done when the size of the key is known, so not +eg addurl --relaxed keys.) +"""]] diff --git a/doc/bugs/fsck_--key_without___34__chunking__34___information_in_git-annex_does_not_try_chunks/comment_2_f48e4d3571a8fc2b45fbaad7a9f8fb60._comment b/doc/bugs/fsck_--key_without___34__chunking__34___information_in_git-annex_does_not_try_chunks/comment_2_f48e4d3571a8fc2b45fbaad7a9f8fb60._comment new file mode 100644 index 0000000000..5924108ca3 --- /dev/null +++ b/doc/bugs/fsck_--key_without___34__chunking__34___information_in_git-annex_does_not_try_chunks/comment_2_f48e4d3571a8fc2b45fbaad7a9f8fb60._comment @@ -0,0 +1,29 @@ +[[!comment format=mdwn + username="joey" + subject="""comment 2""" + date="2020-10-22T17:00:25Z" + content=""" +Implemented that. But.. + +As implemented, there's nothing to make the chunk size get stored in the +chunk log for a key, after it accesses its content using the configured +chunk size. + +So, changing the chunk= of the remote can prevent accessing content that +was accessible before. Of course, avoiding that is why chunk sizes are +logged in the first place. + +Seems like maybe fsck --from should fix the chunk log? I think +fsck would always need to be used, to fix up the location log, before any +other commands rely on the data being in the special remote, so it seems +fine to only fix the chunk log there. + +But, also a bit unclear how fsck would find out when it needs to do this. +It only needs to when the remote's configured chunk size is not +listed in the chunk log. But that's also common after changing the chunk +size of a remote. So it would have to mess around with checking the +presence of chunk keys itself, which would be extra work and also ugly +to implement. + +I'm leaving this todo^Wbug open for now due to this. +"""]]