use export db to correctly handle duplicate files

Removed the incorrect UniqueKey constraint from the db schema; a key can
appear multiple times with different files.
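
For reference, a sketch of the Exported entity after this change (it
mirrors the Database/Export.hs hunk below; the share/mkPersist wrapper
around it is assumed boilerplate, not shown in this diff):

-- Sketch of the schema block from Database/Export.hs after this commit;
-- the surrounding share/mkPersist/persistLowerCase wrapper is assumed.
share [mkPersist sqlSettings, mkMigrate "migrateExport"] [persistLowerCase|
Exported
  key IKey
  file SFilePath
  KeyFileIndex key file
|]

In persistent, KeyFileIndex key file constrains the (key, file) pair, so
one key can map to many files; the dropped UniqueKey key had wrongly
limited each key to a single row.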

The database has to be flushed after each removal. But when adding files
to the export, many changes can be queued up without flushing, so it's
still fairly efficient.
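
That asymmetry shows up in the first diff below: additions only queue a
database change, while removals flush before querying. A minimal sketch
of the pattern, using the ExportHandle API from the diff (the helper
names recordExported and recordUnexported are illustrative, not code
from this commit):

-- Illustrative helpers only; addExportLocation, removeExportLocation,
-- flushDbQueue and getExportLocation are the real Database.Export API,
-- imports of Database.Export and the Key/ExportLocation types assumed.

-- Adding a file: just queue the change; queued writes are flushed
-- in bulk later, so exporting many files stays cheap.
recordExported :: ExportHandle -> Key -> ExportLocation -> IO ()
recordExported db k loc = addExportLocation db k loc

-- Removing a file: flush first, because getExportLocation does not
-- see queued changes. Returns True when no file in the export still
-- has this key's content.
recordUnexported :: ExportHandle -> Key -> ExportLocation -> IO Bool
recordUnexported db k loc = do
    removeExportLocation db k loc
    flushDbQueue db
    null <$> getExportLocation db k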

If large removals of files from exports are too slow, an alternative
would be to make two passes over the diff: one pass queueing deletions
from the database, then a flush, and then a second pass updating the
location log. But that would use more memory, and would need to look up
exportKey twice per removed file, so I've avoided that optimisation for
now.
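
For concreteness, a rough sketch of that rejected two-pass shape
(hypothetical code, not from this commit; it reuses identifiers that
appear in the diffs below, with forM_/whenM imports assumed):

-- Hypothetical two-pass unexport; NOT the implementation chosen here.
twoPassUnexport :: Remote -> ExportHandle -> [Git.DiffTree.DiffTreeItem] -> Annex ()
twoPassUnexport r db diffs = do
    -- Pass 1: queue every deletion; no flush per file.
    forM_ diffs $ \d -> do
        ek <- exportKey (Git.DiffTree.srcsha d)
        liftIO $ removeExportLocation db (asKey ek) (loc d)
    -- One flush for the whole batch.
    liftIO $ flushDbQueue db
    -- Pass 2: update the location log; this costs a second exportKey
    -- lookup per removed file and keeps the whole diff list in memory.
    forM_ diffs $ \d -> do
        ek <- exportKey (Git.DiffTree.srcsha d)
        whenM (liftIO $ null <$> getExportLocation db (asKey ek)) $
            logChange (asKey ek) (uuid r) InfoMissing
  where
    loc d = ExportLocation $ toInternalGitPath $
        getTopFilePath $ Git.DiffTree.file d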

This commit was supported by the NSF-funded DataLad project.
Joey Hess 2017-09-04 14:33:09 -04:00
parent 656797b4e8
commit 4da763439b
5 changed files with 31 additions and 32 deletions


@@ -113,21 +113,20 @@ seek o = do
 startExport :: Remote -> ExportHandle -> Git.LsTree.TreeItem -> CommandStart
 startExport r db ti = do
     ek <- exportKey (Git.LsTree.sha ti)
-    liftIO $ addExportLocation db (asKey ek) loc
-    stopUnless (notElem (uuid r) <$> loggedLocations (asKey ek)) $ do
+    stopUnless (liftIO $ notElem loc <$> getExportLocation db (asKey ek)) $ do
         showStart "export" f
-        next $ performExport r ek (Git.LsTree.sha ti) loc
+        next $ performExport r db ek (Git.LsTree.sha ti) loc
   where
     loc = ExportLocation $ toInternalGitPath f
     f = getTopFilePath $ Git.LsTree.file ti

-performExport :: Remote -> ExportKey -> Sha -> ExportLocation -> CommandPerform
-performExport r ek contentsha loc = do
+performExport :: Remote -> ExportHandle -> ExportKey -> Sha -> ExportLocation -> CommandPerform
+performExport r db ek contentsha loc = do
     let storer = storeExport $ exportActions r
     sent <- case ek of
         AnnexKey k -> ifM (inAnnex k)
             ( metered Nothing k $ \m -> do
-                let rollback = void $ performUnexport r ek loc
+                let rollback = void $ performUnexport r db ek loc
                 sendAnnex k rollback
                     (\f -> storer f k loc m)
             , do
@@ -142,11 +141,12 @@ performExport r ek contentsha loc = do
             liftIO $ hClose h
             storer tmp sha1k loc m
     if sent
-        then next $ cleanupExport r ek
+        then next $ cleanupExport r db ek loc
         else stop

-cleanupExport :: Remote -> ExportKey -> CommandCleanup
-cleanupExport r ek = do
+cleanupExport :: Remote -> ExportHandle -> ExportKey -> ExportLocation -> CommandCleanup
+cleanupExport r db ek loc = do
+    liftIO $ addExportLocation db (asKey ek) loc
     logChange (asKey ek) (uuid r) InfoPresent
     return True
@@ -154,23 +154,28 @@ startUnexport :: Remote -> ExportHandle -> Git.DiffTree.DiffTreeItem -> CommandStart
 startUnexport r db diff
     | Git.DiffTree.srcsha diff /= nullSha = do
         showStart "unexport" f
-        oldk <- exportKey (Git.DiffTree.srcsha diff)
-        liftIO $ removeExportLocation db (asKey oldk) loc
-        next $ performUnexport r oldk loc
+        ek <- exportKey (Git.DiffTree.srcsha diff)
+        next $ performUnexport r db ek loc
     | otherwise = stop
   where
     loc = ExportLocation $ toInternalGitPath f
     f = getTopFilePath $ Git.DiffTree.file diff

-performUnexport :: Remote -> ExportKey -> ExportLocation -> CommandPerform
-performUnexport r ek loc = do
+performUnexport :: Remote -> ExportHandle -> ExportKey -> ExportLocation -> CommandPerform
+performUnexport r db ek loc = do
     let remover = removeExport $ exportActions r
     ok <- remover (asKey ek) loc
     if ok
-        then next $ cleanupUnexport r ek
+        then next $ cleanupUnexport r db ek loc
         else stop

-cleanupUnexport :: Remote -> ExportKey -> CommandCleanup
-cleanupUnexport r ek = do
-    logChange (asKey ek) (uuid r) InfoMissing
+cleanupUnexport :: Remote -> ExportHandle -> ExportKey -> ExportLocation -> CommandCleanup
+cleanupUnexport r db ek loc = do
+    liftIO $ do
+        removeExportLocation db (asKey ek) loc
+        -- Flush so that getExportLocation sees this and any
+        -- other removals of the key.
+        flushDbQueue db
+    whenM (liftIO $ null <$> getExportLocation db (asKey ek)) $
+        logChange (asKey ek) (uuid r) InfoMissing
     return True


@@ -16,6 +16,7 @@ module Database.Export (
     closeDb,
     addExportLocation,
     removeExportLocation,
+    flushDbQueue,
     getExportLocation,
     ExportedId,
 ) where
@@ -37,7 +38,6 @@ Exported
   key IKey
   file SFilePath
   KeyFileIndex key file
-  UniqueKey key
 |]

 {- Opens the database, creating it if it doesn't exist yet. -}
@@ -74,7 +74,10 @@ removeExportLocation h k (ExportLocation f) = queueDb h $
     ik = toIKey k
     ef = toSFilePath f

-{- Doesn't know about recently queued changes. -}
+flushDbQueue :: ExportHandle -> IO ()
+flushDbQueue (ExportHandle h) = H.flushDbQueue h
+
+{- Note that this does not see recently queued changes. -}
 getExportLocation :: ExportHandle -> Key -> IO [ExportLocation]
 getExportLocation (ExportHandle h) k = H.queryDbQueue h $ do
     l <- select $ from $ \r -> do


@@ -159,6 +159,7 @@ unVerified a = do
 -- The FilePath will be relative, and may contain unix-style path
 -- separators.
 newtype ExportLocation = ExportLocation FilePath
+    deriving (Eq)

 data ExportActions a = ExportActions
     { exportSupported :: a Bool


@@ -147,7 +147,8 @@ remotes, don't let it be turned off.
 The same file contents may be in a treeish multiple times under different
 filenames. That complicates using location tracking. One file may have been
 exported and the other not, and location tracking says that the content
-is present in the export.
+is present in the export. A sqlite database is needed to keep track of
+this.

 ## recording exported filenames in git-annex branch


@@ -18,17 +18,6 @@ there need to be a new interface in supported remotes?
 Work is in progress. Todo list:

 * Use retrieveExport when getting from export remotes.
   (Needs a map from key to ExportLocation)
-* Efficient handling of renames.
-* If the same content is present in two different files, export
-  location tracking can be messed up. When one of the files is deleted and
-  that tree is exported, the location log for the key will be updated
-  to say it's not present, even though the other file is still present.
-  And, once one of the files is uploaded, the location log will
-  say the content is present, so the pass over the tree won't try to
-  upload the other file. (See design for a fix for this.)
 * Support export to additional special remotes (S3 etc)
 * Support export to external special remotes.