implemented mkImportableContentsVersioned

Joey Hess 2019-04-19 13:39:33 -04:00
parent 1968f6d9c6
commit 55a5d9679a
GPG key ID: DB12DB0FF05F8F38
2 changed files with 94 additions and 66 deletions


@@ -200,19 +200,19 @@ s3Setup' ss u mcreds c gc
     use archiveconfig info

     checkexportimportsafe c' info =
-        checkexportimportsafe' c' info
-            =<< Annex.getState Annex.force
-    checkexportimportsafe' c' info force
-        | force = return ()
+        unlessM (Annex.getState Annex.force) $
+            checkexportimportsafe' c' info
+    checkexportimportsafe' c' info
         | versioning info = return ()
-        | exportTree c' && importTree c' = giveup $ unwords
-            [ "Combining exporttree=yes and importtree=yes"
-            , "with an unversioned S3 bucket is not safe;"
-            , "exporting can overwrite other modifications"
-            , "to files in the bucket."
-            , "Recommend you add versioning=yes."
-            , "(Or use --force if you don't mind possibly losing data.)"
-            ]
+        | otherwise = when (exportTree c' && importTree c') $
+            giveup $ unwords
+                [ "Combining exporttree=yes and importtree=yes"
+                , "with an unversioned S3 bucket is not safe;"
+                , "exporting can overwrite other modifications"
+                , "to files in the bucket."
+                , "Recommend you add versioning=yes."
+                , "(Or use --force if you don't mind possibly losing data.)"
+                ]

 store :: S3HandleVar -> Remote -> S3Info -> Maybe Magic -> Storer
 store mh r info magic = fileStorer $ \k f p -> withS3HandleOrFail (uuid r) mh $ \h -> do
@@ -479,7 +479,8 @@ listImportableContentsS3 hv r info =
                }
            liftIO $ print rsp'
            continuelistunversioned h (rsp:l) rsp'
-        | otherwise = return (mklistunversioned (reverse (rsp:l)))
+        | otherwise = return $
+            mkImportableContentsUnversioned info (reverse (rsp:l))

     continuelistversioned h l rsp
         | S3.gbovrIsTruncated rsp = do
@@ -490,23 +491,71 @@ listImportableContentsS3 hv r info =
                }
            liftIO $ print rsp
            continuelistversioned h (rsp:l) rsp'
-        | otherwise = return (mklistversioned (reverse (rsp:l)))
+        | otherwise = do
+            let v = mkImportableContentsVersioned info (reverse (rsp:l))
+            liftIO $ print v
+            return v

-    mklistunversioned l = ImportableContents
-        { importableContents =
-            concatMap (mapMaybe go . S3.gbrContents) l
-        , importableHistory = []
-        }
-      where
-        go oi = do
-            loc <- bucketImportLocation info $
-                T.unpack $ S3.objectKey oi
-            let sz = S3.objectSize oi
-            let cid = mkS3UnversionedContentIdentifier $
-                S3.objectETag oi
-            return (loc, (cid, sz))
-    mklistversioned l = ImportableContents [] [] -- FIXME
+mkImportableContentsUnversioned :: S3Info -> [S3.GetBucketResponse] -> ImportableContents (ContentIdentifier, ByteSize)
+mkImportableContentsUnversioned info l = ImportableContents
+    { importableContents = concatMap (mapMaybe extract . S3.gbrContents) l
+    , importableHistory = []
+    }
+  where
+    extract oi = do
+        loc <- bucketImportLocation info $
+            T.unpack $ S3.objectKey oi
+        let sz = S3.objectSize oi
+        let cid = mkS3UnversionedContentIdentifier $
+            S3.objectETag oi
+        return (loc, (cid, sz))
+
+mkImportableContentsVersioned :: S3Info -> [S3.GetBucketObjectVersionsResponse] -> ImportableContents (ContentIdentifier, ByteSize)
+mkImportableContentsVersioned info = build . groupfiles
+  where
+    build [] = ImportableContents [] []
+    build l =
+        let (l', v) = latestversion l
+        in ImportableContents
+            { importableContents = mapMaybe extract v
+            , importableHistory = case build l' of
+                ImportableContents [] [] -> []
+                h -> [h]
+            }
+
+    extract ovi@(S3.ObjectVersion {}) = do
+        loc <- bucketImportLocation info $
+            T.unpack $ S3.oviKey ovi
+        let sz = S3.oviSize ovi
+        let cid = mkS3UnversionedContentIdentifier $
+            S3.oviETag ovi
+        return (loc, (cid, sz))
+    extract (S3.DeleteMarker {}) = Nothing
+
+    -- group files so all versions of a file are in a sublist,
+    -- with the newest first. S3 uses such an order, so it's just a
+    -- matter of breaking up the response list into sublists.
+    groupfiles = groupBy (\a b -> S3.oviKey a == S3.oviKey b)
+        . concatMap S3.gbovrContents
+
+    latestversion [] = ([], [])
+    latestversion ([]:rest) = latestversion rest
+    latestversion l@((first:_old):remainder) =
+        go (S3.oviLastModified first) [first] remainder
+      where
+        go mtime c [] = (removemostrecent mtime l, reverse c)
+        go mtime c ([]:rest) = go mtime c rest
+        go mtime c ((latest:_old):rest) =
+            let !mtime' = max mtime (S3.oviLastModified latest)
+            in go mtime' (latest:c) rest
+
+    removemostrecent _ [] = []
+    removemostrecent mtime ([]:rest) = removemostrecent mtime rest
+    removemostrecent mtime (i@(curr:old):rest)
+        | S3.oviLastModified curr == mtime =
+            old : removemostrecent mtime rest
+        | otherwise =
+            i : removemostrecent mtime rest

 retrieveExportWithContentIdentifierS3 :: S3HandleVar -> S3Info -> ExportLocation -> ContentIdentifier -> FilePath -> Annex (Maybe Key) -> MeterUpdate -> Annex (Maybe Key)
 retrieveExportWithContentIdentifierS3 hv info loc cid dest mkkey p = undefined
@@ -977,13 +1026,15 @@ mkS3VersionedContentIdentifier :: S3VersionID -> ContentIdentifier
 mkS3VersionedContentIdentifier (S3VersionID _ v) =
     ContentIdentifier $ T.encodeUtf8 v

+-- S3 returns etags surrounded by double quotes, and the quotes are
+-- included here.
 type S3Etag = T.Text

 -- For an unversioned bucket, the S3Etag is instead used as the
 -- ContentIdentifier.
 mkS3UnversionedContentIdentifier :: S3Etag -> ContentIdentifier
 mkS3UnversionedContentIdentifier t =
-    ContentIdentifier $ T.encodeUtf8 t
+    ContentIdentifier $ T.encodeUtf8 $ T.filter (/= '"') t

 setS3VersionID :: S3Info -> UUID -> Key -> Maybe S3VersionID -> Annex ()
 setS3VersionID info u k vid
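To see what that last change does: S3 reports the ETag wrapped in literal double quotes, and the filter strips them before the value is used as a ContentIdentifier. A tiny standalone sketch of the same transformation, assuming a made-up etag value; stripEtagQuotes is not a function from Remote/S3.hs:

import qualified Data.Text as T
import qualified Data.Text.IO as TIO

-- Drop the double quotes that S3 includes around the etag value,
-- exactly as mkS3UnversionedContentIdentifier does above.
stripEtagQuotes :: T.Text -> T.Text
stripEtagQuotes = T.filter (/= '"')

main :: IO ()
main = TIO.putStrLn $ stripEtagQuotes $ T.pack "\"9e107d9d372bb6826bd81d3542a419d6\""
-- prints: 9e107d9d372bb6826bd81d3542a419d6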


@@ -237,49 +237,26 @@ But l needs to be the same as the l above to avoid git trees proliferation.

 What is the algorithm here?

-It's got two parts, the first finds the current file tree that's
-in the bucket:
-
-1. Remove the first item from the list, and add it to the file tree.
-   (This is the most recently changed item.)
-   (If the item is a deletion, remove from list but don't add anything to
-   file tree.)
-2. Skip forward past the past versions of the file from #1 to another file.
-3. Go through the rest of the list, and the first time a file is seen, add
-   it to the file tree. (Unless it's a deletion.)
-   Don't remove these from the list, unless their
-   modification time is the same as the modification time of the item in
-   #1.
-
-The second part takes the remaining list from the first part, and
-recursively generates past file trees:
-
-1. Find the most recently modified item in the list.
-2. Remove the most recently modified item from the list, and add it to the
-   file tree.
-   (If the item is a deletion, remove from list but don't add anything to
-   file tree.)
-3. Skip forward past the past versions of the file from #1 to another file.
-4. Go through the rest of the list, and the first time a file is seen, add
-   it to the file tree. (Unless it's a deletion.)
-   Don't remove these from the list, unless their
-   modification time is the same as the modification time of the item in
-   #1.
-5. The file tree now corresponds to the most recent past version of the S3
-   bucket, so make an ImportableContents that has it as the
-   importableContents. For the importableHistory, recurse this function
-   again, with the remaining contents of the list.
-
-The only expensive op here is finding the most recently modified item
-in the list. There are only two possibilities for where that is in the
-list:
-
-1. It may be the first item in the list.
-2. It may be the first mention of some other file than the first
-   one mentioned in the list.
-
-So that only needs a small scan forward to the next file,
-and a single date comparison.
+1. Build a list of files with historical versions ([[a]]).
+2. Extract a snapshot from the list.
+3. Remove too-new versions from the list.
+4. Recurse with the new list.
+
+Extracting a snapshot:
+
+Map over the list, taking the head version of each item and tracking
+the most recent modification time. Add the filenames to a snapshot list
+(unless the item is a deletion).
+
+Removing too-new versions:
+
+Map over the list, and when the head version of a file matches the most
+recent modification time, pop it off. This results in a list that contains
+only versions from before the snapshot.
+
+Overall this is perhaps a bit better than O(n^2), because the size of the
+list decreases as it goes?
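To make the revised recursion concrete, here is a small self-contained Haskell sketch of the same snapshot/prune idea. It uses made-up simplified types (Version and Tree) in place of the real S3 response types and ImportableContents, and made-up names (buildTrees, latestversion, prune), so it illustrates the algorithm described above rather than reproducing the code in Remote/S3.hs.

import Data.Function (on)
import Data.List (groupBy)
import Data.Maybe (mapMaybe)

-- Hypothetical simplified stand-ins: a version is a file key, a
-- modification time, and whether it is a delete marker; a Tree is a
-- snapshot of the bucket plus its history of earlier snapshots.
data Version = Version { vKey :: String, vMTime :: Int, vDeleted :: Bool }
  deriving Show

data Tree = Tree { treeFiles :: [(String, Int)], treeHistory :: [Tree] }
  deriving Show

-- Input: every version of every file, newest first within a file, in the
-- order S3 lists them. Output: the current tree plus nested history.
buildTrees :: [Version] -> Tree
buildTrees = build . groupBy ((==) `on` vKey)
  where
    build [] = Tree [] []
    build l =
      let (l', snapshot) = latestversion l
      in Tree
        { treeFiles = mapMaybe extract snapshot
        , treeHistory = case build l' of
            Tree [] [] -> []
            t -> [t]
        }

    -- A delete marker contributes nothing to the tree.
    extract v
      | vDeleted v = Nothing
      | otherwise = Just (vKey v, vMTime v)

    -- Extracting a snapshot: take the head version of each file, and
    -- track the most recent modification time seen while doing so.
    latestversion :: [[Version]] -> ([[Version]], [Version])
    latestversion l =
      let heads = map head (filter (not . null) l)
          mtime = maximum (0 : map vMTime heads)
      in (filter (not . null) (map (prune mtime) l), heads)

    -- Removing too-new versions: pop the head of a file's version list
    -- when it carries the snapshot's most recent modification time.
    prune mtime g@(v:vs)
      | vMTime v == mtime = vs
      | otherwise = g
    prune _ [] = []

main :: IO ()
main = print $ buildTrees
  [ Version "foo" 3 False, Version "foo" 1 False
  , Version "bar" 2 False
  ]

For the data in main, this prints a current tree of foo (mtime 3) and bar (mtime 2), whose history is the snapshot of foo at mtime 1 plus bar at mtime 2, whose history in turn is just foo at mtime 1, mirroring how build recurses in mkImportableContentsVersioned above.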
---