implemented mkImportableContentsVersioned

Joey Hess 2019-04-19 13:39:33 -04:00
parent 1968f6d9c6
commit 55a5d9679a
GPG key ID: DB12DB0FF05F8F38
2 changed files with 94 additions and 66 deletions


@@ -199,20 +199,20 @@ s3Setup' ss u mcreds c gc
 		writeUUIDFile archiveconfig u info
 		use archiveconfig info
 
-	checkexportimportsafe c' info =
-		checkexportimportsafe' c' info
-			=<< Annex.getState Annex.force
-	checkexportimportsafe' c' info force
-		| force = return ()
+	checkexportimportsafe c' info =
+		unlessM (Annex.getState Annex.force) $
+			checkexportimportsafe' c' info
+	checkexportimportsafe' c' info
 		| versioning info = return ()
-		| exportTree c' && importTree c' = giveup $ unwords
-			[ "Combining exporttree=yes and importtree=yes"
-			, "with an unversioned S3 bucket is not safe;"
-			, "exporting can overwrite other modifications"
-			, "to files in the bucket."
-			, "Recommend you add versioning=yes."
-			, "(Or use --force if you don't mind possibly losing data.)"
-			]
+		| otherwise = when (exportTree c' && importTree c') $
+			giveup $ unwords
+				[ "Combining exporttree=yes and importtree=yes"
+				, "with an unversioned S3 bucket is not safe;"
+				, "exporting can overwrite other modifications"
+				, "to files in the bucket."
+				, "Recommend you add versioning=yes."
+				, "(Or use --force if you don't mind possibly losing data.)"
+				]
 
 store :: S3HandleVar -> Remote -> S3Info -> Maybe Magic -> Storer
 store mh r info magic = fileStorer $ \k f p -> withS3HandleOrFail (uuid r) mh $ \h -> do
@@ -479,7 +479,8 @@ listImportableContentsS3 hv r info =
 					}
 			liftIO $ print rsp'
 			continuelistunversioned h (rsp:l) rsp'
-		| otherwise = return (mklistunversioned (reverse (rsp:l)))
+		| otherwise = return $
+			mkImportableContentsUnversioned info (reverse (rsp:l))
 	continuelistversioned h l rsp
 		| S3.gbovrIsTruncated rsp = do
@@ -490,23 +491,71 @@ listImportableContentsS3 hv r info =
 					}
 			liftIO $ print rsp
 			continuelistversioned h (rsp:l) rsp'
-		| otherwise = return (mklistversioned (reverse (rsp:l)))
+		| otherwise = do
+			let v = mkImportableContentsVersioned info (reverse (rsp:l))
+			liftIO $ print v
+			return v
-	mklistunversioned l = ImportableContents
-		{ importableContents =
-			concatMap (mapMaybe go . S3.gbrContents) l
-		, importableHistory = []
-		}
-	go oi = do
-		loc <- bucketImportLocation info $
-			T.unpack $ S3.objectKey oi
-		let sz = S3.objectSize oi
-		let cid = mkS3UnversionedContentIdentifier $
-			S3.objectETag oi
-		return (loc, (cid, sz))
-	mklistversioned l = ImportableContents [] [] -- FIXME
+
+mkImportableContentsUnversioned :: S3Info -> [S3.GetBucketResponse] -> ImportableContents (ContentIdentifier, ByteSize)
+mkImportableContentsUnversioned info l = ImportableContents
+	{ importableContents = concatMap (mapMaybe extract . S3.gbrContents) l
+	, importableHistory = []
+	}
+  where
+	extract oi = do
+		loc <- bucketImportLocation info $
+			T.unpack $ S3.objectKey oi
+		let sz = S3.objectSize oi
+		let cid = mkS3UnversionedContentIdentifier $
+			S3.objectETag oi
+		return (loc, (cid, sz))
+
+mkImportableContentsVersioned :: S3Info -> [S3.GetBucketObjectVersionsResponse] -> ImportableContents (ContentIdentifier, ByteSize)
+mkImportableContentsVersioned info = build . groupfiles
+  where
+	build [] = ImportableContents [] []
+	build l =
+		let (l', v) = latestversion l
+		in ImportableContents
+			{ importableContents = mapMaybe extract v
+			, importableHistory = case build l' of
+				ImportableContents [] [] -> []
+				h -> [h]
+			}
+
+	extract ovi@(S3.ObjectVersion {}) = do
+		loc <- bucketImportLocation info $
+			T.unpack $ S3.oviKey ovi
+		let sz = S3.oviSize ovi
+		let cid = mkS3UnversionedContentIdentifier $
+			S3.oviETag ovi
+		return (loc, (cid, sz))
+	extract (S3.DeleteMarker {}) = Nothing
+
+	-- group files so all versions of a file are in a sublist,
+	-- with the newest first. S3 uses such an order, so it's just a
+	-- matter of breaking up the response list into sublists.
+	groupfiles = groupBy (\a b -> S3.oviKey a == S3.oviKey b)
+		. concatMap S3.gbovrContents
+
+	latestversion [] = ([], [])
+	latestversion ([]:rest) = latestversion rest
+	latestversion l@((first:_old):remainder) =
+		go (S3.oviLastModified first) [first] remainder
+	  where
+		go mtime c [] = (removemostrecent mtime l, reverse c)
+		go mtime c ([]:rest) = go mtime c rest
+		go mtime c ((latest:_old):rest) =
+			let !mtime' = max mtime (S3.oviLastModified latest)
+			in go mtime' (latest:c) rest
+
+	removemostrecent _ [] = []
+	removemostrecent mtime ([]:rest) = removemostrecent mtime rest
+	removemostrecent mtime (i@(curr:old):rest)
+		| S3.oviLastModified curr == mtime =
+			old : removemostrecent mtime rest
+		| otherwise =
+			i : removemostrecent mtime rest
 
 retrieveExportWithContentIdentifierS3 :: S3HandleVar -> S3Info -> ExportLocation -> ContentIdentifier -> FilePath -> Annex (Maybe Key) -> MeterUpdate -> Annex (Maybe Key)
 retrieveExportWithContentIdentifierS3 hv info loc cid dest mkkey p = undefined
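A quick aside on the grouping comment in the hunk above: Data.List.groupBy only joins adjacent equal elements, which is exactly why the key-sorted, newest-first order of the S3 response makes grouping a simple matter of splitting the list. A minimal illustration with plain (key, version) pairs standing in for the aws object version records (the name demo and the sample data are made up):

import Data.List (groupBy)
import Data.Function (on)

-- A flat, key-sorted, newest-first listing for two keys splits into
-- one sublist per key, keeping the newest-first order inside each.
demo :: [[(String, Int)]]
demo = groupBy ((==) `on` fst)
    [ ("bar", 2), ("foo", 3), ("foo", 2), ("foo", 1) ]
-- demo == [ [("bar",2)], [("foo",3),("foo",2),("foo",1)] ]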
@@ -977,13 +1026,15 @@ mkS3VersionedContentIdentifier :: S3VersionID -> ContentIdentifier
 mkS3VersionedContentIdentifier (S3VersionID _ v) =
 	ContentIdentifier $ T.encodeUtf8 v
 
+-- S3 returns etags surrounded by double quotes, and the quotes are
+-- included here.
 type S3Etag = T.Text
 
 -- For an unversioned bucket, the S3Etag is instead used as the
 -- ContentIdentifier.
 mkS3UnversionedContentIdentifier :: S3Etag -> ContentIdentifier
 mkS3UnversionedContentIdentifier t =
-	ContentIdentifier $ T.encodeUtf8 t
+	ContentIdentifier $ T.encodeUtf8 $ T.filter (/= '"') t
 
 setS3VersionID :: S3Info -> UUID -> Key -> Maybe S3VersionID -> Annex ()
 setS3VersionID info u k vid
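One detail worth noting about the etag change above: filtering out the double quotes means an etag maps to the same ContentIdentifier bytes whether or not the surrounding quotes were present in the response. A tiny sketch of that property (the etag value is made up):

import qualified Data.Text as T
import qualified Data.Text.Encoding as T

-- The quoted and unquoted forms of an etag yield identical
-- identifier bytes once the quotes are filtered out.
sameCid :: Bool
sameCid = mk (T.pack "\"cafe01\"") == mk (T.pack "cafe01")
  where
    mk = T.encodeUtf8 . T.filter (/= '"')
-- sameCid == True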


@@ -237,49 +237,26 @@ But l needs to be the same as the l above to avoid git trees proliferation.
 
 What is the algorithm here?
 
 It's got two parts, the first finds the current file tree that's
 in the bucket:
 
-1. Remove the first item from the list, and add it to the file tree.
-   (This is the most recently changed item.)
-   (If the item is a deletion, remove from list but don't add anything to
-   file tree.)
-2. Skip forward past past versions of the file from #1 to another file.
-3. Go through the rest of the list, and the first time a file is seen, add
-   it to the file tree. (Unless it's a deletion.)
-   Don't remove these from the list, unless their
-   modification time is the same as the modification time of the item in
-   #1.
-
-The second part takes the remaining list from the first part, and
-recursively generates past file trees:
-
-1. Find the most recently modified item in the list.
-2. Remove the most recently modified item from the list, and add it to the
-   file tree.
-   (If the item is a deletion, remove from list but don't add anything to
-   file tree.)
-3. Skip forward past past versions of the file from #1 to another file.
-4. Go through the rest of the list, and the first time a file is seen, add
-   it to the file tree. (Unless it's a deletion.)
-   Don't remove these from the list, unless their
-   modification time is the same as the modification time of the item in
-   #1.
-5. The file tree now corresponds to the most recent past version of the S3
-   bucket, so make a ImportableContents that has it as the
-   importableContents. For the importableHistory, recurse this function
-   again, with the remaining contents of the list.
-
-The only expensive op here is finding the most recently modified item
-in the list. There are only two possibilities for where that is in the
-list:
-
-1. It may be the first item in the list.
-2. It may be the first mention of some other file than the first
-   one mentioned in the list.
-
-So that only needs a small scan forward to the next file,
-and a single date comparison.
+1. Build a list of files with historical versions ([[a]]).
+2. Extract a snapshot from the list.
+3. Remove too new versions from the list.
+4. Recurse with the new list.
+
+Extracting a snapshot:
+
+Map over the list, taking the head version of each item and tracking
+the most recent modification time. Add the filenames to a snapshot list
+(unless the item is a deletion).
+
+Removing too new versions:
+
+Map over the list, and when the head version of a file matches the most
+recent modification time, pop it off.
+
+This results in a list that is only versions before the snapshot.
 
 Overall this is perhaps a bit better than O(n^2) because the size of the list
 decreases as it goes?
---
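To make the algorithm above concrete, here is a small self-contained sketch of the same snapshot/recursion idea on simplified data, outside of git-annex. Version, snapshots, and the sample values are made-up stand-ins for the aws response types; a delete marker is modeled as a missing contents field.

import Data.List (groupBy)
import Data.Function (on)

-- Simplified stand-in for one entry of a versioned bucket listing:
-- (file name, modification time, Just contents, or Nothing for a
-- delete marker). As in the real response, entries are sorted by
-- file name and, within a file, newest version first.
type Version = (String, Int, Maybe String)

vname :: Version -> String
vname (n, _, _) = n

vtime :: Version -> Int
vtime (_, t, _) = t

-- Step 1: build a list of files with their historical versions ([[a]]).
groupfiles :: [Version] -> [[Version]]
groupfiles = groupBy ((==) `on` vname)

-- Steps 2-4: extract a snapshot, remove too new versions, recurse.
-- The newest tree comes first, older trees follow.
snapshots :: [Version] -> [[(String, String)]]
snapshots = build . groupfiles
  where
    build [] = []
    build l =
        let (l', v) = latestversion l
        in [ (n, c) | (n, _, Just c) <- v ] : build l'

    -- Take the head version of each file, tracking the most recent
    -- modification time seen, then pop off every head version whose
    -- modification time matches it, leaving only older versions.
    latestversion l =
        let heads = map head l
            newest = maximum (map vtime heads)
            pop vs
                | vtime (head vs) == newest = tail vs
                | otherwise = vs
        in (filter (not . null) (map pop l), heads)

For example, snapshots [("bar", 2, Just "bar v1"), ("foo", 3, Just "foo v2"), ("foo", 1, Just "foo v1")] yields three trees: the current one (foo v2 and bar v1), an older one (foo v1 and bar v1), and the oldest (foo v1 only, from before bar existed). Each recursion drops at least one version from the list, which is the sense in which the real implementation's cost shrinks as it goes.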