Commit graph

34923 commits

Author SHA1 Message Date
Joey Hess
8baa43ee12
tried a blind alley on streaming special remote download via proxy
This didn't work. In case I want to revisit, here's what I tried.

diff --git a/Annex/Proxy.hs b/Annex/Proxy.hs
index 48222872c1..e4e526d3dd 100644
--- a/Annex/Proxy.hs
+++ b/Annex/Proxy.hs
@@ -26,16 +26,21 @@ import Logs.UUID
 import Logs.Location
 import Utility.Tmp.Dir
 import Utility.Metered
+import Utility.ThreadScheduler
+import Utility.OpenFd
 import Git.Types
 import qualified Database.Export as Export

 import Control.Concurrent.STM
 import Control.Concurrent.Async
+import Control.Concurrent.MVar
 import qualified Data.ByteString as B
+import qualified Data.ByteString as BS
 import qualified Data.ByteString.Lazy as L
 import qualified System.FilePath.ByteString as P
 import qualified Data.Map as M
 import qualified Data.Set as S
+import System.IO.Unsafe

 proxyRemoteSide :: ProtocolVersion -> Bypass -> Remote -> Annex RemoteSide
 proxyRemoteSide clientmaxversion bypass r
@@ -240,21 +245,99 @@ proxySpecialRemote protoversion r ihdl ohdl owaitv oclosedv mexportdb = go
 		writeVerifyChunk iv h b
 		storetofile iv h (n - fromIntegral (B.length b)) bs

-	proxyget offset af k = withproxytmpfile k $ \tmpfile -> do
+	proxyget offset af k = withproxytmpfile k $ \tmpfile ->
+		let retrieve = tryNonAsync $ Remote.retrieveKeyFile r k af
+			(fromRawFilePath tmpfile) nullMeterUpdate vc
+		in case fromKey keySize k of
+			Just size | size > 0 -> do
+				cancelv <- liftIO newEmptyMVar
+				donev <- liftIO newEmptyMVar
+				streamer <- liftIO $ async $
+					streamdata offset tmpfile size cancelv donev
+				retrieve >>= \case
+					Right _ -> liftIO $ do
+						putMVar donev ()
+						wait streamer
+					Left err -> liftIO $ do
+						putMVar cancelv ()
+						wait streamer
+						propagateerror err
+			_ -> retrieve >>= \case
+				Right _ -> liftIO $ senddata offset tmpfile
+				Left err -> liftIO $ propagateerror err
+	  where
 		-- Don't verify the content from the remote,
 		-- because the client will do its own verification.
-		let vc = Remote.NoVerify
-		tryNonAsync (Remote.retrieveKeyFile r k af (fromRawFilePath tmpfile) nullMeterUpdate vc) >>= \case
-			Right _ -> liftIO $ senddata offset tmpfile
-			Left err -> liftIO $ propagateerror err
+		vc = Remote.NoVerify

+	streamdata (Offset offset) f size cancelv donev = do
+		sendlen offset size
+		waitforfile
+		x <- tryNonAsync $ do
+			fd <- openFdWithMode f ReadOnly Nothing defaultFileFlags
+			h <- fdToHandle fd
+			hSeek h AbsoluteSeek offset
+			senddata' h (getcontents size)
+		case x of
+			Left err -> do
+				throwM err
+			Right res -> return res
+	  where
+		-- The file doesn't exist at the start.
+		-- Wait for some data to be written to it as well,
+		-- in case an empty file is first created and then
+		-- overwritten. When there is an offset, wait for
+		-- the file to get that large. Note that this is not used
+		-- when the size is 0.
+		waitforfile = tryNonAsync (fromIntegral <$> getFileSize f) >>= \case
+			Right sz | sz > 0 && sz >= offset -> return ()
+			_ -> ifM (isEmptyMVar cancelv)
+				( do
+					threadDelaySeconds (Seconds 1)
+					waitforfile
+				, do
+					return ()
+				)
+
+		getcontents n h = unsafeInterleaveIO $ do
+			isdone <- isEmptyMVar donev <||> isEmptyMVar cancelv
+			c <- BS.hGet h defaultChunkSize
+			let n' = n - fromIntegral (BS.length c)
+			let c' = L.fromChunks [BS.take (fromIntegral n) c]
+			if BS.null c
+				then if isdone
+					then return mempty
+					else do
+						-- Wait for more data to be
+						-- written to the file.
+						threadDelaySeconds (Seconds 1)
+						getcontents n h
+				else if n' > 0
+					then do
+						-- unsafeInterleaveIO causes
+						-- this to be deferred until
+						-- data is read from the lazy
+						-- ByteString.
+						cs <- getcontents n' h
+						return $ L.append c' cs
+					else return c'
+
 	senddata (Offset offset) f = do
 		size <- fromIntegral <$> getFileSize f
-		let n = max 0 (size - offset)
-		sendmessage $ DATA (Len n)
+		sendlen offset size
 		withBinaryFile (fromRawFilePath f) ReadMode $ \h -> do
 			hSeek h AbsoluteSeek offset
-			sendbs =<< L.hGetContents h
+			senddata' h L.hGetContents
+
+	senddata' h getcontents = do
+			sendbs =<< getcontents h
 			-- Important to keep the handle open until
 			-- the client responds. The bytestring
 			-- could still be lazily streaming out to
@@ -272,6 +355,11 @@ proxySpecialRemote protoversion r ihdl ohdl owaitv oclosedv mexportdb = go
 				Just FAILURE -> return ()
 				Just _ -> giveup "protocol error"
 				Nothing -> return ()
+
+	sendlen offset size = do
+		let n = max 0 (size - offset)
+		sendmessage $ DATA (Len n)
+

 {- Check if this repository can proxy for a specified remote uuid,
  - and if so enable proxying for it. -}
2024-10-07 15:12:09 -04:00
Spencer
cb196337f4 additional question of spaces in URL 2024-10-07 19:10:19 +00:00
Spencer
abd56608cf 2024-10-07 19:02:17 +00:00
matrss
f650627b23 2024-10-07 14:40:19 +00:00
matrss
b0a6301cde Added a comment 2024-10-07 14:12:23 +00:00
Joey Hess
b501d23f9b
update 2024-10-07 10:06:12 -04:00
matrss
6b6ec39997 2024-10-07 13:59:56 +00:00
sng@353ca358075d9aa328f60a5439a3cee10f8301fe
b57677251b Added a comment 2024-10-06 21:42:13 +00:00
matrss
19f7b0e7d4 2024-10-02 15:07:54 +00:00
matrss
470bd1f441 2024-10-02 14:51:58 +00:00
matrss
4a794ce0ba 2024-10-02 14:42:37 +00:00
yarikoptic
13580427c8 filing an issue on yt-dlp not used for some reason 2024-10-01 21:01:40 +00:00
Joey Hess
f3403e9691
add news item for git-annex 10.20240927 2024-09-30 19:16:06 -04:00
brendan.ward@a2e11ad27f6b2fa2c556aea6811496e0d95dd0da
191e84d82a 2024-09-30 20:54:14 +00:00
mike@2d6d71f56ce2a992244350475251df87c26fe351
7b5dda33e0 removed 2024-09-27 12:18:59 +00:00
mike@2d6d71f56ce2a992244350475251df87c26fe351
39e02528f0 Added a comment: corruption using git-annex-remote-rclone 2024-09-27 12:18:41 +00:00
mike@2d6d71f56ce2a992244350475251df87c26fe351
82538a9cd3 Added a comment: corruption using git-annex-remote-rclone 2024-09-27 07:39:06 +00:00
Joey Hess
99236376e7
sim: document interruption and concurrency issues
Does not seem worth doing a lot of locking and detection of these
problems.
2024-09-26 12:26:47 -04:00
Joey Hess
783e910d0c
sim: Add metadata command
Only really needed for completeness, preferred content expressions can
match against metadata.
2024-09-26 12:20:37 -04:00
Joey Hess
b492eb051b
heading 2024-09-25 14:54:55 -04:00
Joey Hess
253f2325fb
remove example, which didn't format right in mdwn 2024-09-25 14:54:21 -04:00
Joey Hess
df7045c2e4
formatting 2024-09-25 14:53:46 -04:00
Joey Hess
854fcf9619
formatting 2024-09-25 14:50:17 -04:00
Joey Hess
49c3e1d8f3
formatting 2024-09-25 14:49:48 -04:00
Joey Hess
6a95e4edad
sim: support "--" as comment
Using this in my sim files that are also mdwn files to avoid comments
being displayed as headers.
2024-09-25 14:47:32 -04:00
Joey Hess
6f084524bd
Merge branch 'sim' 2024-09-25 14:42:27 -04:00
Joey Hess
d026e585be
update 2024-09-25 14:29:37 -04:00
Joey Hess
431499e4ff
fix tab damage that broke examples formatting in man page
When did vim default to expandtabs for mdwn? No.
2024-09-25 14:23:04 -04:00
Joey Hess
8e94b75a61
support simulating clusters
Without actually simulating cluster implementation at all. Instead, only
the essential fact that cluster gateways know what changes they have
made to each node of a cluster. That is enough for sims like
sizebalanced_cluster.
2024-09-25 14:06:41 -04:00
Joey Hess
61c95f4d29
design for simulating clusters w/o simulating cluster gateways 2024-09-25 12:58:53 -04:00
Joey Hess
b9214d4162
Revert "sim: add commands for cluster management"
This reverts commit 344141da63.

Rethinking this
2024-09-25 12:11:03 -04:00
Joey Hess
85418d6c72
update 2024-09-25 12:10:55 -04:00
Joey Hess
344141da63
sim: add commands for cluster management
Clusters are not actually simulated yet.
2024-09-25 11:48:22 -04:00
nobodyinperson
e15b8769e0 Added a comment: Re: default preferred content 2024-09-25 09:25:42 +00:00
nadir
e22272129e 2024-09-25 06:41:27 +00:00
Joey Hess
540bd5e1ab
sim: added run subcommand
And a nice sim of random preferred content expressions.
2024-09-24 12:06:34 -04:00
Joey Hess
9571162057
sim: add stepstable 2024-09-24 11:50:24 -04:00
Joey Hess
4ed58d7894
sim: random preferred content expression generation 2024-09-24 11:23:23 -04:00
Joey Hess
7cc4312695
fix state overwrite bug
I have needed to exercise a lot of care in threading st through, and I
got it wrong here. Probably using a state monad would be a good idea.
2024-09-24 10:00:38 -04:00
adehnert
ec59cb526f Added a comment: Settable default preferred content? 2024-09-24 00:02:21 +00:00
Joey Hess
76fa43e882
update test case for bug
after recent changes broke the test case

the other bug I cannot reproduce though
2024-09-23 16:05:11 -04:00
Joey Hess
969e6c2747
sped up sim step by about 200%
Noticed that it was quite slow compared with things like action
sendwanted. Guessed that the slowdown is largely due to every step
doing a simulated git pull/push.

So, rather than always doing a pull/push, only do those when no actions
are found without doing a pull/push.

This does mean that step will sometimes experience a split brain
situation, but that seems like a good thing? Because step ought to
explore as many possible scenarios as it reasonably can.
2024-09-23 15:45:47 -04:00
Joey Hess
6df101f8b4
added sim of sizebalanced in a splitbrain situation 2024-09-23 15:04:52 -04:00
Joey Hess
5a4bee24b8
fix sizebalanced empty size bug
Fix bug that prevented anything being stored in an empty repository whose
preferred content expression uses sizebalanced.
2024-09-23 14:30:18 -04:00
Joey Hess
1aacf7ece4
adds sims collection 2024-09-23 13:43:55 -04:00
Joey Hess
7bc8c2bfeb
sim visit as first-class command
Allows using it in a sim file.
2024-09-23 13:09:35 -04:00
Joey Hess
6cf9a101b8
sim: Fix size tracking for balanced preferred content 2024-09-23 12:42:32 -04:00
Joey Hess
a6b8082119
update 2024-09-23 09:38:56 -04:00
AaronBrooks
edc02432ef removed 2024-09-22 22:21:32 +00:00
AaronBrooks
8857265224 Added a comment: reinject files -- more efficiently 2024-09-22 22:21:05 +00:00
AaronBrooks
6ee1a98071 Added a comment: reinject files -- more efficiently 2024-09-22 22:19:13 +00:00
Joey Hess
2daa8a8f21
puzzling bug 2024-09-20 16:53:40 -04:00
Joey Hess
19b966f0fd
sim: better step
On each step, find all the actions that could be done, and pick one of them
to do.

Should detect stability, but that is broken.
2024-09-20 15:23:34 -04:00
Joey Hess
24b3aed84a
update 2024-09-20 11:59:35 -04:00
Joey Hess
fd24d0d66f
update 2024-09-20 11:26:40 -04:00
Joey Hess
7c10d6846c
update 2024-09-20 11:05:57 -04:00
Joey Hess
f061ae92fb
sim: implement addtree 2024-09-20 10:34:52 -04:00
Joey Hess
5e51e7c339
comment 2024-09-18 09:08:42 -04:00
Joey Hess
29d8429779
sim: tested concurrency over actions
This demonstrates concurrent behavior that looks right. And with a
random seed, the results are deterministic.

init foo
init bar
init backup
connect foo <-> bar
connect foo <-> backup
addmulti 10 testfiles 1mb 1gb foo backup
action foo gitpull backup
wanted foo nothing
wanted bar anything
wanted backup anything
action bar gitpull foo
action foo dropunwanted while action bar getwanted foo
2024-09-17 14:39:53 -04:00
Joey Hess
6751f23978
sim: fix get bug
When getting from a remote, have to check that the repo doing the
getting thinks the remote contains the key, but also that the remote
actually does. Before this bug fix, it would get from a repo that used
to have the key, but that had dropped it since the last git pull.
2024-09-17 14:29:49 -04:00
Joey Hess
02f0996e25
git-annex sim log 2024-09-17 13:43:11 -04:00
Joey Hess
b85965cb3c
sim: implement dropunwantedfrom 2024-09-17 13:35:35 -04:00
Joey Hess
eb5fad4e79
fix ActionDropUnwanted
Now tested working
2024-09-17 11:55:57 -04:00
Joey Hess
4c7db31c20
addmulti 2024-09-17 11:22:14 -04:00
Joey Hess
2a16796a1c
move pull/push/sync into getSimActionComponents
As well as being a more pleasing implementation than I managed
yesterday, this allows for those actions to be run concurrently in the
sim.
2024-09-17 10:54:44 -04:00
Joey Hess
7d27a8ea1a
sim concurrency 2024-09-17 10:37:22 -04:00
Joey Hess
3b7e3cb2f4
add 2024-09-17 08:31:55 -04:00
Joey Hess
c420ec9364
sim: add action repo sync command 2024-09-16 16:48:21 -04:00
Joey Hess
52891711d2
git-annex sim command is working
Had to add Read instances to Key and NumCopies and some other similar
types. I only expect to use those in serializing a sim. Of course, this
risks that implementation changes break reading old data. For a sim,
that would not be a big problem.
2024-09-12 16:10:52 -04:00
mike@2d6d71f56ce2a992244350475251df87c26fe351
a2895c2dac Added a comment 2024-09-12 15:40:24 +00:00
nobodyinperson
f8d1022db0 Added a comment: 👍 +1 for encrypting the annex on regular git remotes 2024-09-12 14:51:20 +00:00
Joey Hess
7e8274c6b7
implemented ActionDropUnwanted
Not tested yet. This emulates the same checking that is done when
dropping. Note that when dropping from a special remote it is not able
to make a locked copy.
2024-09-12 10:44:31 -04:00
m.szczepanik@8dd0314f20fa09be99ee3903d1c04a80eafbd849
3a03ed42e6 2024-09-12 12:13:06 +00:00
mike@2d6d71f56ce2a992244350475251df87c26fe351
0f2754ec3c Added a comment 2024-09-12 05:22:18 +00:00
yarikoptic
28d207bc57 initial report on that addunlocked is not respected during import 2024-09-11 20:47:45 +00:00
Joey Hess
f381b457f2
sim file parser and generator
The generator doesn't emit the best possible connect commands,
but it does output something valid. Eg, an input like:

connect A <-> B <-> C <-> D

becomes:

connect A <-> B <-> C
connect C <-> D

Also:

connect A -> B <- C

becomes:

connect A -> B
connect C -> B

Which could be improved.

Also disconnect commands are not prettified at all, but probably there's
no reason to.
2024-09-11 15:59:13 -04:00
Joey Hess
84bbbeae9d
started on sim file parser 2024-09-11 11:53:25 -04:00
Joey Hess
64466d8687
add action command to git-annex sim
step just picks a random action, and this allows finer control over what
happens in the sim
2024-09-09 16:06:45 -04:00
Joey Hess
a2c0d5e4a9
finish updateSimRepoState
Converted maps to use UUID as key.

Also added mincopies to the sim.
2024-09-09 09:37:59 -04:00
Joey Hess
811dd95453
maxsize of 0 to disable 2024-09-09 09:32:43 -04:00
Joey Hess
def8095e5f
rethought sim a bit 2024-09-06 12:53:20 -04:00
yarikoptic
578abf7b89 initial report on incorrect handling of empty files in adjusted branches mode 2024-09-06 14:01:34 +00:00
Joey Hess
d717e9aca0
Merge branch 'master' of ssh://git-annex.branchable.com 2024-09-05 15:25:34 -04:00
yarikoptic
f0aa5ddf3e Added a comment 2024-09-05 14:52:51 +00:00
yarikoptic
3d0dc4a91d Added a comment: ping on this issue : how to recover? 2024-09-05 14:49:07 +00:00
Joey Hess
ed740bc31e
comment 2024-09-05 09:20:38 -04:00
Joey Hess
84c781d924
documentation for git-annex sim
command not implemented yet
2024-09-04 15:03:17 -04:00
tapesafer
6412c19127 Added a comment: PS 2024-09-04 15:48:01 +00:00
Joey Hess
00e3531169
update 2024-09-04 11:36:46 -04:00
tapesafer
2c458d7116 Added a comment: numcopies & force-trusting is ignored by fsck on readonly directory remotes? 2024-09-04 14:50:16 +00:00
Rick
3f2957d0e4 Added a comment: Similar Borg sync issue 2024-09-03 19:40:57 +00:00
Joey Hess
1b6c33a38e
update 2024-09-03 14:24:32 -04:00
Joey Hess
3398514c38
sim design 2024-09-03 14:23:48 -04:00
Joey Hess
fe71400e37
fix typo 2024-09-03 14:23:14 -04:00
Joey Hess
340bdd0dac
treat "not present" in preferred content as invalid
Detect when a preferred content expression contains "not present", which
would lead to repeatedly getting and then dropping files, and make it never
match. This also applies to "not balanced" and "not sizebalanced".

--explain will tell the user when this happens

Note that getMatcher calls matchMrun' and does not check for unstable
negated limits. While there is no --present anyway, if there was,
it would not make sense for --not --present to complain about
instability and fail to match.
2024-09-03 13:50:06 -04:00
Joey Hess
03864a2c3b
update 2024-09-03 11:52:54 -04:00
Joey Hess
b800ea6826
2 level toc 2024-09-02 16:32:28 -04:00
Joey Hess
ab0c82114b
Merge branch 'master' of ssh://git-annex.branchable.com 2024-09-02 16:31:31 -04:00
Joey Hess
1e1c13dd38
fix number of headers 2024-09-02 16:31:03 -04:00
lucas.gautheron@f2b5c93a64b028c1ec8698b9c2412ed51ff22040
850ea3a9b8 2024-09-02 15:12:02 +00:00
lucas.gautheron@f2b5c93a64b028c1ec8698b9c2412ed51ff22040
925c203c09 2024-09-02 15:08:25 +00:00
Joey Hess
9d29b99ac4
add news item for git-annex 10.20240831 2024-08-31 19:50:36 -04:00
Joey Hess
698d9252a5
mention sizebalanced as well as balanced 2024-08-30 12:06:45 -04:00
Joey Hess
53b7375cc6
update 2024-08-30 11:14:45 -04:00
Joey Hess
54b6151412
document using balanced preferred content in a cluster 2024-08-30 11:08:32 -04:00
Joey Hess
d0938d730b
Merge branch 'master' into balanced 2024-08-30 11:01:39 -04:00
Joey Hess
242c525659
lookupkey: Allow using --ref in a bare repository. 2024-08-30 10:55:48 -04:00
yarikoptic
e2b7895cbc Added a comment 2024-08-29 18:35:47 +00:00
Joey Hess
f89a1b8216
remove stale live changes from reposize database
Reorganized the reposize database directory, and split up a column.

checkStaleSizeChanges needs to run before needLiveUpdate,
otherwise the process won't be holding a lock on its pid file, and
another process could go in and expire the live update it records. It
just so happens that they do get called in the correct order, since
checking balanced preferred content calls getLiveRepoSizes before
needLiveUpdate.

The 1 minute delay between checks is arbitrary, but will avoid excess
work. The downside of it is that, if a process is dropping a file and
gets interrupted, for 1 minute another process can expect a repository
will soon be smaller than it is. And so a process might send data to a
repository when a file is not really going to be dropped from it. But
note that can already happen if a drop takes some time in eg locking and
then fails. So it seems possible that live updates should only be
allowed to increase, rather than decrease the size of a repository.
2024-08-28 13:57:25 -04:00
Joey Hess
278adbb726
combine 2 queries 2024-08-28 11:00:59 -04:00
Joey Hess
e006acef22
avoid reposize database locking overhead when not needed
Only when the preferred content expression being matched uses balanced
preferred content is this overhead needed.

It might be possible to eliminate the locking entirely. Eg, check the
live changes before and after the action and re-run if they are not
stable. For now, this is good enough, it avoids existing preferred
content getting slow. If balanced preferred content turns out to be too
slow to check, that could be tried later.
2024-08-28 10:52:34 -04:00
matrss
833150fd25 Added a comment 2024-08-28 14:11:36 +00:00
mih
16f9042046 Added a comment: Needed to retrieve single file metadata from bare repo 2024-08-28 13:58:30 +00:00
matrss
3f62116d64 Added a comment 2024-08-28 08:47:33 +00:00
Joey Hess
0a119184e6
thoughts 2024-08-27 14:59:13 -04:00
Joey Hess
8555fb88ef
locking in checkLiveUpdate
This makes sure that two threads don't check balanced preferred content at the
same time, so each thread always sees a consistent picture of what is
happening.

This does add a fairly expensive file level lock to every check of
preferred content, in commands that use prepareLiveUpdate. It would
be good to only do that when live updates are actually needed, eg when
the preferred content expression uses balanced preferred content.
2024-08-27 13:12:43 -04:00
Joey Hess
4d2f95853d
closing in on finishing live reposizes
Fixed successfullyFinishedLiveSizeChange to not update the rolling total
when a redundant change is in RecentChanges.

Made setRepoSizes clear RecentChanges that are no longer needed.
It might be possible to clear those earlier, this is only a convenient
point to do it.

The reason it's safe to clear RecentChanges here is that, in order for a
live update to call successfullyFinishedLiveSizeChange, a change must be
made to a location log. If a RecentChange gets cleared, and just after
that a new live update is started, making the same change, the location
log has already been changed (since the RecentChange exists), and
so when the live update succeeds, it won't call
successfullyFinishedLiveSizeChange. The reason it doesn't
clear RecentChanges when there is a redundant live update is because
I didn't want to think through whether or not all races are avoided in
that case.

The rolling total in SizeChanges is never cleared. Instead,
calcJournalledRepoSizes gets the initial value of it, and then
getLiveRepoSizes subtracts that initial value from the current value.
Since the rolling total can only be updated by updateRepoSize,
which is called with the journal locked, locking the journal in
calcJournalledRepoSizes ensures that the database does not change while
reading the journal.
2024-08-27 12:54:46 -04:00
Spencer
949be665c0 Added contributions section to track my bugs and inquiries 2024-08-26 20:02:03 +00:00
Joey Hess
21608716bd
started work on getLiveRepoSizes
Doesn't quite compile
2024-08-26 14:50:09 -04:00
Joey Hess
db89e39df6
partially fix concurrency issue in updating the rollingtotal
It's possible for two processes or threads to both be doing the same
operation at the same time. Eg, both dropping the same key. If one
finishes and updates the rollingtotal, then the other one needs to be
prevented from later updating the rollingtotal as well. And they could
finish at the same time, or with some time in between.

Addressed this by making updateRepoSize be called with the journal
locked, and only once it's been determined that there is an actual
location change to record in the log. updateRepoSize waits for the
database to be updated.

When there is a redundant operation, updateRepoSize won't be called,
and the redundant LiveUpdate will be removed from the database on
garbage collection.

But: There will be a window where the redundant LiveUpdate is still
visible in the db, and processes can see it, combine it with the
rollingtotal, and arrive at the wrong size. This is a small window, but
it still ought to be addressed. Unsure if it would always be safe to
remove the redundant LiveUpdate? Consider the case where two drops and a
get are all running concurrently somehow, and the order they finish is
[drop, get, drop]. The second drop seems redundant to the first, but
it would not be safe to remove it. While this seems unlikely, it's hard
to rule out that a get and drop at different stages can both be running
at the same time.
2024-08-26 09:43:32 -04:00
Joey Hess
03c7f99957
todo 2024-08-25 10:48:42 -04:00
Joey Hess
2b037d36a1
update 2024-08-24 15:06:00 -04:00
Joey Hess
6660984442
update 2024-08-24 13:15:39 -04:00
Joey Hess
d60a33fd13
improve live update starting
In an expression like "balanced=foo and exclude=bar", avoid it starting
a live update when the overall expression doesn't match.
2024-08-24 13:07:05 -04:00
Joey Hess
16f945459c
todo 2024-08-24 11:58:17 -04:00
Joey Hess
2f20b939b7
LiveUpdate db updates working
I've tested the behavior of the thread that waits for the LiveUpdate to
be finished, and it does get signaled and exit cleanly when the
LiveUpdate is GCed instead.

Made finishedLiveUpdate wait for the thread to finish updating the
database.

There is a case where GC doesn't happen in time and the database is left
with a live update recorded in it. This should not be a problem as such
stale data can also happen when interrupted and will need to be detected
when loading the database.

Balanced preferred content expressions now call startLiveUpdate.
2024-08-24 11:49:58 -04:00
Joey Hess
84d1bb746b
LiveUpdate for clusters 2024-08-24 10:20:12 -04:00
Joey Hess
18cd8bf43a
punt on LiveUpdate plumbing through assistant for now 2024-08-24 09:37:24 -04:00
yarikoptic
efdee386c0 initial report on desire to do handle pathspecs 2024-08-24 01:35:31 +00:00
yarikoptic
c3877f648c initial idea on another ability for get 2024-08-24 01:23:04 +00:00
Joey Hess
c3d40b9ec3
plumb in LiveUpdate (WIP)
Each command that first checks preferred content (and/or required
content) and then does something that can change the sizes of
repositories needs to call prepareLiveUpdate, and plumb it through the
preferred content check and the location log update.

So far, only Command.Drop is done. Many other commands that don't need
to do this have been updated to keep working.

There may be some calls to NoLiveUpdate in places where that should be
done. All will need to be double checked.

Not currently in a compilable state.
2024-08-23 16:35:12 -04:00
Joey Hess
4885073377
add live size changes to RepoSize database
Not yet used.
2024-08-23 12:51:00 -04:00
Joey Hess
dad1fb150f
update 2024-08-23 11:45:36 -04:00
Joey Hess
d0ab1550ec
possible design to address reposizes concurrency issues 2024-08-23 11:19:38 -04:00
gauss@055c9051f507c97fa5612f46c74ce636f5ecde10
d71ca87bc9 Added a comment: No root privileges server - annex-shell replaced by git-annex-shell 2024-08-23 01:51:49 +00:00
Joey Hess
8ade3fc5d6
improve docs 2024-08-22 08:09:10 -04:00
Joey Hess
abdd49d8c1
update 2024-08-22 07:53:56 -04:00
Joey Hess
173500872f
update 2024-08-22 07:17:04 -04:00
Joey Hess
70e2fca257
Added the annex.fullybalancedthreshhold git config. 2024-08-22 07:15:55 -04:00
Joey Hess
3fe67744b1
display new empty repos in maxsize table
A new repo that has no location log info yet, but has an entry in
uuid.log has 0 size, so make RepoSize aware of that.

Note that a new repo that does not yet appear in uuid.log will still not
be displayed.

When a remote is added but not synced with yet, it has no uuid.log
entry. If git-annex maxsize is used to configure that remote, it needs
to appear in the maxsize table, and the change to Command.MaxSize takes
care of that.
2024-08-22 07:03:22 -04:00
Spencer
acaa8e9cd5 Added a comment: Precise Workflow 2024-08-22 00:18:28 +00:00
Joey Hess
76ece2a699
make --rebalance of balanced use fullysizebalanced when useful
When the specified number of copies is > 1, and some repositories are
too full, it can be better to move content from them to other less full
repositories, in order to make space for new content.

annex.fullybalancedthreshhold is documented, but not implemented yet

This is not tested very well yet, and is known to sometimes take several
runs to stabilize.
2024-08-21 17:59:08 -04:00
Joey Hess
9e87061de2
Support "sizebalanced=" and "fullysizebalanced=" too
Might want to make --rebalance turn balanced=group:N where N > 1
to fullysizebalanced=group:N. Have not yet determined if that will
improve situations enough to be worth the extra work.
2024-08-21 15:01:54 -04:00
Joey Hess
4e1dcc0372
bug 2024-08-21 12:18:31 -04:00
Joey Hess
476d223bce
implement fullbalanced=group:N
Rebalancing this when it gets into a suboptimal situation will need
further work.
2024-08-20 13:51:02 -04:00
Matthew
4a9e637d36 Added a comment: Help with .nfsXXXX files 2024-08-19 21:20:59 +00:00
matrss
9cfdae4c3b Added a comment 2024-08-19 10:25:13 +00:00
Joey Hess
68a99a8f48
size based rebalancing design 2024-08-18 16:25:12 -04:00
Joey Hess
99514f9d18
maxsize overview display and --json support 2024-08-18 12:08:13 -04:00
xentac
74b953cded Added a comment 2024-08-18 03:17:12 +00:00
Joey Hess
f985c58d8e
consistently don't show sizes of empty repositories
This used to be the case, and when matching options are used, that code
path still omits them, so also omit them in the getRepoSize code path.
2024-08-17 15:09:16 -04:00
Joey Hess
b62b58b50b
git-annex info speed up using getRepoSizes 2024-08-17 14:54:31 -04:00
Joey Hess
d09a005f2b
update RepoSize database from git-annex branch incrementally
The use of catObjectStream is optimally fast. Although it might be
possible to combine this with git-annex branch merge to avoid some
redundant work.

Benchmarking, a git-annex branch that had 100000 files changed
took less than 1.88 seconds to run through this.
2024-08-17 13:35:00 -04:00
Spencer
40b49e2ddd Added a comment: Remote Helper? 2024-08-17 05:33:01 +00:00
matrss
bcf876e3a0 2024-08-16 15:52:32 +00:00
matrss
f057010086 Added a comment 2024-08-16 15:45:45 +00:00
Joey Hess
61d95627f3
fix Annex.repoSize sharing between threads 2024-08-16 10:56:51 -04:00
Joey Hess
e361b9ea3c
todo 2024-08-15 16:15:48 -04:00
Joey Hess
63ccf6ffa7
todo 2024-08-15 13:50:50 -04:00
Joey Hess
4a0c7e2b2c
update 2024-08-15 13:41:47 -04:00
Joey Hess
a2da9c526b
RepoSize concurrency fix
When loading the journalled repo sizes, make sure that the current
process is prevented from making changes to the journal in another
thread.
2024-08-15 13:37:41 -04:00
Joey Hess
06064f897c
update Annex.reposizes when changing location logs
The live update is only needed when Annex.reposizes has already been
populated.
2024-08-15 13:27:14 -04:00
Joey Hess
c376b1bd7e
show message when doing possibly expensive from scratch reposize calculation 2024-08-15 12:42:36 -04:00
Joey Hess
c200523bac
implement getRepoSizes
At this point the RepoSize database is getting populated, and it
all seems to be working correctly. Incremental updates still need to be
done to make it performant.
2024-08-15 12:31:56 -04:00
Joey Hess
eac4e9391b
finalize RepoSize database
Including locking on creation, handling of permissions errors, and
setting repo sizes.

I'm confident that locking is not needed while using this database.
Since writes happen in a single transaction. When there are two writers
that are recording sizes based on different git-annex branch commits,
one will overwrite what the other one recorded. Which is fine, it's only
necessary that the database stays consistent with the content of a
git-annex branch commit.
2024-08-15 12:29:34 -04:00
Atemu
e8997d8899 Added a comment 2024-08-15 15:40:20 +00:00
Joey Hess
3e6eb2a58d
implement journalledRepoSizes
Plan is to run this when populating Annex.reposizes on demand.
So Annex.reposizes will be up-to-date with the journal, including
crucially journal entries for private repositories. But also
anything that has been written to the journal by another process,
especially if the process was run with annex.alwayscommit=false.

From there, Annex.reposizes can be kept up to date with changes made
by the running process.
2024-08-14 13:53:24 -04:00
pedro-lopes-de-azevedo
c75ecc5350 Added a comment: parameter --from not accepted 2024-08-14 14:27:54 +00:00
bvaa
11eb2ae6ec Added a comment 2024-08-14 07:18:26 +00:00
Joey Hess
90a79a6c1e
plan 2024-08-13 15:13:30 -04:00
Joey Hess
a979d8da41
update 2024-08-13 14:14:47 -04:00
Joey Hess
10d8b3cc63
fixed --rebalance stability on drop
Was checking the wrong uuid, oops
2024-08-13 13:32:11 -04:00
Joey Hess
745bc5c547
take maxsize into account for balanced preferred content
This is very inefficient, it will need to be optimised not to
calculate the sizes of repos every time.

Also, fixed a bug in balancedPicker that caused it to pick a too high
index when some repos were excluded due to being full.
2024-08-13 11:00:20 -04:00
Spencer
05a62e4e5f Added a comment: Workaround: --force-small 2024-08-13 07:05:57 +00:00
Spencer
3d252da06c Added a comment: Exact Moment Things Go Wrong 2024-08-13 06:22:11 +00:00
Spencer
ab5f920d77 .md linting 2024-08-13 04:46:53 +00:00
Spencer
8a91a8c208 2024-08-13 04:46:10 +00:00
Spencer
c4296fbd45 Added a comment: Still a Problem (on Mac?) 2024-08-13 04:21:33 +00:00
ewen
491cf67ce2 Added a comment: Most servers upgraded to TLS v1.2 EMS / TLS v1.3 2024-08-13 00:01:05 +00:00
Joey Hess
b201792391
update 2024-08-12 18:57:03 -04:00
Joey Hess
1e799e7842
update 2024-08-12 11:56:52 -04:00
Joey Hess
71043fe9f7
update 2024-08-12 10:01:48 -04:00
Joey Hess
bcd2b9a5c4
idea 2024-08-12 09:43:14 -04:00
Joey Hess
1265d7e5df
implement maxsize log and command
* maxsize: New command to tell git-annex how large the expected maximum
  size of a repository is.
* vicfg: Include maxsize configuration.
2024-08-11 15:41:26 -04:00
Joey Hess
3019b21c40
more formal documentation of balancing 2024-08-11 13:29:06 -04:00
Joey Hess
bd5affa362
use hmac in balanced preferred content
This deals with the possible security problem that someone could make an
unusually low UUID and generate keys that are all constructed to hash to
a number that, mod the number of repositories in the group, == 0.
So balanced preferred content would always put those keys in the
repository with the low UUID as long as the group contains the
number of repositories that the attacker anticipated.
Presumably the attacker then holds the data for ransom? Dunno.

Anyway, the partial solution is to use HMAC (sha256) with all the UUIDs
combined together as the "secret", and the key as the "message". Now any
change in the set of UUIDs in a group will invalidate the attacker's
constructed keys from hashing to anything in particular.

Given that there are plenty of other things someone can do if they can
write to the repository -- including modifying preferred content so only
their repository wants files, and numcopies so other repositories drop
them -- this seems like safeguard enough.

Note that, in balancedPicker, combineduuids is memoized.
2024-08-10 16:32:54 -04:00
Joey Hess
bde58e6c71
todo 2024-08-09 16:57:10 -04:00
Joey Hess
412f6057e4
todo 2024-08-09 16:47:28 -04:00
xentac
fb186ab0a8 Added a comment 2024-08-09 19:31:12 +00:00
xentac
55a5cb7904 2024-08-09 19:22:19 +00:00
Joey Hess
f1cb5cb908
wrote git-annex maxsize man page 2024-08-09 14:57:11 -04:00
Joey Hess
5a6afff3d6
left off number option 2024-08-09 14:22:05 -04:00
Joey Hess
3ce2e95a5f
balanced preferred content and --rebalance
This all works fine. But it doesn't check repository sizes yet, and
without repository size checking, once a repository gets full, there
will be no other repository that will want its files.

Use of sha2 seems unnecessary, probably adler32 or md5 or crc would have
been enough. Possibly just summing up the bytes of the key mod the number
of repositories would have sufficed. But sha2 is there, and probably
hardware accelerated. I doubt very much there is any security benefit
to using it though. If someone wants to construct a key that will be
balanced onto a given repository, sha2 is certainly not going to stop
them.
2024-08-09 14:16:09 -04:00
Joey Hess
152c87140b
update 2024-08-08 16:06:02 -04:00
Joey Hess
0959bfe5d3
update for exporttree=yes 2024-08-08 15:51:36 -04:00
Joey Hess
727b6a0b6d
update 2024-08-08 15:34:36 -04:00
Joey Hess
2616056cde
Merge branch 'exportreeplus' 2024-08-08 15:31:57 -04:00
Joey Hess
3b758aaad6
add news item for git-annex 10.20240808 2024-08-08 15:27:11 -04:00
Joey Hess
3ea835c7e8
proxied exporttree=yes versionedexport=yes remotes are not untrusted
This removes versionedExport, which was only used by the S3 special
remote. Instead, versionedexport=yes is a common way for remotes to
indicate that they are versioned.
2024-08-08 15:24:19 -04:00
Joey Hess
5c36177e58
proxied exporttree=yes remotes are untrustworthy
This is not perfect because it does not handle versioned special
remotes, which should not be untrustworthy, but now are when proxied.

The implementation turned out to be easy, because the exporttree field
is a default field, so is available in RemoteConfig even for git
remotes.
2024-08-08 14:43:53 -04:00