From b40b368857bd1bc5f18526621fb266d70c5597f3 Mon Sep 17 00:00:00 2001
From: Joey Hess
Date: Fri, 2 Jun 2023 12:13:50 -0400
Subject: [PATCH] comment

---
 ..._1389db945973ed42b8ddd0de3cc8889c._comment | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 doc/bugs/importtree_spends_hours_reading_cidsdb/comment_8_1389db945973ed42b8ddd0de3cc8889c._comment

diff --git a/doc/bugs/importtree_spends_hours_reading_cidsdb/comment_8_1389db945973ed42b8ddd0de3cc8889c._comment b/doc/bugs/importtree_spends_hours_reading_cidsdb/comment_8_1389db945973ed42b8ddd0de3cc8889c._comment
new file mode 100644
index 0000000000..e30e0932dc
--- /dev/null
+++ b/doc/bugs/importtree_spends_hours_reading_cidsdb/comment_8_1389db945973ed42b8ddd0de3cc8889c._comment
@@ -0,0 +1,33 @@
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 8"""
+ date="2023-06-02T15:25:12Z"
+ content="""
+@jgoerzen if you want to take a look at the SQL, see
+`Database/ContentIdentifier.hs`. `getContentIdentifierKeys` is the query
+that it's running on each file. I'm not really sure right now whether the
+persistent schema in there actually creates an index that is used for that
+query. persistent's documentation of indexes is lacking, and I may have
+misunderstood that uniqueness constraints result in indexes being created.
+
+Dumping the database shows this, which doesn't have an index usable
+for that query (the UNIQUE constraint's index leads with `key`):
+
+    CREATE TABLE IF NOT EXISTS "content_identifiers"("id" INTEGER
+    PRIMARY KEY,"remote" BLOB NOT NULL,"cid" BLOB NOT NULL,"key" BLOB
+    NOT NULL,CONSTRAINT "content_indentifiers_key_remote_cid_index"
+    UNIQUE ("key","remote","cid"));
+
+May need some raw SQL to add one, like:
+
+    CREATE INDEX cidindex ON "content_identifiers" ("cid");
+
+Also, I re-ran the 150000 file sync benchmark with `getContentIdentifierKeys`
+disabled, and it took 29:56.78, so 25% faster.
+
+That gives me the idea for an optimisation -- it could check whether the
+database is empty at startup, and if so, avoid calling that at all. (It
+also maintains an in-memory map, which will still allow it to detect
+duplicate files.) Speeding up initial imports of a lot of files, but not
+later imports of a lot of files, is kind of a cop out, but..
+"""]]
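
For reference, a minimal sketch of how such a persistent schema is
declared. This is not the actual contents of `Database/ContentIdentifier.hs`;
the entity and field names are guessed from the dumped table above. The
point is that the trailing uniqueness constraint is what becomes the
UNIQUE constraint in the dump (persistent derives the misspelled SQL
name from the constructor name), and SQLite backs it with an automatic
index on (key, remote, cid) only:

    {-# LANGUAGE TemplateHaskell, QuasiQuotes, TypeFamilies, GADTs,
                 MultiParamTypeClasses, GeneralizedNewtypeDeriving,
                 DerivingStrategies, StandaloneDeriving,
                 UndecidableInstances, DataKinds, FlexibleInstances #-}
    module CidSchema where

    import Database.Persist.TH
    import Data.ByteString (ByteString)

    -- The last line of the entity is a uniqueness constraint. persistent
    -- derives the SQL constraint name from its constructor name (hence
    -- the misspelled "content_indentifiers_..." in the dump), and SQLite
    -- backs it with an automatic index on (key, remote, cid) only.
    share [mkPersist sqlSettings, mkMigrate "migrateCid"] [persistLowerCase|
    ContentIdentifiers
        remote ByteString
        cid ByteString
        key ByteString
        ContentIndentifiersKeyRemoteCidIndex key remote cid
    |]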
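
Under that assumed schema, the per-file lookup amounts to something
like the following sketch (not the real `getContentIdentifierKeys`;
`getKeysFor` is a hypothetical name). A query filtering on remote and
cid cannot use an index whose leftmost column is key, which is why it
falls back to scanning the table:

    import Database.Persist
    import Database.Persist.Sql (SqlPersistT)
    import Data.ByteString (ByteString)

    -- Look up every key recorded for one remote and content identifier.
    -- The filter on remote and cid cannot use the UNIQUE index, whose
    -- leftmost column is key, so without a cid index this is a full scan.
    getKeysFor :: ByteString -> ByteString -> SqlPersistT IO [ByteString]
    getKeysFor remote cid = do
        rows <- selectList
            [ ContentIdentifiersRemote ==. remote
            , ContentIdentifiersCid ==. cid ]
            []
        return (map (contentIdentifiersKey . entityVal) rows)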
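
Since the model syntax above only declares uniqueness constraints, the
raw SQL suggested in the comment could be issued through persistent's
`rawExecute` when the database is opened; `createCidIndex` is a
hypothetical helper name:

    {-# LANGUAGE OverloadedStrings #-}
    import Database.Persist.Sql (SqlPersistT, rawExecute)

    -- Issue the suggested index creation as raw SQL. IF NOT EXISTS makes
    -- it idempotent, so running it whenever the database is opened also
    -- migrates existing databases.
    createCidIndex :: SqlPersistT IO ()
    createCidIndex = rawExecute
        "CREATE INDEX IF NOT EXISTS cidindex ON content_identifiers (cid)" []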
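
And a sketch of the proposed startup check, again against the assumed
schema (`cidDatabaseIsEmpty` is a hypothetical name). It uses
`selectFirst` rather than `count`, so deciding emptiness stays cheap
even once the table is large:

    import Data.Maybe (isNothing)
    import Database.Persist (selectFirst, Filter)
    import Database.Persist.Sql (SqlPersistT)

    -- True when no content identifiers have been recorded yet, in which
    -- case the per-file lookup can be skipped for the whole import; an
    -- in-memory map of what this run records still catches duplicates.
    cidDatabaseIsEmpty :: SqlPersistT IO Bool
    cidDatabaseIsEmpty =
        isNothing <$> selectFirst ([] :: [Filter ContentIdentifiers]) []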