convert incremental fsck to using a sqlite database

Did not keep backwards compat for sticky bit records. An incremental fsck
that is already in progress will start over on upgrade to this version.

This is not yet ready for merging. The autobuilders need to have sqlite
installed.

Also, interrupting a fsck --incremental does not commit the database,
so resuming with fsck --more restarts from the beginning.

Memory: Constant during a fsck of tens of thousands of files.
(But it does seem to buffer the whole transaction in memory, so it
may really scale with the number of files.)

CPU: ?
Joey Hess 2015-02-16 15:08:29 -04:00
parent bb242bdd82
commit 91e9146d1b
9 changed files with 185 additions and 177 deletions
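Both limitations noted above (no commit on interruption, memory growing with the size of the pass) appear to follow from the new Database.Handle worker running an entire pass inside a single runSqlite session, which only commits once the handle is closed. For orientation, here is a minimal sketch of how a pass drives the new Database.Fsck API; the FsckDb functions are the ones added by this commit, while the driver loop and checkKey are hypothetical stand-ins:

-- Sketch only: an illustrative driver for one incremental fsck pass.
-- keysToCheck and checkKey are made up; the FsckDb calls are real.
import Control.Monad (forM_, unless, when)
import Control.Monad.IO.Class (liftIO)
import qualified Database.Fsck as FsckDb
import Annex
import Types.Key

incrementalPass :: [Key] -> (Key -> Annex Bool) -> Annex ()
incrementalPass keysToCheck checkKey = do
    FsckDb.newPass                     -- discard any previous pass's database
                                       -- (a --more continuation would skip this)
    h <- FsckDb.openDb                 -- create and open .git/annex/fsck.db
    forM_ keysToCheck $ \k -> do
        done <- FsckDb.inDb h k        -- already fscked during this pass?
        unless done $ do
            ok <- checkKey k
            when ok $ FsckDb.addDb h k -- remember a successful fsck
    liftIO $ FsckDb.closeDb h          -- worker finishes and commits here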


@@ -1,6 +1,6 @@
{- git-annex command
-
- Copyright 2010-2013 Joey Hess <id@joeyh.name>
- Copyright 2010-2015 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU GPL version 3 or higher.
-}
@@ -26,13 +26,13 @@ import Logs.Trust
import Config.NumCopies
import Annex.UUID
import Utility.DataUnits
import Utility.FileMode
import Config
import Types.Key
import Types.CleanupActions
import Utility.HumanTime
import Git.FilePath
import Utility.PID
import qualified Database.Fsck as FsckDb
import Data.Time.Clock.POSIX
import Data.Time
@@ -72,6 +72,7 @@ seek ps = do
(\k -> startKey i k =<< getNumCopies)
(withFilesInGit $ whenAnnexed $ start from i)
ps
withFsckDb i (liftIO . FsckDb.closeDb)
getIncremental :: Annex Incremental
getIncremental = do
@@ -82,15 +83,17 @@ getIncremental = do
case (i, starti, morei) of
(False, False, False) -> return NonIncremental
(False, True, False) -> startIncremental
(False ,False, True) -> ContIncremental <$> getStartTime
(False ,False, True) -> contIncremental
(True, False, False) ->
maybe startIncremental (return . ContIncremental . Just)
maybe startIncremental (const contIncremental)
=<< getStartTime
_ -> error "Specify only one of --incremental, --more, or --incremental-schedule"
where
startIncremental = do
recordStartTime
return StartIncremental
FsckDb.newPass
StartIncremental <$> FsckDb.openDb
contIncremental = ContIncremental <$> FsckDb.openDb
checkschedule Nothing = error "bad --incremental-schedule value"
checkschedule (Just delta) = do
@@ -415,8 +418,7 @@ badContentRemote remote key = do
return $ (if ok then "dropped from " else "failed to drop from ")
++ Remote.name remote
data Incremental = StartIncremental | ContIncremental (Maybe EpochTime) | NonIncremental
deriving (Eq, Show)
data Incremental = StartIncremental FsckDb.DbHandle | ContIncremental FsckDb.DbHandle | NonIncremental
runFsck :: Incremental -> FilePath -> Key -> Annex Bool -> CommandStart
runFsck inc file key a = ifM (needFsck inc key)
@@ -425,48 +427,23 @@ runFsck inc file key a = ifM (needFsck inc key)
next $ do
ok <- a
when ok $
recordFsckTime key
recordFsckTime inc key
next $ return ok
, stop
)
{- Check if a key needs to be fscked, with support for incremental fscks. -}
needFsck :: Incremental -> Key -> Annex Bool
needFsck (ContIncremental Nothing) _ = return True
needFsck (ContIncremental starttime) key = do
fscktime <- getFsckTime key
return $ fscktime < starttime
needFsck (ContIncremental h) key = not <$> FsckDb.inDb h key
needFsck _ _ = return True
{- To record the time that a key was last fscked, without
- modifying its mtime, we set the timestamp of its parent directory.
- Each annexed file is the only thing in its directory, so this is fine.
-
- To record that the file was fscked, the directory's sticky bit is set.
- (None of the normal unix behaviors of the sticky bit should matter, so
- we can reuse this permission bit.)
-
- Note that this relies on the parent directory being deleted when a file
- is dropped. That way, if it's later added back, the fsck record
- won't still be present.
-}
recordFsckTime :: Key -> Annex ()
recordFsckTime key = do
parent <- parentDir <$> calcRepo (gitAnnexLocation key)
liftIO $ void $ tryIO $ do
touchFile parent
#ifndef mingw32_HOST_OS
setSticky parent
#endif
withFsckDb :: Incremental -> (FsckDb.DbHandle -> Annex ()) -> Annex ()
withFsckDb (ContIncremental h) a = a h
withFsckDb (StartIncremental h) a = a h
withFsckDb NonIncremental _ = noop
getFsckTime :: Key -> Annex (Maybe EpochTime)
getFsckTime key = do
parent <- parentDir <$> calcRepo (gitAnnexLocation key)
liftIO $ catchDefaultIO Nothing $ do
s <- getFileStatus parent
return $ if isSticky $ fileMode s
then Just $ modificationTime s
else Nothing
recordFsckTime :: Incremental -> Key -> Annex ()
recordFsckTime inc key = withFsckDb inc $ \h -> FsckDb.addDb h key
{- Records the start time of an incremental fsck.
-

Database/Fsck.hs (new file, 71 lines)

@@ -0,0 +1,71 @@
{- Sqlite database used for incremental fsck.
-
- Copyright 2015 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU GPL version 3 or higher.
-}
{-# LANGUAGE QuasiQuotes, TypeFamilies, TemplateHaskell #-}
{-# LANGUAGE OverloadedStrings, GADTs, FlexibleContexts #-}
module Database.Fsck (
newPass,
openDb,
H.closeDb,
H.DbHandle,
addDb,
inDb,
FsckedId,
) where
import Database.Types
import qualified Database.Handle as H
import Locations
import Utility.Directory
import Annex
import Types.Key
import Annex.Perms
import Database.Persist.TH
import Database.Esqueleto hiding (Key)
import Control.Monad
import Control.Monad.IfElse
import Control.Monad.IO.Class (liftIO)
import System.Directory
{- Each key stored in the database has already been fscked as part
- of the latest incremental fsck pass. -}
share [mkPersist sqlSettings, mkMigrate "migrateFsck"] [persistLowerCase|
Fscked
key SKey
UniqueKey key
deriving Show
|]
{- The database is removed when starting a new incremental fsck pass. -}
newPass :: Annex ()
newPass = liftIO . nukeFile =<< fromRepo gitAnnexFsckDb
{- Opens the database, creating it atomically if it doesn't exist yet. -}
openDb :: Annex H.DbHandle
openDb = do
db <- fromRepo gitAnnexFsckDb
unlessM (liftIO $ doesFileExist db) $ do
let newdb = db ++ ".new"
h <- liftIO $ H.openDb newdb
void $ liftIO $ H.runDb h $
runMigrationSilent migrateFsck
liftIO $ H.closeDb h
setAnnexFilePerm newdb
liftIO $ renameFile newdb db
liftIO $ H.openDb db
addDb :: H.DbHandle -> Key -> Annex ()
addDb h = void . liftIO . H.runDb h . insert . Fscked . toSKey
inDb :: H.DbHandle -> Key -> Annex Bool
inDb h k = liftIO $ H.runDb h $ do
r <- select $ from $ \r -> do
where_ (r ^. FsckedKey ==. val (toSKey k))
return (r ^. FsckedKey)
return $ not $ null r
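As an aside (not part of this commit), the membership test could equivalently be phrased with persistent's getBy against the UniqueKey constraint declared in the schema above; a sketch, assuming getBy and Data.Maybe's isJust are in scope alongside the module's existing imports:

-- Sketch only: inDb expressed via getBy and the UniqueKey constraint
-- instead of an esqueleto select.
inDb' :: H.DbHandle -> Key -> Annex Bool
inDb' h k = liftIO $ H.runDb h $
    isJust `fmap` getBy (UniqueKey (toSKey k))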

Database/Handle.hs (new file, 63 lines)

@@ -0,0 +1,63 @@
{- Persistent sqlite database handles.
-
- Copyright 2015 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU GPL version 3 or higher.
-}
module Database.Handle (
DbHandle,
openDb,
closeDb,
runDb,
) where
import Utility.Exception
import Database.Persist.Sqlite (runSqlite)
import Database.Esqueleto hiding (Key)
import Control.Monad.IO.Class (liftIO)
import Control.Concurrent
import Control.Concurrent.Async
import Control.Exception (throwIO)
import qualified Data.Text as T
{- A DbHandle is a reference to a worker thread that communicates with
- the database. It has a MVar which Jobs are submitted to. -}
data DbHandle = DbHandle (Async ()) (MVar Job)
data Job = Job (SqlPersistM ()) | CloseJob
openDb :: FilePath -> IO DbHandle
openDb db = do
jobs <- newEmptyMVar
worker <- async (workerThread db jobs)
return $ DbHandle worker jobs
workerThread :: FilePath -> MVar Job -> IO ()
workerThread db jobs = runSqlite (T.pack db) go
where
go = do
job <- liftIO $ takeMVar jobs
case job of
Job a -> a >> go
CloseJob -> return ()
closeDb :: DbHandle -> IO ()
closeDb (DbHandle worker jobs) = do
putMVar jobs CloseJob
wait worker
{- Runs an action using the DbHandle.
-
- Note that the action is not run by the calling thread, but by a
- worker thread. Exceptions are propagated to the calling thread.
-
- Note that only one action can be run at a time against a given DbHandle.
- If called concurrently, this will block until it is able to run.
-}
runDb :: DbHandle -> SqlPersistM a -> IO a
runDb (DbHandle _ jobs) a = do
res <- newEmptyMVar
putMVar jobs $ Job $ liftIO . putMVar res =<< tryNonAsync a
either throwIO return =<< takeMVar res
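A quick sketch of the intended calling pattern for this module (assumed usage, not code from this commit): open a handle, funnel all database work through runDb, and close the handle so the worker can finish its runSqlite session.

-- Sketch only: typical DbHandle usage. The SqlPersistM action passed in
-- is arbitrary; exceptions it throws propagate to the caller of runDb.
withDbHandle :: FilePath -> SqlPersistM () -> IO ()
withDbHandle dbfile action = do
    h <- openDb dbfile   -- spawns the worker thread holding the sqlite session
    runDb h action       -- runs inside the worker
    closeDb h            -- CloseJob: the worker drains and the session ends

Since every Job runs inside the single runSqlite call in workerThread, nothing is durable until closeDb is reached, which matches the interruption behavior described in the commit message.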

Database/Types.hs (new file, 27 lines)

@@ -0,0 +1,27 @@
{- types for SQL databases
-
- Copyright 2015 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU GPL version 3 or higher.
-}
{-# LANGUAGE TemplateHaskell #-}
module Database.Types where
import Database.Persist.TH
import Data.Maybe
import Types.Key
-- A serialized Key
newtype SKey = SKey String
deriving (Show, Read)
toSKey :: Key -> SKey
toSKey = SKey . key2file
fromSKey :: SKey -> Key
fromSKey (SKey s) = fromMaybe (error $ "bad serialized key " ++ s) (file2key s)
derivePersistField "SKey"
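For illustration, SKey simply carries the key2file serialization, so converting to SKey and back is lossless for any key that file2key accepts. A sketch ("WORM--example" is a made-up key name):

-- Sketch only: round-tripping a Key through SKey, using the Types.Key
-- functions imported above.
skeyRoundTrips :: Bool
skeyRoundTrips = case file2key "WORM--example" of
    Nothing -> False
    Just k -> key2file (fromSKey (toSKey k)) == key2file k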

@@ -1,20 +0,0 @@
{-# LANGUAGE QuasiQuotes, TypeFamilies, GeneralizedNewtypeDeriving, TemplateHaskell,
OverloadedStrings, GADTs, FlexibleContexts #-}
module Footype where
import Database.Persist hiding (Key)
import Database.Persist.TH
import Database.Persist.Sqlite hiding (Key)
import Control.Monad.IO.Class (liftIO)
import Control.Monad
import Data.Time.Clock
import Types.Key
import Types.UUID
import Types.MetaData
-- has to be in a separate file from foo.hs for silly reasons
derivePersistField "Key"
derivePersistField "UUID"
derivePersistField "MetaField"

@@ -57,6 +57,7 @@ module Locations (
gitAnnexSshDir,
gitAnnexRemotesDir,
gitAnnexAssistantDefaultDir,
gitAnnexFsckDb,
isLinkToAnnex,
HashLevels(..),
hashDirMixed,
@@ -340,6 +341,10 @@ gitAnnexRemotesDir r = addTrailingPathSeparator $ gitAnnexDir r </> "remotes"
gitAnnexAssistantDefaultDir :: FilePath
gitAnnexAssistantDefaultDir = "annex"
{- Database used to record fsck info. -}
gitAnnexFsckDb :: Git.Repo -> FilePath
gitAnnexFsckDb r = gitAnnexDir r </> "fsck.db"
{- Checks a symlink target to see if it appears to point to annexed content.
-
- We only look at paths inside the .git directory, and not at the .git

foo.hs (deleted, 74 lines)

@@ -1,74 +0,0 @@
{-# LANGUAGE QuasiQuotes, TypeFamilies, GeneralizedNewtypeDeriving, TemplateHaskell,
OverloadedStrings, GADTs, FlexibleContexts #-}
import Database.Persist hiding (Key)
import Database.Persist.TH
import Database.Persist.Sqlite hiding (Key)
import Control.Monad.IO.Class (liftIO)
import Control.Monad
import Data.Time.Clock
import Data.Maybe
import Types.Key
import Types.UUID
import Types.MetaData
import Footype
data RemoteFsckTime
share [mkPersist sqlSettings, mkSave "entityDefs", mkMigrate "migrateAll"] [persistLowerCase|
CachedKey
key Key
KeyOutdex key
deriving Show
AssociatedFiles
keyId CachedKeyId Eq
associatedFile FilePath
KeyIdOutdex keyId associatedFile
deriving Show
CachedMetaField
field MetaField
FieldOutdex field
deriving Show
CachedMetaData
keyId CachedKeyId Eq
fieldId CachedMetaFieldId Eq
metaValue String
deriving Show
LastFscked
keyId CachedKeyId Eq
localFscked Int Maybe
|]
main :: IO ()
main = query
query :: IO ()
query = runSqlite "foo.db" $ do
forM_ [1..1000] $ \i -> do
Just k <- getBy $ KeyOutdex (fromJust $ file2key $ "WORM--" ++ show i)
selectList [AssociatedFilesKeyId ==. entityKey k] []
query2 :: IO ()
query2 = runSqlite "foo.db" $ do
forM_ [1..1] $ \i -> do
Just f <- getBy $ FieldOutdex (fromJust $ toMetaField "tag")
liftIO $ print f
fs <- selectList [CachedMetaDataFieldId ==. entityKey f] []
liftIO $ print $ length fs
populate :: IO ()
populate = runSqlite "foo.db" $ do
runMigration migrateAll
t <- insert $ CachedMetaField (fromJust $ toMetaField "tag")
f <- insert $ CachedMetaField (fromJust $ toMetaField "foo")
forM_ [1..30000] $ \i -> do
k <- insert $ CachedKey (fromJust $ file2key $ "WORM--" ++ show i)
liftIO $ print k
insert $ AssociatedFiles k (show i)
insert $ AssociatedFiles k ("and" ++ show (i + 1))
insert $ CachedMetaData k f (show i)
insert $ CachedMetaData k t "bar"

@@ -1,42 +0,0 @@
{-# LANGUAGE QuasiQuotes, TypeFamilies, GeneralizedNewtypeDeriving, TemplateHaskell,
OverloadedStrings, GADTs, FlexibleContexts #-}
import Database.Persist.TH
import Database.Persist.Sqlite (runSqlite)
import Control.Monad.IO.Class (liftIO)
import Control.Monad
import Database.Esqueleto hiding (Key)
share [mkPersist sqlSettings, mkMigrate "migrateAll"] [persistLowerCase|
CachedKey
key String
UniqueKey key
deriving Show
AssociatedFiles
key CachedKeyId Eq
file FilePath
UniqueKeyFile key file
deriving Show
|]
main :: IO ()
main = runSqlite "foo.db" $ do
runMigration migrateAll
if True then populate else return ()
query
populate = do
forM_ [1..30000] $ \i -> do
--delete $ from $ \f -> do
-- where_ (f ^. CachedKeyKey ==. val (show i))
k <- insert $ CachedKey (show i)
liftIO $ print ("stored", k)
insert $ AssociatedFiles k ("file" ++show (i+1))
--insert $ AssociatedFiles k ("otherfile" ++show (i+2))
query = forM_ [1..1000] $ \i -> do
r <- select $ from $ \(k, f) -> do
where_ (k ^. CachedKeyKey ==. val (show i))
where_ (f ^. AssociatedFilesKey ==. k ^. CachedKeyId)
return (f ^. AssociatedFilesFile)
liftIO $ print ("got", r)

@@ -110,7 +110,8 @@ Executable git-annex
IfElse, text, QuickCheck >= 2.1, bloomfilter, edit-distance,
SafeSemaphore, uuid, random, dlist, unix-compat, async, stm (>= 2.3),
data-default, case-insensitive, http-conduit, http-types,
cryptohash (>= 0.10.0)
cryptohash (>= 0.10.0),
esqueleto, persistent-sqlite, persistent, persistent-template
CC-Options: -Wall
GHC-Options: -Wall
Extensions: PackageImports