git-annex/Assistant/Unused.hs
Joey Hess 81d402216d cache the serialization of a Key
This will speed up the common case where a Key is deserialized from
disk, but is then serialized to build eg, the path to the annex object.

Previously attempted in 4536c93bb2
and reverted in 96aba8eff7.
The problems mentioned in the latter commit are addressed now:

Read/Show of KeyData is backwards-compatible with Read/Show of Key from before
this change, so Types.Distribution will keep working.

The Eq instance is fixed.

Also, Key has smart constructors, avoiding needing to remember to update
the cached serialization.

Used git-annex benchmark:
  find is 7% faster
  whereis is 3% faster
  get when all files are already present is 5% faster
Generally, the benchmarks are running 0.1 seconds faster per 2000 files,
on a ram disk in my laptop.
2019-11-22 17:49:16 -04:00

85 lines
2.4 KiB
Haskell

{- git-annex assistant unused files
-
- Copyright 2014 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
-}
{-# LANGUAGE OverloadedStrings #-}
module Assistant.Unused where
import qualified Data.Map as M
import Assistant.Common
import qualified Git
import Logs.Unused
import Logs.Location
import Annex.Content
import Utility.DataUnits
import Utility.DiskFree
import Utility.HumanTime
import Utility.Tense
import Data.Time.Clock.POSIX
import qualified Data.Text as T
describeUnused :: Assistant (Maybe TenseText)
describeUnused = describeUnused' False
describeUnusedWhenBig :: Assistant (Maybe TenseText)
describeUnusedWhenBig = describeUnused' True
{- This uses heuristics: 1000 unused keys, or more unused keys
- than the remaining free disk space, or more than 1/10th the total
- disk space being unused keys all suggest a problem. -}
describeUnused' :: Bool -> Assistant (Maybe TenseText)
describeUnused' whenbig = liftAnnex $ go =<< readUnusedLog ""
where
go m = do
let num = M.size m
let diskused = foldl' sumkeysize 0 (M.keys m)
df <- forpath getDiskFree
disksize <- forpath getDiskSize
return $ if num == 0
then Nothing
else if not whenbig || moreused df diskused || tenthused disksize diskused
then Just $ tenseWords
[ UnTensed $ T.pack $ roughSize storageUnits False diskused
, Tensed "are" "were"
, "taken up by unused files"
]
else if num > 1000
then Just $ tenseWords
[ UnTensed $ T.pack $ show num ++ " unused files"
, Tensed "exist" "existed"
]
else Nothing
moreused Nothing _ = False
moreused (Just df) used = df <= used
tenthused Nothing _ = False
tenthused (Just disksize) used = used >= disksize `div` 10
sumkeysize s k = s + fromMaybe 0 (fromKey keySize k)
forpath a = inRepo $ liftIO . a . Git.repoPath
{- With a duration, expires all unused files that are older.
- With Nothing, expires *all* unused files. -}
expireUnused :: Maybe Duration -> Assistant ()
expireUnused duration = do
m <- liftAnnex $ readUnusedLog ""
now <- liftIO getPOSIXTime
let oldkeys = M.keys $ M.filter (tooold now) m
forM_ oldkeys $ \k -> do
debug ["removing old unused key", serializeKey k]
liftAnnex $ tryNonAsync $ do
lockContentForRemoval k removeAnnex
logStatus k InfoMissing
where
boundry = durationToPOSIXTime <$> duration
tooold now (_, mt) = case boundry of
Nothing -> True
Just b -> maybe False (\t -> now - t >= b) mt