{-| Utility functions for detecting the death of a job holding resources.

To clean up resources owned by jobs that die for some reason, we need
to detect whether a job is still alive. As we have no control over PID
reuse, our approach is that each requester for a resource has to provide
a file on which it holds an exclusive lock. The kernel will make sure the
lock is removed if the process dies. We can probe for such a lock by
requesting a shared lock on the file.

-}

{-

Copyright (C) 2014 Google Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-}

module Ganeti.WConfd.DeathDetection
  ( cleanupLocksTask
  , cleanupLocks
  ) where

import Control.Concurrent (threadDelay)
import qualified Control.Exception as E
import Control.Monad
import System.Directory (removeFile)

import Ganeti.BasicTypes
import qualified Ganeti.Constants as C
import qualified Ganeti.Locking.Allocation as L
import Ganeti.Locking.Locks (ClientId(..))
import Ganeti.Logging.Lifted (logDebug, logInfo)
import Ganeti.Utils.Livelock
import Ganeti.WConfd.Monad
import Ganeti.WConfd.Persistent

-- | Interval to run clean-up tasks in microseconds
cleanupInterval :: Int
cleanupInterval = C.wconfdDeathdetectionIntervall * 1000000

-- | Decide whether the holder of a livelock file has died.
--
-- The daemon's own livelock file (first argument) is never reported as
-- dead and is not probed at all; any other path is probed with 'isDead'.
isDeadForeignLivelock :: FilePath -- ^ livelock file of this daemon
                      -> FilePath -- ^ livelock file to examine
                      -> WConfdMonad Bool
isDeadForeignLivelock mylivelock fpath
  | fpath == mylivelock = return False
  | otherwise           = liftIO (isDead fpath)

-- | Remove a file on a best-effort basis.
--
-- Any 'IOError' raised by the removal is caught and discarded, so a
-- failed removal does not abort the surrounding clean-up round.
removeFileBestEffort :: FilePath -> WConfdMonad ()
removeFileBestEffort fpath = do
  _ <- liftIO (E.try (removeFile fpath)) :: WConfdMonad (Either IOError ())
  return ()

-- | Go through all owners once and clean them up, if they're dead.
--
-- For every lock owner whose livelock file is not the daemon's own and
-- is reported dead by 'isDead', the owner's temporary reservations and
-- locks are cleaned up via 'persCleanup', and the livelock file is then
-- removed (removal errors are ignored).
cleanupLocks :: WConfdMonad ()
cleanupLocks = do
  owners <- liftM L.lockOwners readLockAllocation
  mylivelock <- liftM dhLivelock daemonHandle
  logDebug $ "Current lock owners: " ++ show owners
  forM_ owners $ \owner -> do
    let fpath = ciLockFile owner
    died <- isDeadForeignLivelock mylivelock fpath
    when died $ do
      logInfo $ show owner ++ " died, releasing locks and reservations"
      -- Reservations are released before the locks, and the livelock
      -- file is only removed after both clean-ups have been issued.
      persCleanup persistentTempRes owner
      persCleanup persistentLocks owner
      removeFileBestEffort fpath

-- | Thread periodically cleaning up locks of lock owners that died.
--
-- Each round runs 'cleanupLocks' and afterwards removes every remaining
-- livelock file (as reported by 'listLiveLocks') that is dead and not the
-- daemon's own. The round's result is discarded.
--
-- The delay between rounds is performed outside of 'runResultT', so a
-- round that stops early with a failure still waits 'cleanupInterval'
-- microseconds before the next attempt instead of retrying immediately.
cleanupLocksTask :: WConfdMonadInt ()
cleanupLocksTask = forever $ do
  _ <- runResultT $ do
    logDebug "Death detection timer fired"
    cleanupLocks
    remainingFiles <- liftIO listLiveLocks
    mylivelock <- liftM dhLivelock daemonHandle
    logDebug $ "Livelockfiles remaining: " ++ show remainingFiles
    forM_ remainingFiles $ \fpath -> do
      died <- isDeadForeignLivelock mylivelock fpath
      when died $ do
        logInfo $ "Cleaning up stale file " ++ fpath
        removeFileBestEffort fpath
  liftIO $ threadDelay cleanupInterval