{-| Maintenance daemon server: periodic maintenance rounds and the
HTTP interface to the daemon's state.

-}

module Ganeti.MaintD.Server
  ( options
  , main
  , checkMain
  , prepMain
  ) where
import Control.Applicative ((<|>))
import Control.Concurrent (forkIO)
import Control.Exception.Lifted (bracket)
import Control.Monad (forever, void, unless, when, liftM)
import Control.Monad.IO.Class (liftIO)
import Data.IORef (IORef, newIORef, readIORef)
import qualified Data.Set as Set
import Snap.Core (Snap, method, Method(GET), ifTop, dir, route)
import Snap.Http.Server (httpServe)
import Snap.Http.Server.Config (Config)
import System.IO.Error (tryIOError)
import System.Time (getClockTime)
import qualified Text.JSON as J
import Ganeti.BasicTypes ( GenericResult(..), ResultT, runResultT, mkResultT
                         , withErrorT, isBad)
import qualified Ganeti.Constants as C
import Ganeti.Daemon ( OptType, CheckFn, PrepFn, MainFn, oDebug
                     , oNoVoting, oYesDoIt, oPort, oBindAddress, oNoDaemonize)
import Ganeti.Daemon.Utils (handleMasterVerificationOptions)
import qualified Ganeti.HTools.Backend.Luxi as Luxi
import Ganeti.HTools.Loader (ClusterData(..), mergeData, checkData)
import Ganeti.Jobs (waitForJobs)
import Ganeti.Logging.Lifted
import qualified Ganeti.Luxi as L
import Ganeti.MaintD.Autorepairs (harepTasks)
import Ganeti.MaintD.Balance (balanceTask)
import Ganeti.MaintD.CleanupIncidents (cleanupIncidents)
import Ganeti.MaintD.CollectIncidents (collectIncidents)
import Ganeti.MaintD.FailIncident (failIncident)
import Ganeti.MaintD.HandleIncidents (handleIncidents)
import Ganeti.MaintD.MemoryState
import qualified Ganeti.Path as Path
import Ganeti.Runtime (GanetiDaemon(GanetiMaintd))
import Ganeti.Types (JobId(..), JobStatus(..))
import Ganeti.Utils (threadDelaySeconds)
import Ganeti.Utils.Http (httpConfFromOpts, plainJSON, error404)
import Ganeti.WConfd.Client ( runNewWConfdClient, maintenanceRoundDelay
                            , maintenanceBalancing)
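
-- | Command-line options understood by the maintenance daemon.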
options :: [OptType]
options =
  [ oNoDaemonize
  , oDebug
  , oPort C.defaultMaintdPort
  , oBindAddress
  , oNoVoting
  , oYesDoIt
  ]
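
-- | Type alias for checkMain results.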
type CheckResult = ()
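
-- | Type alias for prepMain results.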
type PrepResult = Config Snap ()
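
-- | Load the cluster data from luxid and check it for consistency.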
loadClusterData :: ResultT String IO ClusterData
loadClusterData = do
  now <- liftIO getClockTime
  socket <- liftIO Path.defaultQuerySocket
  either_inp <- liftIO . tryIOError $ Luxi.loadData False socket
  input_data <- mkResultT $ case either_inp of
    Left e -> do
      let msg = show e
      logNotice $ "Couldn't read data from luxid: " ++ msg
      return $ Bad msg
    Right r -> return r
  cdata <- mkResultT . return $ mergeData [] [] [] [] now input_data
  let (msgs, nl) = checkData (cdNodes cdata) (cdInstances cdata)
  unless (null msgs) . logDebug $ "Cluster data inconsistencies: " ++ show msgs
  return $ cdata { cdNodes = nl }
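
-- | Carry out one round of maintenance: wait the configured delay,
-- check the jobs of the previous round, reload the cluster data,
-- collect and handle incidents, run the harep tasks and, if enabled,
-- balance the nodes unaffected by those operations.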
maintenance :: IORef MemoryState -> ResultT String IO ()
maintenance memstate = do
  delay <- withErrorT show $ runNewWConfdClient maintenanceRoundDelay
  liftIO $ threadDelaySeconds delay
  oldjobs <- getJobs memstate
  logDebug $ "Jobs submitted in the last round: "
             ++ show (map fromJobId oldjobs)
  luxiSocket <- liftIO Path.defaultQuerySocket
  jobresults <- bracket (mkResultT . liftM (either (Bad . show) Ok)
                           . tryIOError $ L.getLuxiClient luxiSocket)
                        (liftIO . L.closeClient)
                        $ mkResultT . waitForJobs oldjobs
  let failedjobs = map fst $ filter ((/=) JOB_STATUS_SUCCESS . snd) jobresults
  unless (null failedjobs) $ do
    logInfo . (++) "Failed jobs: " . show $ map fromJobId failedjobs
    mapM_ (failIncident memstate) failedjobs
  unless (null oldjobs)
    . liftIO $ clearJobs memstate
  logDebug "New round of maintenance started"
  cData <- loadClusterData
  let il = cdInstances cData
      nl = cdNodes cData
      gl = cdGroups cData
  cleanupIncidents memstate nl
  collectIncidents memstate nl
  nidxs <- handleIncidents memstate (gl, nl, il)
  (nidxs', jobs) <- harepTasks (nl, il) nidxs
  unless (null jobs)
    . liftIO $ appendJobs memstate jobs
  logDebug $ "Nodes unaffected by harep " ++ show (Set.toList nidxs')
             ++ ", jobs submitted " ++ show (map fromJobId jobs)
  (bal, thresh) <- withErrorT show $ runNewWConfdClient maintenanceBalancing
  when (bal && not (Set.null nidxs')) $ do
    logDebug $ "Will balance unaffected nodes, threshold " ++ show thresh
    jobs' <- balanceTask memstate (nl, il) nidxs' thresh
logDebug $ "Balancing jobs submitted: " ++ show (map fromJobId jobs')
unless (null jobs')
. liftIO $ appendJobs memstate jobs'
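
-- | Serve the part of the in-memory state selected by the given accessor.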
exposeState :: J.JSON a => (MemoryState -> a) -> IORef MemoryState -> Snap ()
exposeState selector ref = do
  state <- liftIO $ readIORef ref
  plainJSON $ selector state
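
-- | The HTTP interface of the maintenance daemon.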
httpInterface :: IORef MemoryState -> Snap ()
httpInterface memstate =
  ifTop (method GET $ plainJSON [1 :: Int])
  <|> dir "1" (ifTop (plainJSON J.JSNull)
               <|> route [ ("jobs", exposeState msJobs memstate)
                         , ("evacuated", exposeState msEvacuated memstate)
                         , ("status", exposeState msIncidents memstate)
                         ])
  <|> error404
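
-- | Check function for the maintenance daemon.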
checkMain :: CheckFn CheckResult
checkMain = handleMasterVerificationOptions
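
-- | Prepare function: obtain the HTTP server configuration from the options.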
prepMain :: PrepFn CheckResult PrepResult
prepMain opts _ = httpConfFromOpts GanetiMaintd opts
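
-- | Main function: fork the maintenance loop and serve the HTTP interface.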
main :: MainFn CheckResult PrepResult
main _ _ httpConf = do
  memstate <- newIORef emptyMemoryState
  void . forkIO . forever $ do
    res <- runResultT $ maintenance memstate
    (if isBad res then logInfo else logDebug)
      $ "Maintenance round result is " ++ show res
    when (isBad res) $ do
      logDebug "Backing off after a round with internal errors"
      threadDelaySeconds C.maintdDefaultRoundDelay
  httpServe httpConf $ httpInterface memstate