module Ganeti.MaintD.HandleIncidents
( handleIncidents
) where
import Control.Arrow ((&&&))
import Control.Exception.Lifted (bracket)
import Control.Lens.Setter (over)
import Control.Monad (foldM)
import Control.Monad.IO.Class (liftIO)
import qualified Data.ByteString.UTF8 as UTF8
import Data.Function (on)
import Data.IORef (IORef)
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Text.JSON as J
import Ganeti.BasicTypes ( GenericResult(..), ResultT, mkResultT, Down(..))
import qualified Ganeti.Constants as C
import Ganeti.HTools.AlgorithmParams (AlgorithmOptions(..), defaultOptions)
import Ganeti.HTools.Cluster.Evacuate (tryNodeEvac, EvacSolution(..))
import qualified Ganeti.HTools.Container as Container
import qualified Ganeti.HTools.Group as Group
import qualified Ganeti.HTools.Instance as Instance
import qualified Ganeti.HTools.Node as Node
import Ganeti.HTools.Types (Idx)
import Ganeti.JQueue (currentTimestamp)
import Ganeti.Jobs (execJobsWaitOkJid, submitJobs, forceFailover)
import Ganeti.Logging.Lifted
import qualified Ganeti.Luxi as L
import Ganeti.MaintD.MemoryState ( MemoryState, getIncidents, rmIncident
, updateIncident, appendJobs)
import Ganeti.MaintD.Utils (annotateOpCode, getRepairCommand)
import Ganeti.Objects.Lens (incidentJobsL)
import Ganeti.Objects.Maintenance ( RepairStatus(..), RepairAction(..)
, Incident(..))
import Ganeti.OpCodes (OpCode(..), MetaOpCode)
import qualified Ganeti.Path as Path
import Ganeti.Types ( cTimeOf, uuidOf, mkNonEmpty, fromJobId
, EvacMode(..), TagKind(..))
import Ganeti.Utils (maxBy, logAndBad)
moreSevereIncident :: Incident -> Incident -> Incident
moreSevereIncident = maxBy (compare `on` incidentAction &&& (Down . cTimeOf))
rankIncidents :: [Incident] -> Map.Map String Incident
rankIncidents = foldl (\m i -> Map.insertWith moreSevereIncident
(incidentNode i) i m) Map.empty
drainJob :: String -> ResultT String IO [ MetaOpCode ]
drainJob name = do
name' <- mkNonEmpty name
now <- liftIO currentTimestamp
return $ map (annotateOpCode ("Draining " ++ name) now)
[ OpNodeSetParams { opNodeName = name'
, opNodeUuid = Nothing
, opForce = True
, opHvState = Nothing
, opDiskState = Nothing
, opMasterCandidate = Nothing
, opOffline = Nothing
, opDrained = Just True
, opAutoPromote = False
, opMasterCapable = Nothing
, opVmCapable = Nothing
, opSecondaryIp = Nothing
, opgenericNdParams = Nothing
, opPowered = Nothing
, opVerbose = False
, opDebug = False
} ]
handleEvacuation :: L.Client
-> IORef MemoryState
-> (Group.List, Node.List, Instance.List)
-> Idx
-> Bool
-> Set.Set Int
-> Incident
-> ResultT String IO (Set.Set Int)
handleEvacuation client memst (gl, nl, il) ndx migrate freenodes incident = do
let node = Container.find ndx nl
name = Node.name node
fNdNames = map (Node.name . flip Container.find nl) $ Set.elems freenodes
evacOpts = defaultOptions { algEvacMode = True
, algIgnoreSoftErrors = True
, algRestrictToNodes = Just fNdNames
}
evacFun = tryNodeEvac evacOpts gl nl il
migrateFun = if migrate then id else forceFailover
annotateFun = annotateOpCode $ "Evacuating " ++ name
pendingIncident = incident { incidentRepairStatus = RSPending }
updateJobs jids_r = case jids_r of
Ok jids -> do
let incident' = over incidentJobsL (++ jids) pendingIncident
liftIO $ updateIncident memst incident'
liftIO $ appendJobs memst jids
logDebug $ "Jobs submitted: " ++ show (map fromJobId jids)
Bad e -> mkResultT . logAndBad
$ "Failure evacuating " ++ name ++ ": " ++ e
logInstName i = logInfo $ "Evacuating instance "
++ Instance.name (Container.find i il)
++ " from " ++ name
execSol sol = do
now <- liftIO currentTimestamp
let jobs = map (map (annotateFun now . migrateFun)) $ esOpCodes sol
jids <- liftIO $ submitJobs jobs client
updateJobs jids
let touched = esMoved sol >>= \(_, _, nidxs) -> nidxs
return $ freenodes Set.\\ Set.fromList touched
logDebug $ "Handling evacuation of " ++ name
case () of _ | not $ Node.offline node -> do
logDebug $ "Draining node " ++ name
job <- drainJob name
jids <- liftIO $ submitJobs [job] client
updateJobs jids
return freenodes
| i:_ <- Node.pList node -> do
logInstName i
(_, _, sol) <- mkResultT . return $ evacFun ChangePrimary [i]
execSol sol
| i:_ <- Node.sList node -> do
logInstName i
(_, _, sol) <- mkResultT . return
$ evacFun ChangeSecondary [i]
execSol sol
| otherwise -> do
logDebug $ "Finished evacuation of " ++ name
now <- liftIO currentTimestamp
jids <- mkResultT $ execJobsWaitOkJid
[[ annotateFun now
. OpTagsSet TagKindNode [ incidentTag incident ]
$ Just name]] client
let incident' = over incidentJobsL (++ jids)
$ incident { incidentRepairStatus =
RSCompleted }
liftIO $ updateIncident memst incident'
liftIO $ appendJobs memst jids
return freenodes
handleLiveRepairs :: L.Client
-> IORef MemoryState
-> Idx
-> Set.Set Int
-> Incident
-> ResultT String IO (Set.Set Int)
handleLiveRepairs client memst ndx freenodes incident = do
let maybeCmd = getRepairCommand incident
uuid = incidentUuid incident
name = incidentNode incident
now <- liftIO currentTimestamp
logDebug $ "Handling requested command " ++ show maybeCmd ++ " on " ++ name
case () of
_ | null $ incidentJobs incident,
Just cmd <- maybeCmd,
cmd /= "" -> do
logDebug "Submitting repair command job"
name' <- mkNonEmpty name
cmd' <- mkNonEmpty cmd
orig' <- mkNonEmpty . J.encode $ incidentOriginal incident
jids_r <- liftIO $ submitJobs
[[ annotateOpCode "repair command requested by node" now
OpRepairCommand { opNodeName = name'
, opRepairCommand = cmd'
, opInput = Just orig'
} ]] client
case jids_r of
Ok jids -> do
let incident' = over incidentJobsL (++ jids) incident
liftIO $ updateIncident memst incident'
liftIO $ appendJobs memst jids
logDebug $ "Jobs submitted: " ++ show (map fromJobId jids)
Bad e -> mkResultT . logAndBad
$ "Failure requesting command " ++ cmd ++ " on " ++ name
++ ": " ++ e
| null $ incidentJobs incident -> do
logInfo $ "Marking incident " ++ UTF8.toString uuid ++ " as failed;"
++ " command for live repair not specified"
let newtag = C.maintdFailureTagPrefix ++ UTF8.toString uuid
jids <- mkResultT $ execJobsWaitOkJid
[[ annotateOpCode "marking incident as ill specified" now
. OpTagsSet TagKindNode [ newtag ]
$ Just name ]] client
let incident' = over incidentJobsL (++ jids)
$ incident { incidentRepairStatus = RSFailed
, incidentTag = newtag
}
liftIO $ updateIncident memst incident'
liftIO $ appendJobs memst jids
| otherwise -> do
logDebug "Command execution has succeeded"
jids <- mkResultT $ execJobsWaitOkJid
[[ annotateOpCode "repair command requested by node" now
. OpTagsSet TagKindNode [ incidentTag incident ]
$ Just name ]] client
let incident' = over incidentJobsL (++ jids)
$ incident { incidentRepairStatus = RSCompleted }
liftIO $ updateIncident memst incident'
liftIO $ appendJobs memst jids
return $ Set.delete ndx freenodes
handleIncident :: L.Client
-> IORef MemoryState
-> (Group.List, Node.List, Instance.List)
-> Set.Set Int
-> (String, Incident)
-> ResultT String IO (Set.Set Int)
handleIncident client memstate (gl, nl, il) freeNodes (name, incident) = do
ndx <- case Container.keys $ Container.filter ((==) name . Node.name) nl of
[ndx] -> return ndx
[] -> do
logWarning $ "Node " ++ name ++ " no longer in the cluster;"
++ " clearing incident " ++ show incident
liftIO . rmIncident memstate $ uuidOf incident
fail $ "node " ++ name ++ " left the cluster"
ndxs -> do
logWarning $ "Abmigious node name " ++ name
++ "; could refer to indices " ++ show ndxs
fail $ "ambigious name " ++ name
case incidentAction incident of
RANoop -> do
logDebug $ "Nothing to do for " ++ show incident
liftIO . rmIncident memstate $ uuidOf incident
return freeNodes
RALiveRepair ->
handleLiveRepairs client memstate ndx freeNodes incident
RAEvacuate ->
handleEvacuation client memstate (gl, nl, il) ndx True freeNodes incident
RAEvacuateFailover ->
handleEvacuation client memstate (gl, nl, il) ndx False freeNodes incident
handleIncidents :: IORef MemoryState
-> (Group.List, Node.List, Instance.List)
-> ResultT String IO (Set.Set Int)
handleIncidents memstate (gl, nl, il) = do
incidents <- getIncidents memstate
let activeIncidents = filter ((<= RSPending) . incidentRepairStatus) incidents
incidentsToHandle = rankIncidents activeIncidents
incidentNodes = Set.fromList . Container.keys
$ Container.filter ((`Map.member` incidentsToHandle) . Node.name) nl
freeNodes = Set.fromList (Container.keys nl) Set.\\ incidentNodes
if null activeIncidents
then return freeNodes
else do
luxiSocket <- liftIO Path.defaultQuerySocket
bracket (liftIO $ L.getLuxiClient luxiSocket)
(liftIO . L.closeClient)
$ \ client ->
foldM (handleIncident client memstate (gl, nl, il)) freeNodes
$ Map.assocs incidentsToHandle