module Ganeti.HTools.Program.Harep
( main
, arguments
, options) where
import Control.Exception (bracket)
import Control.Lens (over)
import Control.Monad
import Data.Maybe
import System.Time
import qualified Data.Map as Map
import qualified Text.JSON as J
import Ganeti.BasicTypes
import Ganeti.Common
import Ganeti.Errors
import Ganeti.JQueue (currentTimestamp, reasonTrailTimestamp)
import Ganeti.JQueue.Objects (Timestamp)
import Ganeti.Jobs
import Ganeti.OpCodes
import Ganeti.OpCodes.Lens (metaParamsL, opReasonL)
import Ganeti.Types
import Ganeti.Utils
import qualified Ganeti.Luxi as L
import qualified Ganeti.Path as Path
import Ganeti.HTools.CLI
import qualified Ganeti.HTools.Container as Container
import Ganeti.HTools.Loader
import Ganeti.HTools.ExtLoader
import Ganeti.HTools.Repair
import Ganeti.HTools.Types
import qualified Ganeti.HTools.Instance as Instance
import Ganeti.Version (version)
options :: IO [OptType]
options = do
luxi <- oLuxiSocket
return
[ luxi
, oJobDelay
, oReason
, oDryRun
]
arguments :: [ArgCompletion]
arguments = []
annotateOpCode :: Maybe String -> Timestamp -> OpCode -> MetaOpCode
annotateOpCode reason ts =
over (metaParamsL . opReasonL)
(++ [( "harep", fromMaybe ("harep " ++ version ++ " called") reason
, reasonTrailTimestamp ts)])
. setOpComment ("automated repairs by harep " ++ version)
. wrapOpCode
processPending :: Options -> L.Client -> InstanceData -> IO InstanceData
processPending opts client instData =
case arState instData of
(ArPendingRepair arData) -> do
sts <- L.queryJobsStatus client $ arJobs arData
time <- getClockTime
case sts of
Bad e -> exitErr $ "could not check job status: " ++ formatError e
Ok sts' ->
if any (<= JOB_STATUS_RUNNING) sts' then
return instData
else do
let iname = Instance.name $ arInstance instData
srcSt = arStateName $ arState instData
destSt = arStateName arState'
putStrLn ("Moving " ++ iname ++ " from " ++ show srcSt ++ " to " ++
show destSt)
commitChange opts client instData'
where
instData' =
instData { arState = arState'
, tagsToRemove = delCurTag instData
}
arState' =
if all (== JOB_STATUS_SUCCESS) sts' then
ArHealthy $ Just (updateTag $ arData { arResult = Just ArSuccess
, arTime = time })
else
ArFailedRepair (updateTag $ arData { arResult = Just ArFailure
, arTime = time })
_ -> return instData
commitChange :: Options -> L.Client -> InstanceData -> IO InstanceData
commitChange opts client instData = do
now <- currentTimestamp
let iname = Instance.name $ arInstance instData
arData = getArData $ arState instData
rmTags = tagsToRemove instData
execJobsWaitOk' opcodes = unless (optDryRun opts) $ do
res <- execJobsWaitOk
[map (annotateOpCode (optReason opts) now) opcodes] client
case res of
Ok _ -> return ()
Bad e -> exitErr e
when (isJust arData) $ do
let tag = arTag $ fromJust arData
putStrLn (">>> Adding the following tag to " ++ iname ++ ":\n" ++ show tag)
execJobsWaitOk' [OpTagsSet TagKindInstance [tag] (Just iname)]
unless (null rmTags) $ do
putStr (">>> Removing the following tags from " ++ iname ++ ":\n" ++
unlines (map show rmTags))
execJobsWaitOk' [OpTagsDel TagKindInstance rmTags (Just iname)]
return instData { tagsToRemove = [] }
submitJobs' :: Options -> [[MetaOpCode]] -> L.Client -> IO (Result [JobId])
submitJobs' opts jobs client =
if optDryRun opts
then do
putStrLn . (++) "jobs: " . J.encode $ map (map metaOpCode) jobs
return $ Ok []
else
submitJobs jobs client
doRepair :: Options
-> L.Client
-> Double
-> InstanceData
-> (AutoRepairType, [OpCode])
-> IO InstanceData
doRepair opts client delay instData (rtype, opcodes) =
let inst = arInstance instData
ipol = Instance.arPolicy inst
iname = Instance.name inst
in
case ipol of
ArEnabled maxtype ->
if rtype > maxtype then do
uuid <- newUUID
time <- getClockTime
let arState' = ArNeedsRepair (
updateTag $ AutoRepairData rtype uuid time [] (Just ArEnoperm) "")
instData' = instData { arState = arState'
, tagsToRemove = delCurTag instData
}
putStrLn ("Not performing a repair of type " ++ show rtype ++ " on " ++
iname ++ " because only repairs up to " ++ show maxtype ++
" are allowed")
commitChange opts client instData'
else do
now <- currentTimestamp
putStrLn ("Executing " ++ show rtype ++ " repair on " ++ iname)
let opcodes' =
if delay > 0 then
OpTestDelay { opDelayDuration = delay
, opDelayOnMaster = True
, opDelayOnNodes = []
, opDelayOnNodeUuids = Nothing
, opDelayRepeat = fromJust $ mkNonNegative 0
, opDelayInterruptible = False
, opDelayNoLocks = False
} : opcodes
else
opcodes
uuid <- newUUID
time <- getClockTime
jids <- submitJobs'
opts
[map (annotateOpCode (optReason opts) now) opcodes']
client
case jids of
Bad e -> exitErr e
Ok jids' ->
let arState' = ArPendingRepair (
updateTag $ AutoRepairData rtype uuid time jids' Nothing "")
instData' = instData { arState = arState'
, tagsToRemove = delCurTag instData
}
in
commitChange opts client instData'
otherSt -> do
putStrLn ("Not repairing " ++ iname ++ " because it's in state " ++
show otherSt)
return instData
main :: Options -> [String] -> IO ()
main opts args = do
unless (null args) $
exitErr "this program doesn't take any arguments."
luxiDef <- Path.defaultQuerySocket
let master = fromMaybe luxiDef $ optLuxi opts
opts' = opts { optLuxi = Just master }
(ClusterData _ nl il _ _) <- loadExternalData opts'
let iniDataRes = mapM setInitialState $ Container.elems il
iniData <- exitIfBad "when parsing auto-repair tags" iniDataRes
iniData' <- bracket (L.getLuxiClient master) L.closeClient $
forM iniData . processPending opts
let repairs = map (detectBroken nl . arInstance) iniData'
let maybeRepair c (i, r) = maybe (return i) (repairHealthy c i) r
jobDelay = optJobDelay opts
repairHealthy c i = case arState i of
ArHealthy _ -> doRepair opts c jobDelay i
_ -> const (return i)
repairDone <- bracket (L.getLuxiClient master) L.closeClient $
forM (zip iniData' repairs) . maybeRepair
let states = map ((, 1 :: Int) . arStateName . arState) repairDone
counts = Map.fromListWith (+) states
putStrLn "---------------------"
putStrLn "Instance status count"
putStrLn "---------------------"
putStr . unlines . Map.elems $
Map.mapWithKey (\k v -> k ++ ": " ++ show v) counts