module Ganeti.HTools.Program.Harep
( main
, arguments
, options) where
import Control.Exception (bracket)
import Control.Monad
import Data.Function
import Data.List
import Data.Maybe
import Data.Ord
import System.Time
import qualified Data.Map as Map
import Ganeti.BasicTypes
import Ganeti.Common
import Ganeti.Errors
import Ganeti.Jobs
import Ganeti.OpCodes
import Ganeti.OpParams
import Ganeti.Types
import Ganeti.Utils
import qualified Ganeti.Constants as C
import qualified Ganeti.Luxi as L
import qualified Ganeti.Path as Path
import Ganeti.HTools.CLI
import Ganeti.HTools.Loader
import Ganeti.HTools.ExtLoader
import Ganeti.HTools.Types
import qualified Ganeti.HTools.Container as Container
import qualified Ganeti.HTools.Instance as Instance
import qualified Ganeti.HTools.Node as Node
options :: IO [OptType]
options = do
luxi <- oLuxiSocket
return
[ luxi
, oJobDelay
]
arguments :: [ArgCompletion]
arguments = []
data InstanceData = InstanceData { arInstance :: Instance.Instance
, arState :: AutoRepairStatus
, tagsToRemove :: [String]
}
deriving (Eq, Show)
parseInitTag :: String -> Maybe AutoRepairData
parseInitTag tag =
let parsePending = do
subtag <- chompPrefix C.autoRepairTagPending tag
case sepSplit ':' subtag of
[rtype, uuid, ts, jobs] -> makeArData rtype uuid ts jobs
_ -> fail ("Invalid tag: " ++ show tag)
parseResult = do
subtag <- chompPrefix C.autoRepairTagResult tag
case sepSplit ':' subtag of
[rtype, uuid, ts, result, jobs] -> do
arData <- makeArData rtype uuid ts jobs
result' <- autoRepairResultFromRaw result
return arData { arResult = Just result' }
_ -> fail ("Invalid tag: " ++ show tag)
makeArData rtype uuid ts jobs = do
rtype' <- autoRepairTypeFromRaw rtype
ts' <- tryRead "auto-repair time" ts
jobs' <- mapM makeJobIdS $ sepSplit '+' jobs
return AutoRepairData { arType = rtype'
, arUuid = uuid
, arTime = TOD ts' 0
, arJobs = jobs'
, arResult = Nothing
, arTag = tag
}
in
parsePending `mplus` parseResult
getArData :: AutoRepairStatus -> Maybe AutoRepairData
getArData status =
case status of
ArHealthy (Just d) -> Just d
ArFailedRepair d -> Just d
ArPendingRepair d -> Just d
ArNeedsRepair d -> Just d
_ -> Nothing
arStateName :: AutoRepairStatus -> String
arStateName status =
case status of
ArHealthy _ -> "Healthy"
ArFailedRepair _ -> "Failure"
ArPendingRepair _ -> "Pending repair"
ArNeedsRepair _ -> "Needs repair"
delCurTag :: InstanceData -> [String]
delCurTag instData =
let arData = getArData $ arState instData
rmTags = tagsToRemove instData
in
case arData of
Just d -> arTag d : rmTags
Nothing -> rmTags
setInitialState :: Instance.Instance -> Result InstanceData
setInitialState inst =
let arData = mapMaybe parseInitTag $ Instance.allTags inst
arData' = sortBy (comparing arUuid) arData
arGroups = groupBy ((==) `on` arUuid) arData'
arGroups' = sortBy (comparing $ minimum . map arTime) arGroups
in
foldM arStatusCmp (InstanceData inst (ArHealthy Nothing) []) arGroups'
arStatusCmp :: InstanceData -> [AutoRepairData] -> Result InstanceData
arStatusCmp instData arData =
let curSt = arState instData
arData' = sortBy (comparing keyfn) arData
keyfn d = (arResult d, arTime d)
newData = last arData'
newSt = case arResult newData of
Just ArSuccess -> ArHealthy $ Just newData
Just ArEnoperm -> ArHealthy $ Just newData
Just ArFailure -> ArFailedRepair newData
Nothing -> ArPendingRepair newData
in
case curSt of
ArFailedRepair _ -> Ok instData
ArHealthy _ -> Ok instData { arState = newSt
, tagsToRemove = delCurTag instData
}
ArPendingRepair d -> Bad (
"An unfinished repair was found in instance " ++
Instance.name (arInstance instData) ++ ": found tag " ++
show (arTag newData) ++ ", but older pending tag " ++
show (arTag d) ++ "exists.")
ArNeedsRepair _ -> Bad
"programming error: ArNeedsRepair found as an initial state"
processPending :: L.Client -> InstanceData -> IO InstanceData
processPending client instData =
case arState instData of
(ArPendingRepair arData) -> do
sts <- L.queryJobsStatus client $ arJobs arData
time <- getClockTime
case sts of
Bad e -> exitErr $ "could not check job status: " ++ formatError e
Ok sts' ->
if any (<= JOB_STATUS_RUNNING) sts' then
return instData
else do
let iname = Instance.name $ arInstance instData
srcSt = arStateName $ arState instData
destSt = arStateName arState'
putStrLn ("Moving " ++ iname ++ " from " ++ show srcSt ++ " to " ++
show destSt)
commitChange client instData'
where
instData' =
instData { arState = arState'
, tagsToRemove = delCurTag instData
}
arState' =
if all (== JOB_STATUS_SUCCESS) sts' then
ArHealthy $ Just (updateTag $ arData { arResult = Just ArSuccess
, arTime = time })
else
ArFailedRepair (updateTag $ arData { arResult = Just ArFailure
, arTime = time })
_ -> return instData
updateTag :: AutoRepairData -> AutoRepairData
updateTag arData =
let ini = [autoRepairTypeToRaw $ arType arData,
arUuid arData,
clockTimeToString $ arTime arData]
end = [intercalate "+" . map (show . fromJobId) $ arJobs arData]
(pfx, middle) =
case arResult arData of
Nothing -> (C.autoRepairTagPending, [])
Just rs -> (C.autoRepairTagResult, [autoRepairResultToRaw rs])
in
arData { arTag = pfx ++ intercalate ":" (ini ++ middle ++ end) }
commitChange :: L.Client -> InstanceData -> IO InstanceData
commitChange client instData = do
let iname = Instance.name $ arInstance instData
arData = getArData $ arState instData
rmTags = tagsToRemove instData
execJobsWaitOk' opcodes = do
res <- execJobsWaitOk [map wrapOpCode opcodes] client
case res of
Ok _ -> return ()
Bad e -> exitErr e
when (isJust arData) $ do
let tag = arTag $ fromJust arData
putStrLn (">>> Adding the following tag to " ++ iname ++ ":\n" ++ show tag)
execJobsWaitOk' [OpTagsSet (TagInstance iname) [tag]]
unless (null rmTags) $ do
putStr (">>> Removing the following tags from " ++ iname ++ ":\n" ++
unlines (map show rmTags))
execJobsWaitOk' [OpTagsDel (TagInstance iname) rmTags]
return instData { tagsToRemove = [] }
detectBroken :: Node.List -> Instance.Instance
-> Maybe (AutoRepairType, [OpCode])
detectBroken nl inst =
let disk = Instance.diskTemplate inst
iname = Instance.name inst
offPri = Node.offline $ Container.find (Instance.pNode inst) nl
offSec = Node.offline $ Container.find (Instance.sNode inst) nl
in
case disk of
DTDrbd8
| offPri && offSec ->
Just (
ArReinstall,
[ OpInstanceRecreateDisks { opInstanceName = iname
, opRecreateDisksInfo = RecreateDisksAll
, opNodes = []
, opIallocator = mkNonEmpty "hail"
}
, OpInstanceReinstall { opInstanceName = iname
, opOsType = Nothing
, opTempOsParams = Nothing
, opForceVariant = False
}
])
| offPri ->
Just (
ArFailover,
[ OpInstanceFailover { opInstanceName = iname
, opShutdownTimeout = fromJust $ mkNonNegative
C.defaultShutdownTimeout
, opIgnoreConsistency = False
, opTargetNode = Nothing
, opIgnoreIpolicy = False
, opIallocator = Nothing
, opMigrationCleanup = False
}
])
| offSec ->
Just (
ArFixStorage,
[ OpInstanceReplaceDisks { opInstanceName = iname
, opReplaceDisksMode = ReplaceNewSecondary
, opReplaceDisksList = []
, opRemoteNode = Nothing
, opIallocator = mkNonEmpty "hail"
, opEarlyRelease = False
, opIgnoreIpolicy = False
}
])
| otherwise -> Nothing
DTPlain
| offPri ->
Just (
ArReinstall,
[ OpInstanceRecreateDisks { opInstanceName = iname
, opRecreateDisksInfo = RecreateDisksAll
, opNodes = []
, opIallocator = mkNonEmpty "hail"
}
, OpInstanceReinstall { opInstanceName = iname
, opOsType = Nothing
, opTempOsParams = Nothing
, opForceVariant = False
}
])
| otherwise -> Nothing
_ -> Nothing
doRepair :: L.Client
-> Double
-> InstanceData
-> (AutoRepairType, [OpCode])
-> IO InstanceData
doRepair client delay instData (rtype, opcodes) =
let inst = arInstance instData
ipol = Instance.arPolicy inst
iname = Instance.name inst
in
case ipol of
ArEnabled maxtype ->
if rtype > maxtype then do
uuid <- newUUID
time <- getClockTime
let arState' = ArNeedsRepair (
updateTag $ AutoRepairData rtype uuid time [] (Just ArEnoperm) "")
instData' = instData { arState = arState'
, tagsToRemove = delCurTag instData
}
putStrLn ("Not performing a repair of type " ++ show rtype ++ " on " ++
iname ++ " because only repairs up to " ++ show maxtype ++
" are allowed")
commitChange client instData'
else do
putStrLn ("Executing " ++ show rtype ++ " repair on " ++ iname)
let opcodes' =
if delay > 0 then
OpTestDelay { opDelayDuration = delay
, opDelayOnMaster = True
, opDelayOnNodes = []
, opDelayRepeat = fromJust $ mkNonNegative 0
} : opcodes
else
opcodes
uuid <- newUUID
time <- getClockTime
jids <- submitJobs [map wrapOpCode opcodes'] client
case jids of
Bad e -> exitErr e
Ok jids' ->
let arState' = ArPendingRepair (
updateTag $ AutoRepairData rtype uuid time jids' Nothing "")
instData' = instData { arState = arState'
, tagsToRemove = delCurTag instData
}
in
commitChange client instData'
otherSt -> do
putStrLn ("Not repairing " ++ iname ++ " because it's in state " ++
show otherSt)
return instData
main :: Options -> [String] -> IO ()
main opts args = do
unless (null args) $
exitErr "this program doesn't take any arguments."
luxiDef <- Path.defaultLuxiSocket
let master = fromMaybe luxiDef $ optLuxi opts
opts' = opts { optLuxi = Just master }
(ClusterData _ nl il _ _) <- loadExternalData opts'
let iniDataRes = mapM setInitialState $ Container.elems il
iniData <- exitIfBad "when parsing auto-repair tags" iniDataRes
iniData' <- bracket (L.getClient master) L.closeClient $
forM iniData . processPending
let repairs = map (detectBroken nl . arInstance) iniData'
let maybeRepair c (i, r) = maybe (return i) (repairHealthy c i) r
jobDelay = optJobDelay opts
repairHealthy c i = case arState i of
ArHealthy _ -> doRepair c jobDelay i
_ -> const (return i)
repairDone <- bracket (L.getClient master) L.closeClient $
forM (zip iniData' repairs) . maybeRepair
let states = map ((, 1 :: Int) . arStateName . arState) repairDone
counts = Map.fromListWith (+) states
putStrLn "---------------------"
putStrLn "Instance status count"
putStrLn "---------------------"
putStr . unlines . Map.elems $
Map.mapWithKey (\k v -> k ++ ": " ++ show v) counts