module Ganeti.HTools.Program.Harep
( main
, arguments
, options) where
import Control.Exception (bracket)
import Control.Lens (over)
import Control.Monad
import Data.Function
import Data.List
import Data.Maybe
import Data.Ord
import System.Time
import qualified Data.Map as Map
import qualified Text.JSON as J
import Ganeti.BasicTypes
import Ganeti.Common
import Ganeti.Errors
import Ganeti.JQueue (currentTimestamp, reasonTrailTimestamp)
import Ganeti.JQueue.Objects (Timestamp)
import Ganeti.Jobs
import Ganeti.OpCodes
import Ganeti.OpCodes.Lens (metaParamsL, opReasonL)
import Ganeti.OpParams
import Ganeti.Types
import Ganeti.Utils
import qualified Ganeti.Constants as C
import qualified Ganeti.Luxi as L
import qualified Ganeti.Path as Path
import Ganeti.HTools.CLI
import Ganeti.HTools.Loader
import Ganeti.HTools.ExtLoader
import qualified Ganeti.HTools.Tags.Constants as Tags
import Ganeti.HTools.Types
import qualified Ganeti.HTools.Container as Container
import qualified Ganeti.HTools.Instance as Instance
import qualified Ganeti.HTools.Node as Node
import Ganeti.Version (version)
options :: IO [OptType]
options = do
luxi <- oLuxiSocket
return
[ luxi
, oJobDelay
, oReason
, oDryRun
]
arguments :: [ArgCompletion]
arguments = []
annotateOpCode :: Maybe String -> Timestamp -> OpCode -> MetaOpCode
annotateOpCode reason ts =
over (metaParamsL . opReasonL)
(++ [( "harep", fromMaybe ("harep " ++ version ++ " called") reason
, reasonTrailTimestamp ts)])
. setOpComment ("automated repairs by harep " ++ version)
. wrapOpCode
data InstanceData = InstanceData { arInstance :: Instance.Instance
, arState :: AutoRepairStatus
, tagsToRemove :: [String]
}
deriving (Eq, Show)
parseInitTag :: String -> Maybe AutoRepairData
parseInitTag tag =
let parsePending = do
subtag <- chompPrefix Tags.autoRepairTagPending tag
case sepSplit ':' subtag of
[rtype, uuid, ts, jobs] -> makeArData rtype uuid ts jobs
_ -> fail ("Invalid tag: " ++ show tag)
parseResult = do
subtag <- chompPrefix Tags.autoRepairTagResult tag
case sepSplit ':' subtag of
[rtype, uuid, ts, result, jobs] -> do
arData <- makeArData rtype uuid ts jobs
result' <- autoRepairResultFromRaw result
return arData { arResult = Just result' }
_ -> fail ("Invalid tag: " ++ show tag)
makeArData rtype uuid ts jobs = do
rtype' <- autoRepairTypeFromRaw rtype
ts' <- tryRead "auto-repair time" ts
jobs' <- mapM makeJobIdS $ sepSplit '+' jobs
return AutoRepairData { arType = rtype'
, arUuid = uuid
, arTime = TOD ts' 0
, arJobs = jobs'
, arResult = Nothing
, arTag = tag
}
in
parsePending `mplus` parseResult
getArData :: AutoRepairStatus -> Maybe AutoRepairData
getArData status =
case status of
ArHealthy (Just d) -> Just d
ArFailedRepair d -> Just d
ArPendingRepair d -> Just d
ArNeedsRepair d -> Just d
_ -> Nothing
arStateName :: AutoRepairStatus -> String
arStateName status =
case status of
ArHealthy _ -> "Healthy"
ArFailedRepair _ -> "Failure"
ArPendingRepair _ -> "Pending repair"
ArNeedsRepair _ -> "Needs repair"
delCurTag :: InstanceData -> [String]
delCurTag instData =
let arData = getArData $ arState instData
rmTags = tagsToRemove instData
in
case arData of
Just d -> arTag d : rmTags
Nothing -> rmTags
setInitialState :: Instance.Instance -> Result InstanceData
setInitialState inst =
let arData = mapMaybe parseInitTag $ Instance.allTags inst
arData' = sortBy (comparing arUuid) arData
arGroups = groupBy ((==) `on` arUuid) arData'
arGroups' = sortBy (comparing $ minimum . map arTime) arGroups
in
foldM arStatusCmp (InstanceData inst (ArHealthy Nothing) []) arGroups'
arStatusCmp :: InstanceData -> [AutoRepairData] -> Result InstanceData
arStatusCmp instData arData =
let curSt = arState instData
arData' = sortBy (comparing keyfn) arData
keyfn d = (arResult d, arTime d)
newData = last arData'
newSt = case arResult newData of
Just ArSuccess -> ArHealthy $ Just newData
Just ArEnoperm -> ArHealthy $ Just newData
Just ArFailure -> ArFailedRepair newData
Nothing -> ArPendingRepair newData
in
case curSt of
ArFailedRepair _ -> Ok instData
ArHealthy _ -> Ok instData { arState = newSt
, tagsToRemove = delCurTag instData
}
ArPendingRepair d -> Bad (
"An unfinished repair was found in instance " ++
Instance.name (arInstance instData) ++ ": found tag " ++
show (arTag newData) ++ ", but older pending tag " ++
show (arTag d) ++ "exists.")
ArNeedsRepair _ -> Bad
"programming error: ArNeedsRepair found as an initial state"
processPending :: Options -> L.Client -> InstanceData -> IO InstanceData
processPending opts client instData =
case arState instData of
(ArPendingRepair arData) -> do
sts <- L.queryJobsStatus client $ arJobs arData
time <- getClockTime
case sts of
Bad e -> exitErr $ "could not check job status: " ++ formatError e
Ok sts' ->
if any (<= JOB_STATUS_RUNNING) sts' then
return instData
else do
let iname = Instance.name $ arInstance instData
srcSt = arStateName $ arState instData
destSt = arStateName arState'
putStrLn ("Moving " ++ iname ++ " from " ++ show srcSt ++ " to " ++
show destSt)
commitChange opts client instData'
where
instData' =
instData { arState = arState'
, tagsToRemove = delCurTag instData
}
arState' =
if all (== JOB_STATUS_SUCCESS) sts' then
ArHealthy $ Just (updateTag $ arData { arResult = Just ArSuccess
, arTime = time })
else
ArFailedRepair (updateTag $ arData { arResult = Just ArFailure
, arTime = time })
_ -> return instData
updateTag :: AutoRepairData -> AutoRepairData
updateTag arData =
let ini = [autoRepairTypeToRaw $ arType arData,
arUuid arData,
clockTimeToString $ arTime arData]
end = [intercalate "+" . map (show . fromJobId) $ arJobs arData]
(pfx, middle) =
case arResult arData of
Nothing -> (Tags.autoRepairTagPending, [])
Just rs -> (Tags.autoRepairTagResult, [autoRepairResultToRaw rs])
in
arData { arTag = pfx ++ intercalate ":" (ini ++ middle ++ end) }
commitChange :: Options -> L.Client -> InstanceData -> IO InstanceData
commitChange opts client instData = do
now <- currentTimestamp
let iname = Instance.name $ arInstance instData
arData = getArData $ arState instData
rmTags = tagsToRemove instData
execJobsWaitOk' opcodes = unless (optDryRun opts) $ do
res <- execJobsWaitOk
[map (annotateOpCode (optReason opts) now) opcodes] client
case res of
Ok _ -> return ()
Bad e -> exitErr e
when (isJust arData) $ do
let tag = arTag $ fromJust arData
putStrLn (">>> Adding the following tag to " ++ iname ++ ":\n" ++ show tag)
execJobsWaitOk' [OpTagsSet TagKindInstance [tag] (Just iname)]
unless (null rmTags) $ do
putStr (">>> Removing the following tags from " ++ iname ++ ":\n" ++
unlines (map show rmTags))
execJobsWaitOk' [OpTagsDel TagKindInstance rmTags (Just iname)]
return instData { tagsToRemove = [] }
detectBroken :: Node.List -> Instance.Instance
-> Maybe (AutoRepairType, [OpCode])
detectBroken nl inst =
let disk = Instance.diskTemplate inst
iname = Instance.name inst
offPri = Node.offline $ Container.find (Instance.pNode inst) nl
offSec = Node.offline $ Container.find (Instance.sNode inst) nl
in
case disk of
DTDrbd8
| offPri && offSec ->
Just (
ArReinstall,
[ OpInstanceRecreateDisks { opInstanceName = iname
, opInstanceUuid = Nothing
, opRecreateDisksInfo = RecreateDisksAll
, opNodes = []
, opNodeUuids = Nothing
, opIallocator = mkNonEmpty "hail"
}
, OpInstanceReinstall { opInstanceName = iname
, opInstanceUuid = Nothing
, opOsType = Nothing
, opTempOsParams = Nothing
, opOsparamsPrivate = Nothing
, opOsparamsSecret = Nothing
, opForceVariant = False
}
])
| offPri ->
Just (
ArFailover,
[ OpInstanceFailover { opInstanceName = iname
, opInstanceUuid = Nothing
, opShutdownTimeout = fromJust $ mkNonNegative
C.defaultShutdownTimeout
, opIgnoreConsistency = False
, opTargetNode = Nothing
, opTargetNodeUuid = Nothing
, opIgnoreIpolicy = False
, opIallocator = Nothing
, opMigrationCleanup = False
}
])
| offSec ->
Just (
ArFixStorage,
[ OpInstanceReplaceDisks { opInstanceName = iname
, opInstanceUuid = Nothing
, opReplaceDisksMode = ReplaceNewSecondary
, opReplaceDisksList = []
, opRemoteNode = Nothing
, opRemoteNodeUuid = Nothing
, opIallocator = mkNonEmpty "hail"
, opEarlyRelease = False
, opIgnoreIpolicy = False
}
])
| otherwise -> Nothing
DTPlain
| offPri ->
Just (
ArReinstall,
[ OpInstanceRecreateDisks { opInstanceName = iname
, opInstanceUuid = Nothing
, opRecreateDisksInfo = RecreateDisksAll
, opNodes = []
, opNodeUuids = Nothing
, opIallocator = mkNonEmpty "hail"
}
, OpInstanceReinstall { opInstanceName = iname
, opInstanceUuid = Nothing
, opOsType = Nothing
, opTempOsParams = Nothing
, opOsparamsPrivate = Nothing
, opOsparamsSecret = Nothing
, opForceVariant = False
}
])
| otherwise -> Nothing
_ -> Nothing
submitJobs' :: Options -> [[MetaOpCode]] -> L.Client -> IO (Result [JobId])
submitJobs' opts jobs client =
if optDryRun opts
then do
putStrLn . (++) "jobs: " . J.encode $ map (map metaOpCode) jobs
return $ Ok []
else
submitJobs jobs client
doRepair :: Options
-> L.Client
-> Double
-> InstanceData
-> (AutoRepairType, [OpCode])
-> IO InstanceData
doRepair opts client delay instData (rtype, opcodes) =
let inst = arInstance instData
ipol = Instance.arPolicy inst
iname = Instance.name inst
in
case ipol of
ArEnabled maxtype ->
if rtype > maxtype then do
uuid <- newUUID
time <- getClockTime
let arState' = ArNeedsRepair (
updateTag $ AutoRepairData rtype uuid time [] (Just ArEnoperm) "")
instData' = instData { arState = arState'
, tagsToRemove = delCurTag instData
}
putStrLn ("Not performing a repair of type " ++ show rtype ++ " on " ++
iname ++ " because only repairs up to " ++ show maxtype ++
" are allowed")
commitChange opts client instData'
else do
now <- currentTimestamp
putStrLn ("Executing " ++ show rtype ++ " repair on " ++ iname)
let opcodes' =
if delay > 0 then
OpTestDelay { opDelayDuration = delay
, opDelayOnMaster = True
, opDelayOnNodes = []
, opDelayOnNodeUuids = Nothing
, opDelayRepeat = fromJust $ mkNonNegative 0
, opDelayInterruptible = False
, opDelayNoLocks = False
} : opcodes
else
opcodes
uuid <- newUUID
time <- getClockTime
jids <- submitJobs'
opts
[map (annotateOpCode (optReason opts) now) opcodes']
client
case jids of
Bad e -> exitErr e
Ok jids' ->
let arState' = ArPendingRepair (
updateTag $ AutoRepairData rtype uuid time jids' Nothing "")
instData' = instData { arState = arState'
, tagsToRemove = delCurTag instData
}
in
commitChange opts client instData'
otherSt -> do
putStrLn ("Not repairing " ++ iname ++ " because it's in state " ++
show otherSt)
return instData
main :: Options -> [String] -> IO ()
main opts args = do
unless (null args) $
exitErr "this program doesn't take any arguments."
luxiDef <- Path.defaultQuerySocket
let master = fromMaybe luxiDef $ optLuxi opts
opts' = opts { optLuxi = Just master }
(ClusterData _ nl il _ _) <- loadExternalData opts'
let iniDataRes = mapM setInitialState $ Container.elems il
iniData <- exitIfBad "when parsing auto-repair tags" iniDataRes
iniData' <- bracket (L.getLuxiClient master) L.closeClient $
forM iniData . processPending opts
let repairs = map (detectBroken nl . arInstance) iniData'
let maybeRepair c (i, r) = maybe (return i) (repairHealthy c i) r
jobDelay = optJobDelay opts
repairHealthy c i = case arState i of
ArHealthy _ -> doRepair opts c jobDelay i
_ -> const (return i)
repairDone <- bracket (L.getLuxiClient master) L.closeClient $
forM (zip iniData' repairs) . maybeRepair
let states = map ((, 1 :: Int) . arStateName . arState) repairDone
counts = Map.fromListWith (+) states
putStrLn "---------------------"
putStrLn "Instance status count"
putStrLn "---------------------"
putStr . unlines . Map.elems $
Map.mapWithKey (\k v -> k ++ ": " ++ show v) counts