This commit is contained in:
bel
2021-09-12 22:16:11 -06:00
commit ceeb6f0385
129 changed files with 9221 additions and 0 deletions

View File

View File

@@ -0,0 +1,189 @@
from thresholds import Thresholds
class Alert() :
    """Evaluate one query result against the configured thresholds and
    occurrence counters, and render the alert (or recovery) message body.

    NOTE: constructing an Alert has side effects -- set_firing() is called
    immediately, which updates occurrence counters on alert_config and
    emits metrics.
    """
    def __init__(self, alert_config, logger, tags, result, min_value, max_value, availability=False) :
        """
        :param alert_config: per-alert configuration object (thresholds,
            occurrence counters, per-tag state, id, url)
        :param logger: logger exposing info/debug/warning/error
        :param tags: explicit tag string; falsy to derive tags from result
        :param result: raw query result dict, or None if the query
            returned nothing
        :param min_value: minimum observed value to test against thresholds
        :param max_value: maximum observed value to test against thresholds
        :param availability: emit a service_level availability stat when
            True. BUG FIX: this was previously read from an undefined name.
        """
        self.occurrences_breached = False
        self.new_level_breached = False
        self.info = logger.info
        self.debug = logger.debug
        self.warning = logger.warning
        self.error = logger.error
        self.alert_config = alert_config
        self.thresholds = Thresholds(alert_config)
        self.tags = ""
        self.result = result
        # BUG FIX: self.value was never initialised, but form(),
        # get_is_firing_body() and send_threshold_metrics() read it.
        # Default to the max value, matching the legacy
        # build_alert_message() behaviour ("DEFAULT TO MAX VALUE").
        # TODO confirm: legacy code switched to min_value on lower breaches.
        self.value = max_value
        self.set_tags(tags)
        self.alert_config.init_for_tags(alert_config.get_tags())
        self.set_firing(min_value, max_value)
        if availability :
            self.info("Sending availability stat 1")
            self.send_metrics(self.name(), 0 if self.level() == "CRITICAL" else 1, self.result, 'service_level')
    def name(self) :
        """Human-readable identity of this alert instance."""
        return "Metric: {} for {}".format(self.alert_config.id, self.get_tags())
    def body(self) :
        """Build the message body.

        Returns a tuple of (body string, first 10 hex chars of the md5 of
        the tag string).
        """
        from hashlib import md5  # local import; module header not visible here
        body = ""
        if not self.get_firing() :
            body = self.get_not_firing_body()
        else :
            body = self.get_is_firing_body()
        self.debug("Alert {}->[{}]->{}, Occurrences={} of {}".format(
            self.name(),
            self.get_tags(),
            self.level(),
            self.get_occurrences(),
            self.alert_config.occurrences(),
        ))
        self.send_metrics(self.name(), self.level_code(), self.level())
        # BUG FIX: 'tag' and 'md5' were undefined names here; hash this
        # instance's tag string instead.
        return body, md5(self.get_tags().encode('utf-8')).hexdigest()[:10]
    def level(self) :
        """Return "RECOVERY", "WARNING" or "CRITICAL".

        Returns None when firing but no threshold level matches, which
        should not normally happen.
        """
        if not self.get_firing() :
            return "RECOVERY"
        if [t for t in self.thresholds.get_thresholds_matching(level=Thresholds.CRITICAL)] :
            return "CRITICAL"
        if [t for t in self.thresholds.get_thresholds_matching(level=Thresholds.WARNING)] :
            return "WARNING"
    def level_code(self) :
        """Numeric code for the current level: 0=RECOVERY, 1=WARNING,
        2=CRITICAL. BUG FIX: previously every branch returned 0."""
        level = self.level()
        if level == "RECOVERY" :
            return 0
        elif level == "WARNING" :
            return 1
        elif level == "CRITICAL" :
            return 2
    def get_not_firing_body(self) :
        """Body for the non-firing case; "" when there is nothing to say."""
        body = ""
        # BUG FIX: these helpers were called without 'self.' (NameError).
        body += self.get_not_firing_body_threshold()
        body += self.get_not_firing_body_occurrences()
        if not body :
            # BUG FIX: 'force' was an undefined name here; reset the
            # per-tag count to 0 (matches the legacy tag_count reset).
            self.alert_config.set_for_tags(self.get_tags()+"_count", 0)
            return ""
        return "GOOD: " + body
    def get_not_firing_body_threshold(self) :
        """Describe which threshold the value is safely inside of.

        Prefers the warning threshold; falls back to critical. Returns ""
        when there is no result or no threshold configured.
        """
        if self.result is None :
            return ""
        body = ""
        v, ok = self.alert_config.get_threshold(isUpper=True, isWarning=True)
        if not ok :
            v, ok = self.alert_config.get_threshold(isUpper=True, isWarning=False)
        if ok :
            body += self.form("<", v)
        v, ok = self.alert_config.get_threshold(isUpper=False, isWarning=True)
        if not ok :
            v, ok = self.alert_config.get_threshold(isUpper=False, isWarning=False)
        if ok :
            body += self.form(">", v)
        return body
    def get_not_firing_body_occurrences(self) :
        """Recovery handling: emit a recovery metric (or a manual-validate
        message when there is no result) and reset the occurrence count."""
        if not self.get_occurrences() :
            return ""
        body = ""
        if not self.result is None :
            self.send_metrics(self.name(), 1, self.level())
        else :
            body += "{} RECOVERY due to no results found from query. Recommend you manually validate recovery\n{}".format(self.name(), self.alert_config.url())
        self.set_occurrences(force=0)
        return body
    def get_is_firing_body(self) :
        """Body for the firing case; "" until the configured occurrence
        count has been reached."""
        body = ""
        if self.thresholds.get_breached(level=Thresholds.UPPER) :
            body += self.form(">", self.upper_firing)
        if self.thresholds.get_breached(level=Thresholds.LOWER) :
            # BUG FIX: copy-paste error used upper_firing for the lower
            # breach. TODO confirm: neither upper_firing nor lower_firing is
            # assigned anywhere in this class -- presumably set externally.
            body += self.form("<", self.lower_firing)
        if not self.occurrences_breached :
            # BUG FIX: condition was inverted -- the "occurred N time(s) <
            # threshold" debug message and the empty body belong to the
            # not-yet-breached case.
            self.debug("Value {} of {} for tag {} has occurred {} time(s) < threshold of {}".format(
                self.value,
                self.name(),
                self.get_tags(),
                self.get_occurrences(),
                self.alert_config.occurrences(),
            ))
            return ""
        return body
    def form(self, operator, static) :
        """Format one "value <op>= threshold" line with name and url."""
        return "{}\n{:.2f} {}= {}\n{}".format(
            self.name(),
            self.value,
            operator,
            static,
            self.alert_config.url(),
        )
    def set_tags(self, tags) :
        """Set the tag string: explicit tags win; otherwise flatten and
        sort the result's tag values; fall back to "instance"."""
        if tags :
            self.tags = tags
        elif self.result :
            import itertools
            result_tags = [ self.result['tags'][x] for x in self.alert_config.get_tags() ]
            # BUG FIX: each entry is itself a list of tag values, so they
            # must be flattened with chain(*...), not chained as whole
            # lists (the legacy code used itertools.chain(*[...])).
            chain = itertools.chain(*result_tags)
            sorted_list = sorted(list(chain))
            self.tags = ", ".join(sorted_list)
        if not self.tags :
            self.tags = "instance"
    def get_tags(self) :
        """Return the current tag string."""
        return self.tags
    def set_firing(self, min_value, max_value) :
        """(Re)evaluate thresholds against the values and update occurrence
        and level state, emitting metrics along the way."""
        self.thresholds = Thresholds(self.alert_config)
        self.thresholds.set_breached(min_value, max_value)
        self.set_occurrences()
        self.set_new_level_breached()
        self.send_metrics()
        self.send_threshold_metrics()
    def get_firing(self) :
        """True when a threshold is breached AND the occurrence count is met."""
        return self.thresholds.get_breached() and self.occurrences_breached
    def get_occurrences(self) :
        """Current per-tag occurrence count from the alert config."""
        tags = self.get_tags()
        return self.alert_config.get_for_tags(tags)
    def set_occurrences(self, force=None) :
        """Increment the per-tag occurrence counter when breached and flag
        occurrences_breached once the configured count is reached.

        :param force: when not None, overwrite both per-tag counters with
            this value (force=0 resets them).
        """
        previous_occurrences = self.get_occurrences()
        if self.thresholds.get_breached() :
            new_occurrences = previous_occurrences+1
            self.alert_config.set_for_tags(self.get_tags(), new_occurrences)
            self.occurrences_breached = self.alert_config.occurrences() <= new_occurrences
        # BUG FIX: 'if force:' ignored force=0, so recovery never reset
        # the counters; compare against None instead.
        if force is not None :
            self.alert_config.set_for_tags(self.get_tags(), force)
            self.alert_config.set_for_tags(self.get_tags()+"_count", force)
    def send_metrics(self, *args, **kwargs) :
        """Metric emission hook -- not implemented yet."""
        print("send_metrics not impl")
    def send_stat(self, *args, **kwargs) :
        """Stat emission hook -- not implemented yet.

        BUG FIX: send_threshold_metrics() called self.send_stat but no
        such method existed (AttributeError); stubbed like send_metrics.
        """
        print("send_stat not impl")
    def set_new_level_breached(self) :
        """Record whether the level changed since the last evaluation."""
        key = self.get_tags()
        level = self.level()
        previous_level = self.alert_config.get_level(key)
        self.new_level_breached = level != previous_level
        self.alert_config.set_level(key, level)
        self.info("testInfo: {} {}".format(
            "NEW" if self.new_level_breached else "EXISTING",
            self.level(),
        ))
    def get_new_level_breached(self) :
        """True when the level changed on the most recent evaluation."""
        return self.new_level_breached
    def send_threshold_metrics(self) :
        """Send the current value and every configured threshold as stats."""
        # TODO
        self.send_metrics(self.alert_config.id, self.value)
        for level in [Thresholds.WARNING, Thresholds.CRITICAL] :
            for end in [Thresholds.UPPER, Thresholds.LOWER] :
                # BUG FIX: 'level' and 'end' were swapped -- isUpper was
                # tested against warning/critical and the stat key always
                # said "lower".
                v, ok = self.alert_config.get_threshold(isUpper=end == Thresholds.UPPER, isWarning=level == Thresholds.WARNING)
                if ok :
                    key = "{}_{}_threshold".format(
                        "upper" if end == Thresholds.UPPER else "lower",
                        "warning" if level == Thresholds.WARNING else "critical",
                    )
                    self.send_stat(key, v, {'id':self.name()})

View File

@@ -0,0 +1,13 @@
from alert import Alert
class Alert_Factory() :
    """Builds Alert instances bound to a single alert_config and logger."""
    def __init__(self, alert_config, logger) :
        """
        :param alert_config: per-alert configuration object shared by all
            Alerts this factory builds
        :param logger: logger exposing info/warning/debug/error
        """
        self.alert_config = alert_config
        self.logger = logger
        self.info = logger.info
        self.warning = logger.warning
        self.debug = logger.debug
        self.error = logger.error
    def build(self, minvalue, maxvalue, result, tags, availability, alert_tags) :
        """Construct an Alert for one query result.

        BUG FIX: the logger was previously omitted from the Alert()
        call, shifting every positional argument by one (tags were passed
        as the logger, result as tags, and so on).
        NOTE(review): availability and alert_tags are accepted but not
        forwarded because Alert.__init__ does not take them -- confirm
        whether they should be wired through.
        """
        return Alert(self.alert_config, self.logger, tags, result, minvalue, maxvalue)

View File

@@ -0,0 +1,83 @@
from datetime import datetime, timedelta
from urllib.parse import urljoin
import requests
class PromAPI:
    """Minimal client for the Prometheus HTTP API (query, query_range, series)."""
    def __init__(self, endpoint='http://127.0.0.1:9090/'):
        """
        :param endpoint: base address of the Prometheus server
        """
        self.endpoint = endpoint
    @staticmethod
    def _to_timestamp(input_):
        """
        Convert input to an ISO-8601 'T'-separated UTC time string for
        Prometheus. Callers append 'Z' to the returned value.

        Accepted inputs:
          - datetime: formatted as-is (naive datetimes are treated as UTC,
            consistent with the utcnow() branches below)
          - 'now': current UTC time
          - numeric (or numeric string): > 0 is a UNIX timestamp,
            0 means now, < 0 is an offset in seconds from now

        BUG FIX: the datetime branch previously returned a float
        (.timestamp()) and the positive-number branch returned the number
        unchanged, so callers' `+ 'Z'` string concatenation raised
        TypeError. Every branch now returns a string.
        :param input_:
        :return: ISO-8601 time string (no trailing 'Z')
        """
        if isinstance(input_, datetime):
            return input_.isoformat('T')
        if input_ == 'now':
            return datetime.utcnow().isoformat('T')
        if isinstance(input_, str):
            input_ = float(input_)
        if isinstance(input_, (int, float)):
            if input_ > 0:
                # Positive numbers are UNIX timestamps.
                return datetime.utcfromtimestamp(input_).isoformat('T')
            if input_ == 0:  # return now
                return datetime.utcnow().isoformat('T')
            if input_ < 0:
                return (datetime.utcnow() + timedelta(seconds=input_)).isoformat('T')
    def query(self, query='prometheus_build_info'):
        """Run an instant query via /api/v1/query."""
        return self._get(
            uri='/api/v1/query',
            params=dict(
                query=query
            )
        )
    def query_range(self, query='prometheus_build_info', start=-60, end='now', duration=60):
        """Run a range query via /api/v1/query_range.

        :param start: passed to _to_timestamp (default: 60s ago)
        :param end: passed to _to_timestamp, or None to omit
        :param duration: step size in seconds
        """
        params = {
            'query': query
        }
        if end is not None:
            params['end'] = self._to_timestamp(end) + 'Z'
        if start:
            params['start'] = self._to_timestamp(start) + 'Z'
        if duration:
            params['step'] = duration
        print(params)
        return self._get(
            uri='/api/v1/query_range',
            params=params
        )
    def series(self, match='prometheus_build_info', start=-86400, end='now'):
        """Get series matching *match* via /api/v1/series."""
        params = {
            'match[]': match
        }
        if end is not None:
            params['end'] = self._to_timestamp(end) + 'Z'
        if start:
            params['start'] = self._to_timestamp(start) + 'Z'
        print(params)
        return self._get(
            uri='/api/v1/series',
            params=params
        )
    def _get(self, uri, params, method='GET'):
        """Issue the HTTP GET against the endpoint and return parsed JSON."""
        url = urljoin(self.endpoint, uri)
        assert method == 'GET'
        result = requests.get(
            url=url,
            params=params
        )
        return result.json()

View File

@@ -0,0 +1,949 @@
""" Alert On Metrics functions"""
import copy
import itertools
import json
import os
import random
import smtplib
from email.mime.text import MIMEText
from socket import gaierror
from time import sleep
from hashlib import md5
import requests
from statsd import StatsClient
from serviceapp.prom_api import PromAPI
# Status strings indexed by the per-tag alert state code (see the legacy
# key/value scheme documented in build_alert_message):
#   0 = no alert / recovery, 1 = new warning, 2 = existing warning,
#   3 = new critical, 4+ = existing critical.
alert_status = [
    'RECOVERY',
    'WARNING',
    'WARNING',
    'CRITICAL',
    'CRITICAL',
    'CRITICAL']
def build_alert_message(alert, minvalue, maxvalue, result, logger,
                        availability, tag=None, alert_tags=None):
    """
    Build the alert message
    Args:
        alert: the alert object that includes a tag definition
        minvalue: the min value to test against the threshold
        maxvalue: the max value to test against the threshold
        result: the response back from kairosdb
        logger (log object): does the logging
        availability: Send availability stat 1
        tag: If passed in will use this value for the tag instead of
        getting it from the result object
        alert_tags: the tags corresponding to the result, used if an
        alert has to be triggered and a custom routing per tag is configured
    Returns:
        Alert message string, currently always None (see note below)
    """
    # NOTE(review): the entire original implementation of this function was
    # left commented out in-place (mid-refactor, per the "BREEL WORKING
    # HERE" markers), so the function body executed nothing and implicitly
    # returned None. The dead commented-out code has been removed; callers
    # already tolerate the None return (they filter None entries out of
    # alert_list before dispatching).
    # TODO: delegate to the Alert class / Alert_Factory once that refactor
    # is complete, returning
    # (alert_entity, alert_body, state_code, alert_tags, tag_md5) as the
    # downstream send_alerts() expects.
    return None
def check_kairosdb_alert(
        alert_config,
        service_config,
        logger,
        production_mode=True):
    """
    Run one alert's KairosDB check loop: query, evaluate, dispatch.

    Args:
        alert_config (dict): Config of the alert to run
        service_config (dict): Holds things like urls, tokens and other things
        logger (log object): does the logging
        production_mode (bool): when False, log the alert instead of sending
    Returns:
        None (this function loops forever, sleeping alert_config['interval']
        seconds between iterations)
    """
    availability = False
    # SLEEP A RANDOM TIME BETWEEN 0 AND INTERVAL SO THAT ALL ALERTS DON'T
    # START AT THE SAME TIME
    wait_time = random.randint(0, alert_config['interval'])
    logger.info(
        "ALERT_CONFIG: {}\tsleep: {}".format(
            alert_config['id'],
            wait_time))
    sleep(wait_time)
    # For metrics with availability set to true, we default the interval to 5
    # mins due Grafana limitations
    if 'availability' in alert_config and alert_config['availability']:
        availability = True
    # ====================
    # EACH CHECK JUST LOOPS
    # ====================
    ret = None
    while True:
        try:
            send_stat("check_run", 1, {'id': alert_config['id']})
            # BUILD URL FOR KAIROSDB METRICS AND QUERY FOR RESULTS
            # NOTE(review): os.path.join is given a single pre-concatenated
            # argument, so kairosdb_url is presumably expected to end with
            # '/' -- confirm; urllib.parse.urljoin would be safer.
            query_url = os.path.join(
                service_config['kairosdb_url'] +
                "api/v1/datapoints/query")
            ret = requests.post(
                query_url,
                data=json.dumps(
                    alert_config['query']),
                timeout=service_config['timeout'])
            assert ret.status_code == 200
            # GOT DATA BACK, NOW TO COMPARE IT TO THE THRESHOLD
            results = ret.json()['queries'][0]['results']
            logger.debug(
                "Got back {} results for alert {}".format(
                    len(results), alert_config['id']))
            log_alert_results(results, alert_config, logger)
            alert_list = []
            # LOOP THROUGH ALL THE RESULTS
            for r in results:
                alert_tags = (get_alert_tags(alert_config, r)
                              if has_custom_alert_routing(alert_config) else None)
                # OUR QUERY RETURNED SOME VALUES - FIND MIN AND MAX VALUES
                # THEREIN AND EXAMINE FOR FAILURE
                if r['values']:
                    minvalue = min([x[1] for x in r['values']])
                    maxvalue = max([x[1] for x in r['values']])
                    # SEND VALUES TO BUILD_ALERT_MESSAGE, WHICH RETURNS NONE OR
                    # AN OBJECT
                    alert_list.append(
                        build_alert_message(
                            alert_config,
                            minvalue,
                            maxvalue,
                            r,
                            logger,
                            availability,
                            alert_tags=alert_tags))
                # THIS MEANS OUR KAIROS QUERY RETURNED NOTHING. COULD BE NETWORK
                # ISSUES. WE WILL TOLERATE THIS FOR X OCCURRENCES. (X=10)
                # AFTER X OCCURRENCES OF KAIROS NOT RETURNING DATA WE WILL CLEAR
                # AOM'S BRAIN FOR THIS ALERT ID AND TAG COMBINATION TO AVOID A
                # LATER OCCURRENCE CAUSING A PREMATURE ALERT.
                # A NO-OP IF NO HISTORY.
                elif 'alert_tags' in alert_config:
                    for key in alert_config['alert_tags']:
                        if ('count' not in key and 'noresult' not in key and
                                alert_config['alert_tags'][key] > 0):
                            key_noresult = key + "_noresult"
                            key_count = key + "_count"
                            if alert_config['alert_tags'][key_noresult] > 10:
                                logger.info("{} occurrences of no results back "
                                            "for {}, clear out counts for tag '{}'".format(
                                                alert_config['alert_tags'][key_noresult],
                                                alert_config['id'], key))
                                alert_list.append(
                                    build_alert_message(
                                        alert_config,
                                        0,
                                        0,
                                        None,
                                        logger,
                                        availability,
                                        key,
                                        alert_tags=alert_tags))
                                alert_config['alert_tags'][key] = 0
                                alert_config['alert_tags'][key_count] = 0
                                alert_config['alert_tags'][key_noresult] = 0
                            else:
                                alert_config['alert_tags'][key_noresult] += 1
                                logger.info("{} occurrences of no results back "
                                            "for {}, tag '{}'".format(
                                                alert_config['alert_tags'][key_noresult],
                                                alert_config['id'], key))
            # SEND ALL ALERTS FOUND TO THE ALERT HANDLERS THAT ARE NOT NONE
            for alert in [x for x in alert_list if x is not None]:
                if production_mode:
                    send_alerts(
                        alert,
                        copy.deepcopy(alert_config),
                        service_config['victorops_url'],
                        service_config['slack_url'],
                        service_config['slack_token'],
                        service_config['smtp_server'],
                        service_config['sensu_endpoint'],
                        service_config['uchiwa_url'],
                        logger)
                else:
                    logger.info(
                        "Sending alert for: {}".format(
                            alert_config.get('id')))
        # HANDLE THE UNEXPECTED
        except TimeoutError:
            # BUG FIX (message): "took to long" -> "took too long"
            logger.error("Query [{}] took too long to run".format(
                alert_config['id']))
        except AssertionError:
            # BUG FIX (message): "KairsoDB" -> "KairosDB"
            logger.error(
                "KairosDB query failed: {}\n"
                "HTTP status code:\t{}\n"
                "Error Message:\t{}\nQuery:\n"
                "{}".format(
                    ret.url,
                    ret.status_code,
                    ret.text,
                    alert_config['query']))
        except gaierror:
            logger.error(
                "Unable to connect to smtp server: {}".format(
                    service_config['smtp_server']))
        except Exception as e:
            logger.error(
                "Unhandled exception {} on alert: {}".format(
                    str(e), alert_config['id']))
        finally:
            sleep(alert_config['interval'])
def check_prometheus_alert(
        alert_config,
        service_config,
        logger,
        production_mode=True):
    """
    Run one alert's Prometheus check loop: query, evaluate, dispatch.

    Args:
        alert_config (dict): Config of the alert to run
        service_config (dict): Holds things like urls, tokens and other things
        logger (log object): does the logging
        production_mode (bool): when False, log the alert instead of sending
    Returns:
        None (this function loops forever, sleeping alert_config['interval']
        seconds between iterations)
    """
    # SLEEP A RANDOM TIME BETWEEN 0 AND INTERVAL SO THAT ALL ALERTS DON'T
    # START AT THE SAME TIME
    wait_time = random.randint(0, alert_config['interval'])
    logger.info(
        "ALERT_CONFIG: {}\tsleep: {}".format(
            alert_config['id'],
            wait_time))
    sleep(wait_time)
    # For metrics with availability set to true, we default the interval to 5
    # mins due to Grafana limitations
    availability = bool(alert_config.get('availability'))
    # ====================
    # EACH CHECK JUST LOOPS
    # ====================
    ret = None
    while True:
        try:
            send_stat("check_run", 1, {'id': alert_config['id']})
            prom_api = PromAPI(endpoint=alert_config['prometheus_url'])
            ret = prom_api.query_range(
                query=alert_config['query'],
                start=alert_config['start_time'],
                end=alert_config['end_time'],
                duration=alert_config['interval'])
            assert ret['status'] == 'success'
            # GOT DATA BACK, NOW TO COMPARE IT TO THE THRESHOLD
            results = ret['data']['result']
            logger.debug(
                "Got back {} results for alert {}".format(
                    len(results), alert_config['id']))
            log_alert_results(results, alert_config, logger)
            alert_list = []
            # LOOP THROUGH ALL THE RESULTS
            for r in results:
                alert_tags = (get_alert_tags(alert_config, r) if
                              has_custom_alert_routing(alert_config) else None)
                # REARRANGE RESULT TO MORE CLOSELY MATCH KAIROSDB RESULT
                r['tags'] = {key: [value]
                             for (key, value) in r['metric'].items()}
                # OUR QUERY RETURNED SOME VALUES - FIND MIN AND MAX VALUES
                # THEREIN AND EXAMINE FOR FAILURE
                if r['values']:
                    raw_values = [value for _, value in r['values']]
                    min_value = float(min(raw_values))
                    max_value = float(max(raw_values))
                    # SEND VALUES TO BUILD_ALERT_MESSAGE, WHICH RETURNS NONE OR
                    # AN OBJECT
                    alert_list.append(
                        build_alert_message(
                            alert_config,
                            min_value,
                            max_value,
                            r,
                            logger,
                            availability,
                            alert_tags=alert_tags))
                # THIS MEANS OUR QUERY RETURNED NOTHING. COULD BE NETWORK ISSUES
                # WE WILL TOLERATE THIS FOR X OCCURRENCES. (X=10)
                # AFTER X OCCURRENCES OF NOT RETURNING DATA WE WILL CLEAR AOM'S
                # BRAIN FOR THIS ALERT ID AND TAG COMBINATION TO AVOID A LATER
                # OCCURRENCE CAUSING A PREMATURE ALERT. A NO-OP IF NO HISTORY.
                elif 'alert_tags' in alert_config:
                    for key in alert_config['alert_tags']:
                        if ('count' not in key and 'noresult' not in key and
                                alert_config['alert_tags'][key] > 0):
                            key_noresult = key + "_noresult"
                            key_count = key + "_count"
                            if alert_config['alert_tags'][key_noresult] > 10:
                                logger.info("{} occurrences of no results back "
                                            "for {}, clear out counts for tag '{}'".format(
                                                alert_config['alert_tags'][key_noresult],
                                                alert_config['id'], key))
                                alert_list.append(
                                    build_alert_message(
                                        alert_config,
                                        0,
                                        0,
                                        None,
                                        logger,
                                        availability,
                                        key,
                                        alert_tags=alert_tags))
                                alert_config['alert_tags'][key] = 0
                                alert_config['alert_tags'][key_count] = 0
                                alert_config['alert_tags'][key_noresult] = 0
                            else:
                                alert_config['alert_tags'][key_noresult] += 1
                                logger.info("{} occurrences of no results back "
                                            "for {}, tag '{}'".format(
                                                alert_config['alert_tags'][key_noresult],
                                                alert_config['id'], key))
            # SEND ALL ALERTS FOUND TO THE ALERT HANDLERS THAT ARE NOT NONE
            for alert in [x for x in alert_list if x is not None]:
                if production_mode:
                    send_alerts(
                        alert,
                        copy.deepcopy(alert_config),
                        service_config['victorops_url'],
                        service_config['slack_url'],
                        service_config['slack_token'],
                        service_config['smtp_server'],
                        service_config['sensu_endpoint'],
                        service_config['uchiwa_url'],
                        logger)
                else:
                    logger.info(
                        "Sending alert {}".format(
                            alert_config.get('id')))
        # HANDLE THE UNEXPECTED
        except TimeoutError:
            # BUG FIX (message): "took to long" -> "took too long"
            logger.error(
                "Query [{}] took too long to run".format(
                    alert_config['id']))
        except AssertionError:
            logger.error(
                "Prometheus query failed:\n"
                "Status:\t{}\n"
                "Error Type:\t{}\n"
                "Error Message:\t{}\n"
                "Query:\n{}".format(
                    ret['status'],
                    ret['errorType'],
                    ret['error'],
                    alert_config['query']))
        except gaierror:
            logger.error(
                "Unable to connect to smtp server: {}".format(
                    service_config['smtp_server']))
        except Exception as e:
            logger.error(
                "Unhandled exception {} on alert: {}".format(
                    str(e), alert_config['id']))
        finally:
            sleep(alert_config['interval'])
# LOG ALERT RESULTS SO WE CAN DEBUG IF NEEDED
def log_alert_results(results, alert_config, logger):
    """Emit one debug log line per raw query result for an alert.

    Args:
        results: list of result objects returned from the metrics backend
        alert_config: config object of the alert (its 'id' prefixes each line)
        logger (log object): does the logging
    Returns:
        None, logs to logger
    """
    alert_id = alert_config['id']
    for result in results:
        logger.debug("{} - Result: {}".format(alert_id, result))
def send_alerts(
        alert,
        alert_config,
        victorops_url,
        slack_url,
        slack_token,
        smtp_server,
        sensu_endpoint,
        uchiwa_url,
        logger):
    """
    Fans a single alert out to its configured channels: VictorOps,
    email, Sensu, and Slack (Slack is delivered through Sensu).

    Args:
        alert: the alert tuple:
            alert[0] == subject / entity id
            alert[1] == body text
            alert[2] == status code (index into the module-level
                alert_status map and the local sensu_status maps)
            alert[3] == alert_tags (key for custom routing lookup)
            alert[4] == md5sum suffix used in the Sensu check name
        alert_config: the alert configuration object. NOTE: mutated in
            place (routing lookup replaces 'alerts', subdue filters pop
            channels), which is why callers pass a deep copy.
        victorops_url: url to victorops
        slack_url: url to slack api calls (unused -- direct Slack path
            is commented out below; Slack goes through Sensu)
        slack_token: the token for the alert (unused, see slack_url)
        smtp_server: The server to send mail messages too
        sensu_endpoint: Sensu results API endpoint; authenticated with
            the API_USER / API_PASS environment variables
        uchiwa_url: base url of the Uchiwa dashboard linked from Sensu
            alert output
        logger (log object): does the logging
    Returns: None
    """
    # GOING TO USE THIS FOR TAGGING SOME METRICS ABOUT WHAT ALERT CHANNEL WAS
    # USED
    tag_dict = dict()
    tag_dict['alert'] = alert_config['id']
    is_custom_alert_routing = has_custom_alert_routing(alert_config)
    if is_custom_alert_routing:
        # Replace the generic channel config with the entry keyed by this
        # alert's tag tuple, falling back to the lookup default.
        alert_routing = alert_config.get('alert_routing_lookup', {})
        alert_config['alerts'] = alert_routing.get(
            alert[3], alert_config['alerts']['lookup']['default'])
    # once we move all alerts into sensu, we dont need to do this
    if 'filters' in alert_config:
        logger.info(
            "alert_status : {}, alert_config: {}".format(
                alert[2], alert_config))
        # Statuses 1 and 2 are treated as sub-critical here: the subdue
        # filters drop the corresponding channel for them.
        if 'slack_subdue' in alert_config['filters'] and alert[2] in (
                1, 2) and alert_config['filters']['slack_subdue']:
            # unless the alert is critical we dont send it
            logger.info("Removed slack, alert_config: {}".format(alert_config))
            alert_config['alerts'].pop('slack', None)
        if ('victorops_subdue' in alert_config['filters'] and
                alert[2] in (1, 2) and
                alert_config['filters']['victorops_subdue']):
            # unless the alert is critical we dont send it
            alert_config['alerts'].pop('vo', None)
            logger.info("Removed vo, alert_config: {}".format(alert_config))
    # ====================
    # VICTOROPS HANDLING
    # ====================
    if 'vo' in alert_config['alerts']:
        for notify in alert_config['alerts']['vo']:
            payload = dict(entity_id=alert[0],
                           message_type=alert_status[alert[2]],
                           state_message=alert[1])
            r = None
            try:
                # NOTE(review): "application-json" looks like a typo for
                # "application/json"; VO appears to tolerate it -- confirm
                # before changing.
                r = requests.post(
                    victorops_url + notify,
                    data=json.dumps(payload),
                    headers={
                        "Content-type": "application-json"})
                assert r.status_code == 200
                # Record a VO alert sent event
                tag_dict['alert_channel_type'] = "VictorOps"
                tag_dict['who'] = "vo:{}".format(notify)
                send_stat("alert_channel", 1, tag_dict)
                # logger.info("TestInfo: {} alert for {}".format(alert_status(alert[2]), alert[0]))
            except AssertionError:
                logger.error(
                    "Post to VO failed for {}\n{}:\t{}".format(
                        alert_config['id'], r.status_code, r.text))
            except Exception as e:
                logger.error("Unhandled exception for alert_id:{} "
                             "when posting to VO: {}".format(
                                 alert_config['id'], str(e)))
    # ====================
    # EMAIL HANDLING
    # ====================
    # Email only fires for statuses 0, 1 and 3.
    if 'email' in alert_config['alerts'] and (
            alert[2] == 0 or alert[2] == 1 or alert[2] == 3):
        msg = MIMEText(alert[1])
        msg['Subject'] = '{} Status: {}'.format(
            alert[0], alert_status[alert[2]])
        msg['From'] = 'aom@qualtrics.com'
        msg['To'] = ','.join(
            [x + "@qualtrics.com" for x in alert_config['alerts']['email']])
        try:
            s = smtplib.SMTP(smtp_server)
            s.send_message(msg)
            s.quit()
            # Record an Email alert sent event
            tag_dict['alert_channel_type'] = "Email"
            tag_dict['who'] = "email:{}".format(msg['To'])
            send_stat("alert_channel", 1, tag_dict)
            # logger.info("TestInfo: {} alert for {}".format(alert_status(alert[2]), alert[0]))
        except Exception as e:
            logger.error(
                "Unhandled exception when sending mail for {} to {}\n{}".format(
                    alert_config['id'], smtp_server, str(e)))
    # ====================
    # SENSU HANDLING
    # ====================
    if 'sensu' in alert_config['alerts']:
        # Dictionary with static values for Sensu
        sensu_dict = {
            'source': 'AOM',
            'refresh': 3600,
            'occurrences': 1,
            'name': alert_config['id']+'__'+alert[4]}
        # if alert[3]:
        #     logger.info(alert)
        #     sensu_dict['name'] = '_'.join(
        #         [alert_config['id']] + sorted(list(alert[3])))
        if 'refresh' in alert_config:
            sensu_dict['refresh'] = alert_config['refresh']
        sensu_dict['interval'] = alert_config['interval']
        sensu_dict['handlers'] = []
        sensu_dict['dashboard'] = alert_config['url']
        if 'dependencies' in alert_config['alerts']['sensu'].keys():
            sensu_dict['dependencies'] = (alert_config['alerts']
                                          ['sensu']['dependencies'])
        if 'victorops' in alert_config['alerts']['sensu'].keys():
            sensu_dict['handlers'].append("victorops")
            sensu_dict['routing_key'] = (alert_config['alerts']
                                         ['sensu']['victorops'])
        # # Leave this here until we have email support in Sensu
        # if 'email' in alert_config['alerts']['sensu'].keys():
        #     sensu_dict['handlers'].append("email")
        #     # verify this option
        #     sensu_dict['email'] = alert_config['alerts']['sensu']['email']
        if 'slack' in alert_config['alerts']['sensu'].keys():
            sensu_dict['handlers'].append("slack")
            sensu_dict['slack_channel'] = (
                alert_config['alerts']['sensu']['slack'])
            # Format alert message (Slack-style <url|here> links)
            sensu_dict['dashboard'] = (
                "<{}|here> , Uchiwa: <{}?check={}|here> ".format(
                    alert_config['url'], uchiwa_url, alert_config['id']))
        if 'jira' in alert_config['alerts']['sensu'].keys():
            sensu_dict['handlers'].append("jira")
            sensu_dict.update(alert_config['alerts']['sensu']['jira'])
        if 'filters' in alert_config:
            sensu_dict['filters'] = alert_config['filters']
        # 0 = OK, 1 = WARNING, 2 = CRITICAL
        # Collapses the six internal statuses onto Sensu's three.
        sensu_status = {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2}
        sensu_dict['status'] = sensu_status[alert[2]]
        sensu_dict['output'] = alert[1]
        r = None
        try:
            user = os.environ['API_USER']
            passwd = os.environ['API_PASS']
            r = requests.post(
                sensu_endpoint,
                json.dumps(sensu_dict),
                auth=(
                    user,
                    passwd))
            assert r.status_code == 202
        except AssertionError:
            logger.error(
                "Post to Sensu failed {}\n{}:\t{}".format(
                    alert_config['id'],
                    r.status_code,
                    r.text))
        except Exception as e:
            logger.error("Unhandled exception for alert_id:{} "
                         "when posting to Sensu: {}".format(
                             alert_config['id'], str(e)))
    # ====================
    # SLACK HANDLING - all Slack alerts will go through Sensu
    # ====================
    # Like email, only statuses 0, 1 and 3 are forwarded.
    if 'slack' in alert_config['alerts'] and (
            alert[2] == 0 or alert[2] == 1 or alert[2] == 3):
        refresh = alert_config.get('refresh', 3600)
        dashboard = alert_config.get('url', '')
        sensu_status = {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2}
        sensu_dict2 = {'handlers': ['slack'],
                       'interval': alert_config['interval'],
                       'source': 'AOM',
                       'refresh': refresh,
                       'occurrences': 1,
                       'name': alert_config['id']+'__'+alert[4],
                       'dashboard': dashboard,
                       'status': sensu_status[alert[2]],
                       'output': alert[1]}
        if is_custom_alert_routing:
            sensu_dict2['name'] = '_'.join(
                [alert_config['id']] + list(alert[3]))
            sensu_dict2['dashboard'] = (
                "<{}|here> , Uchiwa: <{}?check={}|here> ".format(
                    alert_config['url'], uchiwa_url, alert_config['id']))
        # One Sensu result is posted per configured Slack channel.
        for channel in alert_config['alerts']['slack']:
            sensu_dict2['slack_channel'] = channel
            r = None
            try:
                user = os.environ['API_USER']
                passwd = os.environ['API_PASS']
                r = requests.post(
                    sensu_endpoint,
                    json.dumps(sensu_dict2),
                    auth=(
                        user,
                        passwd))
                assert r.status_code == 202
            except AssertionError:
                logger.error(
                    "Post to Sensu failed {}\n{}:\t{}".format(
                        alert_config['id'], r.status_code, r.text))
            except Exception as e:
                # NOTE(review): missing space -- these adjacent literals
                # concatenate to "postingto Sensu" in the logged message.
                logger.error("Unhandled exception for alert_id:{} when posting"
                             "to Sensu: {}".format(alert_config['id'], str(e)))
        # payload = dict(token=slack_token, channel=channel,
        #                text="{} Status: {}".format(alert[1], alert_status[alert[2]]))
        # r = None
        # try:
        #     r = requests.post(slack_url, data=payload)
        #     assert r.status_code == 200
        #     # Record an Slack alert sent event
        #     tag_dict['alert_channel_type'] = "Slack"
        #     tag_dict['who'] = "slack:{}".format(channel)
        #     send_stat("alert_channel", 1, tag_dict)
        #     # logger.info("TestInfo: {} alert for {}".format(alert_status(alert[2]), alert[0]))
        # except AssertionError:
        #     logger.error("Post to Slack failed for {}\n{}:\t{}".format(alert_config['id'], r.status_code, r.text))
        # except Exception as e:
        #     logger.error("Unhandled exception for alert_id:{} when posting to Slack: {}".format(alert_config['id'],
        #                  str(e)))
def send_metrics(alert, value, result, gaugename='stats'):
    """
    Sends the results from the alert check to statsd.

    Args:
        alert: alert config dict; 'tags' lists the tag names to report
            and 'id' names the alert.
        value: the gauge value to submit.
        gaugename: the statsd gauge to write to.
        result: query result; result['tags'][name] holds the values for
            each configured tag name.
    Returns: None
    """
    # GROUP ALL THE ALERTS TOGETHER SO THAT PEEPS CAN FILTER OUT BY TAG THEIR
    # SPECIFIC ALERTS.  The per-tag value lists are flattened in
    # configured-tag order and paired positionally with the tag names
    # (assumes one value per tag -- TODO confirm upstream guarantees that).
    flat_values = list(itertools.chain.from_iterable(
        result['tags'][name] for name in alert['tags']))
    tag_dict = {name: flat_values[pos]
                for pos, name in enumerate(alert['tags'])}
    tag_dict['alert'] = alert['id']
    # SEND THE METRIC
    send_stat(gaugename, value, tag_dict)
def send_stat(gaugename, value, tag_dict, statprefix='aom'):
    """Submit one gauge value to statsd on the telegraf host."""
    # NOTE(review): a client is created on every call -- presumably
    # cheap, but worth confirming against the statsd client docs.
    statsd_client = StatsClient('telegraf', 8125, statprefix)
    statsd_client.gauge(gaugename, value, tags=tag_dict)
def has_custom_alert_routing(alert_config):
    """Return True when the alert's 'alerts' section defines a routing
    lookup table."""
    alerts_section = alert_config['alerts']
    return 'lookup' in alerts_section
def get_alert_tags(alert_config, query_result):
    """Extract the configured routing-tag values from a query result.

    Prometheus results carry tag values under 'metric'; other (kairosdb)
    results carry them as lists under 'tags'.  Tags absent from the
    result are silently skipped.
    """
    is_prometheus = alert_config.get('query_type') == 'prometheus'
    values = []
    for tag in alert_config['alerts']['lookup']['tags']:
        if is_prometheus and tag in query_result.get('metric', {}):
            values.append(query_result['metric'][tag])
            continue
        kairos_tags = query_result.get('tags', {})
        if tag in kairos_tags and kairos_tags[tag]:
            # kairos tag values are lists; only the first entry is used.
            values.append(kairos_tags[tag][0])
    return tuple(values)

View File

@@ -0,0 +1,123 @@
import unittest
class Mock_Alert_Config():
    """Stub of the alert-config object used by the Alert tests.

    Provides a critical threshold band of [0, 10], no warning
    thresholds, a single required occurrence, and in-memory storage for
    levels and per-tag values.
    """
    def __init__(self):
        # Per-tag persisted values (e.g. occurrence counters).
        self.cache = {}
        # Last level recorded per key.
        self.level = {}
        self.id = "id"

    def set_level(self, k, v):
        self.level[k] = v

    def get_level(self, k):
        # Unknown keys read as None, like the real config.
        return self.level.get(k)

    def init_for_tags(self, *args):
        # No-op stand-in for the real per-tag initialisation.
        pass

    def occurrences(self):
        # Alerts fire after a single breached occurrence.
        return 1

    def get_threshold(self, upper, warning):
        """Return (value, is_set) -- warnings unset, criticals [0, 10]."""
        if warning:
            return None, False
        return (10, True) if upper else (0, True)

    def get_tags(self):
        return "tagsC, tagsD".split(", ")

    def set_for_tags(self, key, value):
        # The original seeded the key with 0 before overwriting it --
        # a dead store; a plain assignment is equivalent.
        self.cache[key] = value

    def get_for_tags(self, key):
        # Missing keys default to 0 and the default is persisted.
        return self.cache.setdefault(key, 0)
class Mock_Result():
    """Fake query result: subscripting with "tags" yields the object
    itself; any other key is echoed straight back."""
    def __init__(self):
        pass

    def __getitem__(self, key):
        return self if key == "tags" else key
class Mock_Logger():
    """Logger stand-in whose level methods all discard their arguments."""
    def __init__(self):
        self.error = self.log
        self.warn = self.log
        self.debug = self.log
        self.info = self.log
        self.warning = self.log

    def log(self, *args):
        pass
class Test_Alert(unittest.TestCase):
    """Exercises alert.Alert tag handling, firing logic, formatting and
    occurrence counting against the mocks defined above."""

    def test_set_tags(self):
        import alert
        ac = Mock_Alert_Config()
        res = Mock_Result()
        # With no explicit tags the Alert falls back to "instance".
        al = alert.Alert(ac, Mock_Logger(), None, None, -1, 11)
        self.assertEqual(al.get_tags(), "instance")
        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, -1, 11)
        self.assertEqual(al.get_tags(), "tagsA, tagsB")
        # Re-tagging is idempotent.
        al.set_tags("a, b, c", res)
        self.assertEqual(al.get_tags(), "a, b, c")
        al.set_tags("a, b, c", res)
        self.assertEqual(al.get_tags(), "a, b, c")

    def test_firing(self):
        import alert
        ac = Mock_Alert_Config()
        res = Mock_Result()
        # The mock's critical band is [0, 10]: firing whenever either
        # min or max escapes the band.
        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, -1, 11)
        self.assertTrue(al.get_firing())
        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, 1, 11)
        self.assertTrue(al.get_firing())
        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, -1, 9)
        self.assertTrue(al.get_firing())
        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, 1, 9)
        self.assertFalse(al.get_firing())

    def test_str(self):
        import alert
        ac = Mock_Alert_Config()
        res = Mock_Result()
        # Bind to 'al', not 'alert' -- the original shadowed the module
        # import with the instance.
        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, 0, 10)
        self.assertEqual(al.name(), "Metric: id for tagsA, tagsB")
        self.assertEqual(al.body(), "")

    def test_occurrences(self):
        import alert
        ac = Mock_Alert_Config()
        res = Mock_Result()
        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, 0, 10)
        self.assertEqual(False, al.occurrences_breached)
        al.set_occurrences()
        al.set_occurrences()
        al.set_occurrences()
        # Not firing: the per-tag counter never advances.
        self.assertEqual(False, al.occurrences_breached)
        self.assertEqual(0, ac.get_for_tags(al.get_tags()))
        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, 0, 11)
        self.assertEqual(True, al.occurrences_breached)
        al.set_occurrences()
        al.set_occurrences()
        al.set_occurrences()
        self.assertEqual(True, al.occurrences_breached)
        self.assertEqual(4, ac.get_for_tags(al.get_tags()))


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,33 @@
import unittest
import alert_factory
class Mock_Alert():
    """Records whatever positional arguments it was constructed with."""
    def __init__(self, *args):
        self.args = args
class Mock_Logger():
    """Logger stub that prints every call so test output shows traffic."""
    def __init__(self):
        for name in ("info", "warn", "warning", "error", "debug"):
            setattr(self, name, self.log)

    def log(self, *args, **kwargs):
        print(args, kwargs)
class Test_Alert_Factory(unittest.TestCase):
    """Checks the factory builds instances of the (patched) Alert class."""

    def setUp(self):
        # Swap the real Alert class for the recording mock.
        self.was = alert_factory.Alert
        alert_factory.Alert = Mock_Alert

    def tearDown(self):
        alert_factory.Alert = self.was

    def test(self):
        af = alert_factory.Alert_Factory(None, Mock_Logger())
        built = af.build(0, 5, None, "tagA, tagB", False, "tagC, tagD")
        self.assertTrue(type(built) == Mock_Alert)


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,8 @@
import unittest
class Test_Service(unittest.TestCase):
    """Placeholder suite for the service module."""

    def test(self):
        # Deliberately-failing marker until real service tests exist.
        raise Exception("not impl")


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,14 @@
import unittest
class Test_Threshold(unittest.TestCase):
    """The base Threshold must never be breachable or exceeded."""

    def test(self):
        import threshold
        base = threshold.Threshold(5)
        self.assertFalse(base.can_breach())
        # Neither side of the stored bound counts as exceeded.
        self.assertFalse(base.exceeds(7))
        self.assertFalse(base.exceeds(3))


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,14 @@
import unittest
class Test_Threshold_Lower(unittest.TestCase):
    """A lower threshold breaches on values below it, not above it."""

    def test(self):
        import threshold_lower
        tl = threshold_lower.Threshold_Lower(5)
        # Must CALL can_breach(): the original asserted the bound method
        # object itself, which is always truthy, so the check was a no-op.
        self.assertTrue(tl.can_breach())
        self.assertTrue(tl.exceeds(3))
        self.assertFalse(tl.exceeds(7))


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,14 @@
import unittest
class Test_Threshold_Upper(unittest.TestCase):
    """An upper threshold breaches on values above it, not below it."""

    def test(self):
        import threshold_upper
        tl = threshold_upper.Threshold_Upper(5)
        # Must CALL can_breach(): the original asserted the bound method
        # object itself, which is always truthy, so the check was a no-op.
        self.assertTrue(tl.can_breach())
        self.assertTrue(tl.exceeds(7))
        self.assertFalse(tl.exceeds(3))


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,157 @@
import unittest
class Mock_Alert_Config():
    """Stub alert config exposing only critical thresholds (1 and 10)."""
    def __init__(self):
        self.upCrit = 10
        self.lowCrit = 1

    def get_threshold(self, upper, warn):
        # Warning thresholds are unset in this stub.
        if warn:
            return None, False
        value = self.upCrit if upper else self.lowCrit
        return value, True
class Test_Thresholds(unittest.TestCase):
    """Exercises thresholds.Thresholds breach bookkeeping.

    Mock_Alert_Config only defines critical thresholds (lower=1,
    upper=10), so warning-level queries must never report breached.
    """

    def test_breached_both(self):
        import thresholds
        alert_config = Mock_Alert_Config()
        t = thresholds.Thresholds(alert_config)
        # min below the lower bound AND max above the upper bound.
        t.set_breached(alert_config.lowCrit - 1, alert_config.upCrit + 1)
        should_fire = [
            t.critical_breached(),
            t.lower_breached(),
            t.upper_breached(),
            t.level_breached(t.CRITICAL),
            t.end_breached(t.LOWER),
            t.end_breached(t.UPPER),
            t.get_breached(),
            t.get_breached(level=t.CRITICAL),
            t.get_breached(end=t.LOWER),
            t.get_breached(end=t.UPPER),
        ]
        for i, fired in enumerate(should_fire):
            self.assertTrue(fired, i)
        should_not_fire = [
            t.warning_breached(),
            t.level_breached(t.WARNING),
            t.get_breached(level=t.WARNING),
        ]
        for i, fired in enumerate(should_not_fire):
            self.assertFalse(fired, i)

    def test_breached_lower(self):
        import thresholds
        alert_config = Mock_Alert_Config()
        t = thresholds.Thresholds(alert_config)
        # Only the lower bound is violated.
        t.set_breached(alert_config.lowCrit - 1, alert_config.upCrit)
        should_fire = [
            t.critical_breached(),
            t.lower_breached(),
            t.level_breached(t.CRITICAL),
            t.end_breached(t.LOWER),
            t.get_breached(),
            t.get_breached(level=t.CRITICAL),
            t.get_breached(end=t.LOWER),
        ]
        for i, fired in enumerate(should_fire):
            self.assertTrue(fired, i)
        should_not_fire = [
            t.warning_breached(),
            t.upper_breached(),
            t.level_breached(t.WARNING),
            t.end_breached(t.UPPER),
            t.get_breached(level=t.WARNING),
            t.get_breached(end=t.UPPER),
        ]
        for i, fired in enumerate(should_not_fire):
            self.assertFalse(fired, i)

    def test_breached_upper(self):
        import thresholds
        alert_config = Mock_Alert_Config()
        t = thresholds.Thresholds(alert_config)
        # Only the upper bound is violated.
        t.set_breached(alert_config.lowCrit, alert_config.upCrit + 1)
        should_fire = [
            t.critical_breached(),
            t.upper_breached(),
            t.level_breached(t.CRITICAL),
            t.end_breached(t.UPPER),
            t.get_breached(),
            t.get_breached(level=t.CRITICAL),
            t.get_breached(end=t.UPPER),
        ]
        for i, fired in enumerate(should_fire):
            self.assertTrue(fired, i)
        should_not_fire = [
            t.warning_breached(),
            t.lower_breached(),
            t.level_breached(t.WARNING),
            t.end_breached(t.LOWER),
            t.get_breached(level=t.WARNING),
            t.get_breached(end=t.LOWER),
        ]
        for i, fired in enumerate(should_not_fire):
            self.assertFalse(fired, i)

    def test_breached_notset(self):
        import thresholds
        alert_config = Mock_Alert_Config()
        t = thresholds.Thresholds(alert_config)
        # set_breached never called: nothing may report breached.
        for i, fired in enumerate([
            t.warning_breached(),
            t.critical_breached(),
            t.upper_breached(),
            t.lower_breached(),
            t.level_breached(t.CRITICAL),
            t.level_breached(t.WARNING),
            t.end_breached(t.UPPER),
            t.end_breached(t.LOWER),
            t.get_breached(),
            t.get_breached(level=t.CRITICAL),
            t.get_breached(level=t.WARNING),
            t.get_breached(end=t.UPPER),
            t.get_breached(end=t.LOWER),
        ]):
            self.assertFalse(fired, i)

    def test_get_matching(self):
        import thresholds
        alert_config = Mock_Alert_Config()
        t = thresholds.Thresholds(alert_config)
        self.assertEqual(4, len([i for i in t.get_thresholds_matching()]))
        self.assertEqual(2, len([i for i in t.get_thresholds_matching(level=t.CRITICAL)]))
        self.assertEqual(2, len([i for i in t.get_thresholds_matching(level=t.WARNING)]))
        self.assertEqual(2, len([i for i in t.get_thresholds_matching(end=t.UPPER)]))
        self.assertEqual(2, len([i for i in t.get_thresholds_matching(end=t.LOWER)]))
        # BUG FIX: the level=/end= keyword values were swapped in the
        # original (CRITICAL/WARNING passed as 'end', UPPER/LOWER as
        # 'level'); it only passed because WARNING == UPPER == True and
        # CRITICAL == LOWER == False.
        self.assertEqual(1, len([i for i in t.get_thresholds_matching(level=t.CRITICAL, end=t.LOWER)]))
        self.assertEqual(1, len([i for i in t.get_thresholds_matching(level=t.CRITICAL, end=t.UPPER)]))
        self.assertEqual(1, len([i for i in t.get_thresholds_matching(level=t.WARNING, end=t.LOWER)]))
        self.assertEqual(1, len([i for i in t.get_thresholds_matching(level=t.WARNING, end=t.UPPER)]))


if __name__ == "__main__":
    unittest.main()

View File

@@ -0,0 +1,19 @@
class Threshold():
    """Base threshold: stores a bound but is inert.

    can_breach() and exceeds() always return False here; subclasses
    give the bound a direction by overriding them.
    """
    def __init__(self, threshold):
        self.threshold = threshold
        self.breached = False

    def can_breach(self):
        # The inert base can never fire.
        return False

    def exceeds(self, value):
        return False

    def set_breached(self, value):
        # Record whether the supplied value violates the bound.
        self.breached = self.exceeds(value)

    def get_breached(self):
        return self.breached

    def get_threshold(self):
        return self.threshold

View File

@@ -0,0 +1,8 @@
from threshold import Threshold
class Threshold_Lower(Threshold):
    """Threshold breached when the observed value drops below the bound."""
    def can_breach(self):
        return True

    def exceeds(self, value):
        # Lower bound: a smaller value than the threshold is a breach.
        return value < self.threshold

View File

@@ -0,0 +1,8 @@
from threshold import Threshold
class Threshold_Upper(Threshold):
    """Threshold breached when the observed value rises above the bound."""
    def can_breach(self):
        return True

    def exceeds(self, value):
        # Upper bound: a larger value than the threshold is a breach.
        return value > self.threshold

View File

@@ -0,0 +1,67 @@
from threshold_upper import Threshold_Upper
from threshold_lower import Threshold_Lower
from threshold import Threshold
class Thresholds():
    """Holds the four (warning/critical x upper/lower) thresholds for an
    alert and answers breach queries against them.

    Levels and ends are encoded as booleans: WARNING/CRITICAL select the
    severity, UPPER/LOWER the direction.  Thresholds the config does not
    define are represented by the inert Threshold base class.
    """
    WARNING = True
    CRITICAL = False
    UPPER = True
    LOWER = False

    def __init__(self, alert_config):
        self.alert_config = alert_config
        # thresholds[level][end] -> Threshold instance.
        self.thresholds = {}
        for level in [Thresholds.WARNING, Thresholds.CRITICAL]:
            self.thresholds[level] = {}
            for end in [Thresholds.UPPER, Thresholds.LOWER]:
                constructor = Threshold_Upper
                if end == Thresholds.LOWER:
                    constructor = Threshold_Lower
                self.thresholds[level][end] = self.create_threshold(
                    end, level, constructor)

    def create_threshold(self, isUpper, isWarning, constructor):
        """Build one threshold, falling back to the inert base class
        when the config defines no value for this (end, level) pair."""
        value, has = self.alert_config.get_threshold(isUpper, isWarning)
        if not has:
            constructor = Threshold
        return constructor(value)

    def warning_breached(self):
        return self.level_breached(Thresholds.WARNING)

    def critical_breached(self):
        return self.level_breached(Thresholds.CRITICAL)

    def upper_breached(self):
        return self.end_breached(Thresholds.UPPER)

    def lower_breached(self):
        return self.end_breached(Thresholds.LOWER)

    def level_breached(self, level):
        return self.get_breached(level=level)

    def end_breached(self, end):
        return self.get_breached(end=end)

    def can_breach(self):
        """Return True when at least one real (non-inert) threshold exists.

        BUG FIX: the original called get_thresholds_matching() on the
        self.thresholds dict, which has no such method (AttributeError);
        the generator lives on self.
        """
        can_breach = [t for t in self.get_thresholds_matching()
                      if not type(t) is Threshold]
        return len(can_breach) > 0

    def get_breached(self, level=None, end=None):
        """True when any threshold matching the optional filters breached."""
        for threshold in self.get_thresholds_matching(level=level, end=end):
            if threshold.get_breached():
                return True
        return False

    def set_breached(self, min_value, max_value):
        # Lower thresholds are tested against the observed minimum,
        # upper thresholds against the observed maximum.
        for threshold in self.get_thresholds_matching(end=Thresholds.LOWER):
            threshold.set_breached(min_value)
        for threshold in self.get_thresholds_matching(end=Thresholds.UPPER):
            threshold.set_breached(max_value)

    def get_thresholds_matching(self, level=None, end=None):
        """Yield thresholds filtered by level and/or end (None = all)."""
        for l in self.thresholds:
            if level is None or l == level:
                for e in self.thresholds[l]:
                    if end is None or e == end:
                        yield self.thresholds[l][e]