# QVolution2019.2/sleeper_agents_aom_engine/serviceapp/service.py

""" Alert On Metrics functions"""
import copy
import itertools
import json
import os
import random
import smtplib
from email.mime.text import MIMEText
from socket import gaierror
from time import sleep
from hashlib import md5
import requests
from statsd import StatsClient
import redis
# ALERT STATUS INDEXED BY THE PER-TAG ALARM LEVEL TRACKED IN
# build_alert_message: 0 = RECOVERY, 1/2 = NEW/EXISTING WARNING,
# 3/4+ = NEW/EXISTING CRITICAL
alert_status = [
    'RECOVERY',
    'WARNING',
    'WARNING',
    'CRITICAL',
    'CRITICAL',
    'CRITICAL']
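# ====================
# ILLUSTRATIVE ALERT CONFIG SHAPE (A MINIMAL SKETCH, NOT LOADED FROM HERE).
# FIELD NAMES ARE INFERRED FROM HOW THE FUNCTIONS BELOW READ THE ALERT DICT;
# ALL VALUES ARE MADE UP FOR ILLUSTRATION.
#
# _EXAMPLE_ALERT_CONFIG = {
#     'id': 'checkout_error_rate',          # used as the metric/alert name
#     'interval': 60,                        # seconds between check runs
#     'occurrences_threshold': 3,            # breaches needed before firing
#     'warning_upper_threshold': 5,          # optional upper/lower bounds
#     'critical_upper_threshold': 10,
#     'tags': ['dc'],                        # tags to group results by
#     'query': {},                           # KairosDB or Prometheus query
#     'alerts': {'slack': ['#team-alerts'],  # routing per channel
#                'email': ['oncall'],
#                'vo': ['team-routing-key']},
# }
# ====================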
def build_alert_message(alert, minvalue, maxvalue, result, logger,
availability, tag=None, alert_tags=None):
"""
Build the alert message
Args:
alert: the alert object that includes a tag definition
minvalue: the min value to test against the threshold
maxvalue: the max value to test against the threshold
result: the response back from kairosdb
logger (log object): does the logging
availability: Send availability stat 1
tag: If passed in will use this value for the tag instead of
getting it from the result object
alert_tags: the tags corresponding to the result, used if an
alert has to be triggered and a custom routing per tag is configured
Returns:
Alert message string
"""
# DEFAULT TO MAX VALUE AS THE VALUE WE WILL ALERT ON. LOGIC BELOW
# MAY CHANGE THIS.
value = maxvalue
# HANDLE THE CASE WHERE SOMEONE HAS NOT SPECIFIED ANY TAGS IN THEIR QUERY
# (USUALLY A GLOBAL ALL-DC QUERY)
if tag is None and result is not None:
tag = ', '.join(sorted(list(itertools.chain(
*[result['tags'][x] for x in alert['tags']]))))
    if not tag:
        tag = 'instance'
        logger.debug("No tag specified for alert {}".format(alert['id']))
    tag_count = tag + "_count"
    # WE WILL USE THIS ONE LATER FOR TRACKING OCCURRENCES OF KAIROSDB NOT
    # RETURNING RESULTS
    tag_noresult = tag + "_noresult"
# INSTEAD OF TRYING TO HANDLE LOGIC WHERE THESE ARE NOT IN THE OBJECT, PUT
# THEM IN AS SOON AS THEY ARE CREATED SO THAT ON FIRST RUN AN ALERT HAS ALL
# THE ALERT['alert_tags'][TAG] AND ALERT['alert_tags'][TAG_COUNT] NEEDED
if 'alert_tags' not in alert:
alert['alert_tags'] = {}
if tag not in alert['alert_tags']:
alert['alert_tags'][tag] = 0
if tag_count not in alert['alert_tags']:
alert['alert_tags'][tag_count] = 0
# IF WE HIT THIS FUNCTION THEN WE ALWAYS SET (OR RESET) THIS NORESULT
# COUNTER TO 0 IE. IF WE ARE HERE IT IMPLIES WE HAVE A RESULT FROM
# KAIROSDB OR WE ARE AT THE END OF A LONG PERIOD OF NORESULTS WHERE WE ARE
# CLEARING EVERYTHING OUT ANYWAY
alert['alert_tags'][tag_noresult] = 0
# FIRST FIND OUT WHAT THRESHOLDS ARE SET AND HAVE BEEN BREACHED
upper_critical_threshold = None
upper_warning_threshold = None
lower_warning_threshold = None
lower_critical_threshold = None
upper_threshold = None
lower_threshold = None
is_warning_alarm = False
is_critical_alarm = False
# UPPER
upper_threshold_exists = False
upper_warning_threshold_breached = False
upper_critical_threshold_breached = False
if 'warning_upper_threshold' in alert:
upper_threshold_exists = True
upper_warning_threshold = alert['warning_upper_threshold']
upper_threshold = upper_warning_threshold
if maxvalue >= upper_warning_threshold:
upper_warning_threshold_breached = True
is_warning_alarm = True
if 'critical_upper_threshold' in alert:
upper_critical_threshold = alert['critical_upper_threshold']
if not upper_threshold_exists:
upper_threshold = upper_critical_threshold
upper_threshold_exists = True
# IF CONFIG HAS A CRITICAL THRESHOLD SET AND WE PASS THAT THEN THAT IS
# OUR THRESHOLD FOR ALERTING
if maxvalue >= alert['critical_upper_threshold']:
upper_threshold = upper_critical_threshold
upper_critical_threshold_breached = True
is_critical_alarm = True
upper_threshold_breached = (upper_warning_threshold_breached
or upper_critical_threshold_breached)
# LOWER
lower_threshold_exists = False
lower_warning_threshold_breached = False
lower_critical_threshold_breached = False
if 'warning_lower_threshold' in alert:
lower_threshold_exists = True
lower_warning_threshold = alert['warning_lower_threshold']
lower_threshold = lower_warning_threshold
if minvalue <= lower_warning_threshold:
lower_warning_threshold_breached = True
is_warning_alarm = True
if 'critical_lower_threshold' in alert:
lower_critical_threshold = alert['critical_lower_threshold']
if not lower_threshold_exists:
lower_threshold = lower_critical_threshold
lower_threshold_exists = True
        # IF CONFIG HAS A CRITICAL THRESHOLD SET AND WE PASS THAT THEN THAT IS
        # OUR THRESHOLD FOR ALERTING
if minvalue <= lower_critical_threshold:
lower_threshold = lower_critical_threshold
lower_critical_threshold_breached = True
is_critical_alarm = True
lower_threshold_breached = (lower_warning_threshold_breached or
lower_critical_threshold_breached)
# THIS HAS TO MEAN THERE IS A PROBLEM WITH THE ALERT CONFIG
if lower_threshold is None and upper_threshold is None:
logger.debug(
"ERROR: alert {} does not have any thresholds set on {}".format(
alert['id'], tag))
# ON TO OCCURRENCES
if 'occurrences_threshold' in alert:
occurrences_threshold = alert['occurrences_threshold']
else:
occurrences_threshold = 1
alert_entity = "Metric: {} for {}".format(alert['id'], tag)
if 'url' not in alert:
alert['url'] = os.environ['AOM_GRAFANA_URL'] + str(alert['id'])
# ====================
# PREPARE ALERT BODY STRING AND SET THE VALUE WE WILL USE TO ALERT WITH
# ====================
alert_body = ''
if upper_threshold_breached:
alert_body = "{}\n{:.2f} >= {}\n{}".format(
alert_entity, value, upper_threshold, alert['url'])
if lower_threshold_breached:
value = minvalue
alert_body = "{}\n{:.2f} <= {}\n{}".format(
alert_entity, value, lower_threshold, alert['url'])
# SEND SOME STATS OUT AT THIS POINT AS WE KNOW WHERE WE ARE NOW. SEND THE
# THRESHOLDS TOO SO THEY CAN BE GRAPHED
if result is not None:
send_metrics(alert, value, result)
if 'critical_upper_threshold' in alert:
send_stat('upper_critical_threshold', upper_critical_threshold,
{'id': alert['id']})
if 'warning_upper_threshold' in alert:
send_stat('upper_warning_threshold', upper_warning_threshold,
{'id': alert['id']})
if 'critical_lower_threshold' in alert:
send_stat('lower_critical_threshold', lower_critical_threshold,
{'id': alert['id']})
if 'warning_lower_threshold' in alert:
send_stat('lower_warning_threshold', lower_warning_threshold,
{'id': alert['id']})
# NO RESULT OVERRIDES ALL
if result is None:
lower_threshold_breached = False
upper_threshold_breached = False
# ====================
# APPLY OUR LOGIC TO MAKE SOME DECISIONS
# ====================
current_alert_status = alert_status[0]
if not lower_threshold_breached and not upper_threshold_breached:
if result is not None:
if lower_threshold_exists and not upper_threshold_exists:
alert_body = "{}\n{:.2f} > {}\n{}".format(
alert_entity, value, lower_threshold, alert['url'])
logger.debug("GOOD: alert {} is higher than lower threshold {}"
"for value {} on tag {}".format(
alert['id'], lower_threshold, value, tag))
if upper_threshold_exists and not lower_threshold_exists:
alert_body = "{}\n{:.2f} < {}\n{}".format(
alert_entity, value, upper_threshold, alert['url'])
logger.debug("GOOD: alert {} is below the upper threshold {} "
"for value {} on tag {}".format(
alert['id'], upper_threshold, value, tag))
if upper_threshold_exists and lower_threshold_exists:
alert_body = "{}\n{} < {:.2f} < {}\n{}".format(
alert_entity, lower_threshold, value, upper_threshold,
alert['url'])
logger.debug("GOOD: alert {} is between thresholds {} and {} "
"for value {} on tag {}".format(
alert['id'], upper_threshold, lower_threshold,
value, tag))
# CHECK AND SEE IF TAG LOGIC IS SET, IE. WE WERE PREVIOUSLY IN ALARM
# STATE
if alert['alert_tags'][tag] > 0:
if result is not None:
send_metrics(alert, 1, result, current_alert_status)
logger.info(
"TestInfo: RECOVERY: Clearing values for [{}] - {}".format(
alert['id'], tag))
if result is None:
alert_body = ("{} RECOVERY due to no results found from "
"KairosDB query. Recommend you manually validate"
"recovery.\n{}").format(
alert_entity, alert['url'])
alert['alert_tags'][tag] = 0
alert['alert_tags'][tag_count] = 0
if availability:
logger.info("Sending availability stat 1")
send_metrics(alert, 1, result, 'service_level')
else:
# WE RETURN NONE IF NO ALERT (EITHER RECOVERY OR WARNING OR
# CRITICAL) NEEDS TO BE FIRED
alert['alert_tags'][tag_count] = 0
if availability:
logger.info("Sending availability stat 1")
send_metrics(alert, 1, result, 'service_level')
return None
else:
# ====================
# SET KEY / VALUE FOR TAG ON ALERT
# 0 == No Alert
# 1 == Warning
# 2 == Existing Warning Alert
# 3 == New Critical
# 4+ == Existing Critical Alert
# ====================
        # INCREMENT THE TAG_COUNT (IT WAS INITIALISED TO 0 ABOVE)
        alert['alert_tags'][tag_count] += 1
        # THE ALERT WON'T FIRE UNLESS TAG_COUNT HAS REACHED THE OCCURRENCES
        # THRESHOLD (EITHER 1 OR WHATEVER WAS CONFIGURED). ONCE REACHED,
        # BUILD AND RETURN THE ALERT.
if alert['alert_tags'][tag_count] >= occurrences_threshold:
# >= 4 MEANS THIS IS A KNOWN CRITICAL, SO NO-OP
if alert['alert_tags'][tag] < 4:
if is_warning_alarm and not is_critical_alarm:
# THIS HANDLES GOING STRAIGHT FROM NORMAL TO WARNING LEVEL
if alert['alert_tags'][tag] == 0:
# NEW WARNING
alert['alert_tags'][tag] = 1
logger.info("TestInfo: WARNING (NEW): {} - {}".format(
alert['id'], tag))
else:
# EXISTING WARNING
alert['alert_tags'][tag] = 2
logger.info("TestInfo: WARNING (EXISTING): {} - {}".format(
alert['id'], tag))
if is_critical_alarm:
# THIS HANDLES GOING FROM WARNING LEVEL TO CRITICAL LEVEL
if (alert['alert_tags'][tag] == 1 or
alert['alert_tags'][tag] == 2):
alert['alert_tags'][tag] = 3
logger.info("TestInfo: CRITICAL (WAS WARNING): {} - {}".format(
alert['id'], tag))
else:
# THIS HANDLES GOING STRAIGHT FROM NORMAL TO CRITICAL
# LEVEL
if alert['alert_tags'][tag] < 3:
# NEW CRITICAL
alert['alert_tags'][tag] = 3
logger.info("TestInfo: CRITICAL (NEW): {} - {}".format(
alert['id'], tag))
else:
# EXISTING CRITICAL
alert['alert_tags'][tag] = 4
logger.info("TestInfo: CRITICAL (EXISTING): {} - {}".format(
alert['id'], tag))
# RECORD THE FACT THAT SOMETHING IS STILL IN ALARM STATE IN METRICS
# EVEN IF NOT ACTIVELY ALERTING ON IT
if is_critical_alarm:
current_alert_status = alert_status[3]
send_metrics(alert, 2, result, current_alert_status)
if availability:
logger.info("Sending availability stat 0")
send_metrics(alert, 0, result, 'service_level')
if is_warning_alarm and not is_critical_alarm:
current_alert_status = alert_status[1]
send_metrics(alert, 1, result, current_alert_status)
if availability:
logger.info("Sending availability stat 1")
send_metrics(alert, 1, result, 'service_level')
logger.debug("{} alert for value {} of {} for tag {} has occurred "
"{} times. Threshold is >= {} times.".format(
current_alert_status,
value,
alert['id'],
tag,
alert['alert_tags'][tag_count],
occurrences_threshold))
else:
# WE RETURN NONE IF NO ALERT (EITHER RECOVERY OR WARNING OR
# CRITICAL) NEEDS TO BE FIRED
logger.debug("Value {} of {} for tag {} has occurred {} time(s) < "
"threshold of {}".format(
value,
alert['id'],
tag,
alert['alert_tags'][tag_count],
occurrences_threshold))
if availability:
logger.info("Sending availability stat")
send_metrics(alert, 1, result, 'service_level')
return None
logger.debug(
"Alert {}->[{}]->{}, Occurrences={}".format(
alert['id'], tag, current_alert_status,
alert['alert_tags'][tag_count]))
    return (alert_entity, alert_body, alert['alert_tags'][tag], alert_tags,
            md5(tag.encode('utf-8')).hexdigest()[:10])
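# ====================
# SKETCH OF HOW build_alert_message IS CONSUMED (ILLUSTRATIVE ONLY). THE
# CHECK LOOPS BELOW PASS IT THE MIN/MAX OF A QUERY RESULT AND, ONCE A
# THRESHOLD PLUS THE OCCURRENCES COUNT ARE BREACHED, GET BACK THE TUPLE THAT
# send_alerts EXPECTS. VALUES HERE ARE MADE UP.
#
# alert = build_alert_message(alert_config, minvalue=0.0, maxvalue=12.5,
#                             result=result, logger=logger,
#                             availability=False)
# if alert is not None:
#     entity, body, level, routing_tags, tag_hash = alert
#     # level INDEXES alert_status, E.G. alert_status[level] == 'CRITICAL'
# ====================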
def check_kairosdb_alert(
alert_config,
service_config,
logger,
production_mode=True):
    """
    Run the KairosDB check loop for a single alert.

    Args:
        alert_config (dict): Config of the alert to run
        service_config (dict): Holds things like urls, tokens and other things
        logger (log object): does the logging
        production_mode (bool): when False, only log instead of sending alerts

    Returns:
        None
    """
availability = False
# SLEEP A RANDOM TIME BETWEEN 0 AND INTERVAL SO THAT ALL ALERTS DON'T
# START AT THE SAME TIME
wait_time = random.randint(0, alert_config['interval'])
logger.info(
"ALERT_CONFIG: {}\tsleep: {}".format(
alert_config['id'],
wait_time))
sleep(wait_time)
    # For metrics with availability set to true, we default the interval to 5
    # mins due to Grafana limitations
if 'availability' in alert_config and alert_config['availability']:
availability = True
# ====================
# EACH CHECK JUST LOOPS
# ====================
ret = None
while True:
try:
send_stat("check_run", 1, {'id': alert_config['id']})
            # BUILD URL FOR KAIROSDB METRICS AND QUERY FOR RESULTS
            query_url = (service_config['kairosdb_url'] +
                         "api/v1/datapoints/query")
ret = requests.post(
query_url,
data=json.dumps(
alert_config['query']),
timeout=service_config['timeout'])
assert ret.status_code == 200
# GOT DATA BACK, NOW TO COMPARE IT TO THE THRESHOLD
results = ret.json()['queries'][0]['results']
logger.debug(
"Got back {} results for alert {}".format(
len(results), alert_config['id']))
log_alert_results(results, alert_config, logger)
alert_list = []
# LOOP THROUGH ALL THE RESULTS
for r in results:
alert_tags = (get_alert_tags(alert_config, r)
if has_custom_alert_routing(alert_config) else None)
# OUR QUERY RETURNED SOME VALUES - FIND MIN AND MAX VALUES
# THEREIN AND EXAMINE FOR FAILURE
if r['values']:
minvalue = min([x[1] for x in r['values']])
maxvalue = max([x[1] for x in r['values']])
# SEND VALUES TO BUILD_ALERT_MESSAGE, WHICH RETURNS NONE OR
# AN OBJECT
alert_list.append(
build_alert_message(
alert_config,
minvalue,
maxvalue,
r,
logger,
availability,
alert_tags=alert_tags))
# THIS MEANS OUR KAIROS QUERY RETURNED NOTHING. COULD BE NETWORK
# ISSUES. WE WILL TOLERATE THIS FOR X OCCURRENCES. (X=10)
# AFTER X OCCURRENCES OF KAIROS NOT RETURNING DATA WE WILL CLEAR
# AOM'S BRAIN FOR THIS ALERT ID AND TAG COMBINATION TO AVOID A
# LATER OCCURRENCE CAUSING A PREMATURE ALERT.
# A NO-OP IF NO HISTORY.
elif 'alert_tags' in alert_config:
for key in alert_config['alert_tags']:
if ('count' not in key and 'noresult' not in key and
alert_config['alert_tags'][key] > 0):
key_noresult = key + "_noresult"
key_count = key + "_count"
if alert_config['alert_tags'][key_noresult] > 10:
logger.info("{} occurrences of no results back "
"for {}, clear out counts for tag '{}'".format(
alert_config['alert_tags'][key_noresult],
alert_config['id'], key))
alert_list.append(
build_alert_message(
alert_config,
0,
0,
None,
logger,
availability,
key,
alert_tags=alert_tags))
alert_config['alert_tags'][key] = 0
alert_config['alert_tags'][key_count] = 0
alert_config['alert_tags'][key_noresult] = 0
else:
alert_config['alert_tags'][key_noresult] += 1
logger.info("{} occurrences of no results back "
"for {}, tag '{}'".format(
alert_config['alert_tags'][key_noresult],
alert_config['id'], key))
            # SEND ALL ALERTS FOUND (THAT ARE NOT NONE) TO THE ALERT HANDLERS
            alert_list = [x for x in alert_list if x is not None]
set_firing(alert_config['id'], alert_list)
clear_suppressed(alert_config, [alert[3] for alert in alert_list])
            for alert in alert_list:
                if production_mode:
send_alerts(
alert,
copy.deepcopy(alert_config),
service_config['victorops_url'],
service_config['slack_url'],
service_config['slack_token'],
service_config['smtp_server'],
service_config['sensu_endpoint'],
service_config['uchiwa_url'],
logger)
                else:
                    logger.info(
                        "Production mode off, not sending alert for: {}".format(
                            alert_config.get('id')))
# HANDLE THE UNEXPECTED
        except (TimeoutError, requests.exceptions.Timeout):
            logger.error("Query [{}] took too long to run".format(
                alert_config['id']))
except AssertionError:
            logger.error(
                "KairosDB query failed: {}\n"
"HTTP status code:\t{}\n"
"Error Message:\t{}\nQuery:\n"
"{}".format(
ret.url,
ret.status_code,
ret.text,
alert_config['query']))
except gaierror:
logger.error(
"Unable to connect to smtp server: {}".format(
service_config['smtp_server']))
        except SuppressedException as e:
            logger.warning(
"Skipping alert check {} as it's suppressed: {}".format(
alert_config['id'],
e
))
except Exception as e:
logger.error(
"Unhandled exception {} on alert: {}".format(
str(e), alert_config['id']))
finally:
sleep(alert_config['interval'])
def stringify_alert_tags(alert_tags):
    """Return the datacenter value from the alert tags, or '-' if absent."""
    if not alert_tags:
        return "-"
    for i in alert_tags:
        if i.lower() == "dc" or i.lower() == "datacenter":
            return str(alert_tags[i])
    return "-"
def is_suppressed(alert_config, alert_tags):
    """Check whether this alert is suppressed because a dependency is firing."""
    ret = False
    for dependency in alert_config['resolvedDependencies'].getDependencies():
        if get_firing(dependency, alert_tags):
            ret = True
    if not is_within_threshold(alert_config, alert_tags):
        ret = False
    print("is_suppressed(", alert_config['id'], alert_tags, ") =", ret)
    return ret
def get_firing(id, alert_tags):
    """Return True if the given alert id / tag combination is marked firing."""
    firing = "{}\\{}".format(id, stringify_alert_tags(alert_tags))
    try:
        client = get_redis_client()
        status = client.get(firing)
        if status and hasattr(status, "decode"):
            status = status.decode()
        status = status if status else "ok"
        return status != "ok"
    except Exception as e:
        print(e)
        return False
def get_redis_client():
    """Return a Redis client, or a no-op MockRedis if Redis is unavailable."""
    try:
        client = redis.Redis()
        return client
    except Exception:
        return MockRedis()
class MockRedis():
    """No-op stand-in used when a real Redis connection cannot be created."""
    def __init__(self):
        return
    def get(self, key):
        return None
    def set(self, key, value):
        return
    def delete(self, key):
        return
    def keys(self, pattern):
        return []
    def call(self, *args, **kwargs):
        return []
def set_firing(id, active_fires):
    """Sync the firing state in Redis with the alerts that are active now."""
    prefix = "{}\\".format(id)
    previously_firing = list_firing(prefix)
    should_fire = []
    for active_fire in active_fires:
        alert_tags = active_fire[3]
        key = "{}{}".format(prefix, stringify_alert_tags(alert_tags))
        should_fire.append(key)
    for i in previously_firing:
        if i not in should_fire:
            set_not_firing(i)
    for i in should_fire:
        if i not in previously_firing:
            set_is_firing(i)
def list_firing(prefix):
    """List the Redis keys currently marked as firing under this prefix."""
    try:
        client = get_redis_client()
        keys = client.keys(prefix + "*")
        return [k.decode() if hasattr(k, "decode") else k for k in keys]
    except Exception:
        return []
def set_not_firing(id):
    """Clear the firing marker for this key in Redis."""
    client = get_redis_client()
    client.delete(id)
def set_is_firing(id):
    """Mark this key as firing in Redis."""
    client = get_redis_client()
    client.set(id, "bad")
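# ====================
# SKETCH OF THE REDIS FIRING-STATE SCHEME USED ABOVE (ILLUSTRATIVE ONLY).
# EACH ALERT ID / DATACENTER PAIR MAPS TO ONE KEY, AND ANY NON-"ok" VALUE
# MEANS THE ALERT IS CURRENTLY FIRING. THE KEY BELOW IS MADE UP.
#
# set_is_firing("checkout_error_rate\\ORD1")           # STORES "bad"
# get_firing("checkout_error_rate", {"dc": "ORD1"})    # -> True WHILE FIRING
# set_not_firing("checkout_error_rate\\ORD1")          # DELETES THE KEY
# ====================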
def is_within_threshold(alert_config, alert_tags):
    """Return True while the suppressed-occurrence count is below threshold."""
    count = inc_suppressed(alert_config, alert_tags)
    # DEFAULT TO AN EFFECTIVELY UNLIMITED THRESHOLD IF NONE IS CONFIGURED
    threshold = alert_config.get('suppressed_occurrences_threshold',
                                 9000000000000)
    return count < threshold
def inc_suppressed(alert_config, alert_tags):
    """Increment and return the suppressed-occurrence count for these tags."""
    key = stringify_alert_tags(alert_tags)
    if 'suppressed_occurrences' not in alert_config:
        alert_config['suppressed_occurrences'] = {}
    if key not in alert_config['suppressed_occurrences']:
        alert_config['suppressed_occurrences'][key] = 0
    alert_config['suppressed_occurrences'][key] += 1
    return alert_config['suppressed_occurrences'][key]
def clear_suppressed(alert_config, all_alert_tags):
    """Drop suppressed-occurrence counts for tags that are alerting again."""
    for alert_tags in all_alert_tags:
        key = stringify_alert_tags(alert_tags)
        if 'suppressed_occurrences' not in alert_config:
            continue
        if key not in alert_config['suppressed_occurrences']:
            continue
        del alert_config['suppressed_occurrences'][key]
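# ====================
# ILLUSTRATIVE SUPPRESSION FLOW (ASSUMES alert_config CARRIES
# resolvedDependencies WITH ONE DEPENDENCY THAT IS CURRENTLY FIRING; THE
# THRESHOLD VALUE BELOW IS MADE UP). WHILE A DEPENDENCY FIRES, is_suppressed
# RETURNS True UNTIL THE PER-TAG COUNT REACHES THE THRESHOLD, AFTER WHICH THE
# ALERT IS DELIVERED ANYWAY.
#
# alert_config['suppressed_occurrences_threshold'] = 3
# is_suppressed(alert_config, {'dc': 'ORD1'})   # COUNT 1 -> True (SUPPRESSED)
# is_suppressed(alert_config, {'dc': 'ORD1'})   # COUNT 2 -> True (SUPPRESSED)
# is_suppressed(alert_config, {'dc': 'ORD1'})   # COUNT 3 -> False (DELIVERED)
# ====================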
def check_prometheus_alert(
alert_config,
service_config,
logger,
production_mode=True):
    """
    Run the Prometheus check loop for a single alert.

    Args:
        alert_config (dict): Config of the alert to run
        service_config (dict): Holds things like urls, tokens and other things
        logger (log object): does the logging
        production_mode (bool): when False, only log instead of sending alerts

    Returns:
        None
    """
    from library.prom_api import PromAPI
# SLEEP A RANDOM TIME BETWEEN 0 AND INTERVAL SO THAT ALL ALERTS DON'T
# START AT THE SAME TIME
num_dependencies = len(alert_config['resolvedDependencies'].getDependencies())
wait_time = random.randint(num_dependencies, alert_config['interval'] + num_dependencies)
logger.info(
"ALERT_CONFIG: {}\tsleep: {}".format(
alert_config['id'],
wait_time))
sleep(wait_time)
# For metrics with availability set to true, we default the interval to 5
# mins due to Grafana limitations
availability = bool(alert_config.get('availability'))
# ====================
# EACH CHECK JUST LOOPS
# ====================
ret = None
while True:
try:
send_stat("check_run", 1, {'id': alert_config['id']})
prom_api = PromAPI(endpoint=alert_config['prometheus_url'])
ret = prom_api.query_range(
query=alert_config['query'],
start=alert_config['start_time'],
end=alert_config['end_time'],
duration=alert_config['interval'])
assert ret['status'] == 'success'
# GOT DATA BACK, NOW TO COMPARE IT TO THE THRESHOLD
results = ret['data']['result']
logger.debug(
"Got back {} results for alert {}".format(
len(results), alert_config['id']))
log_alert_results(results, alert_config, logger)
alert_list = []
# LOOP THROUGH ALL THE RESULTS
for r in results:
alert_tags = (get_alert_tags(alert_config, r) if
has_custom_alert_routing(alert_config) else None)
# REARRANGE RESULT TO MORE CLOSELY MATCH KAIROSDB RESULT
r['tags'] = {key: [value]
for (key, value) in r['metric'].items()}
# OUR QUERY RETURNED SOME VALUES - FIND MIN AND MAX VALUES
# THEREIN AND EXAMINE FOR FAILURE
if r['values']:
                    raw_values = [float(value) for _, value in r['values']]
                    min_value = min(raw_values)
                    max_value = max(raw_values)
# SEND VALUES TO BUILD_ALERT_MESSAGE, WHICH RETURNS NONE OR
# AN OBJECT
alert_list.append(
build_alert_message(
alert_config,
min_value,
max_value,
r,
logger,
availability,
alert_tags=alert_tags))
# THIS MEANS OUR QUERY RETURNED NOTHING. COULD BE NETWORK ISSUES
# WE WILL TOLERATE THIS FOR X OCCURRENCES. (X=10)
# AFTER X OCCURRENCES OF NOT RETURNING DATA WE WILL CLEAR AOM'S
# BRAIN FOR THIS ALERT ID AND TAG COMBINATION TO AVOID A LATER
# OCCURRENCE CAUSING A PREMATURE ALERT. A NO-OP IF NO HISTORY.
elif 'alert_tags' in alert_config:
for key in alert_config['alert_tags']:
if ('count' not in key and 'noresult' not in key and
alert_config['alert_tags'][key] > 0):
key_noresult = key + "_noresult"
key_count = key + "_count"
if alert_config['alert_tags'][key_noresult] > 10:
logger.info("{} occurrences of no results back "
"for {}, clear out counts for tag '{}'".format(
alert_config['alert_tags'][key_noresult],
alert_config['id'], key))
alert_list.append(
build_alert_message(
alert_config,
0,
0,
None,
logger,
availability,
key,
alert_tags=alert_tags))
alert_config['alert_tags'][key] = 0
alert_config['alert_tags'][key_count] = 0
alert_config['alert_tags'][key_noresult] = 0
else:
alert_config['alert_tags'][key_noresult] += 1
logger.info("{} occurrences of no results back "
"for {}, tag '{}'".format(
alert_config['alert_tags'][key_noresult],
alert_config['id'], key))
            # SEND ALL ALERTS FOUND (THAT ARE NOT NONE) TO THE ALERT HANDLERS
            alert_list = [x for x in alert_list if x is not None]
            set_firing(alert_config['id'], alert_list)
            clear_suppressed(alert_config, [alert[3] for alert in alert_list])
            for alert in alert_list:
                if production_mode:
send_alerts(
alert,
copy.deepcopy(alert_config),
service_config['victorops_url'],
service_config['slack_url'],
service_config['slack_token'],
service_config['smtp_server'],
service_config['sensu_endpoint'],
service_config['uchiwa_url'],
logger)
                else:
                    logger.info(
                        "Production mode off, not sending alert: {}".format(
                            alert_config.get('id')))
# HANDLE THE UNEXPECTED
        except TimeoutError:
            logger.error(
                "Query [{}] took too long to run".format(
                    alert_config['id']))
except AssertionError:
logger.error(
"Prometheus query failed:\n"
"Status:\t{}\n"
"Error Type:\t{}\n"
"Error Message:\t{}\n"
"Query:\n{}".format(
ret['status'],
ret['errorType'],
ret['error'],
alert_config['query']))
except gaierror:
logger.error(
"Unable to connect to smtp server: {}".format(
service_config['smtp_server']))
except Exception as e:
logger.error(
"Unhandled exception {} on alert: {}".format(
str(e), alert_config['id']))
finally:
sleep(alert_config['interval'])
# LOG ALERT RESULTS SO WE CAN DEBUG IF NEEDED
def log_alert_results(results, alert_config, logger):
"""
    Logs the results broken out by tag provided in the alert_config to the
    logger for debugging
    Args:
        results: the results object returned from the query (KairosDB or
            Prometheus), narrowed down to just the results list
alert_config: config object of the alert
logger (log object): does the logging
Returns:
None, logs to logger
"""
for v in results:
logger.debug("{} - Result: {}".format(alert_config['id'], v))
def send_alerts(
alert,
alert_config,
victorops_url,
slack_url,
slack_token,
smtp_server,
sensu_endpoint,
uchiwa_url,
logger):
"""
Sends out the alerts to VO, Email, and/or Slack
Args:
        alert: the alert tuple:
            alert[0] == subject, alert[1] == body, alert[2] == alarm level,
            alert[3] == alert_tags, alert[4] == md5sum of the tag
        alert_config: the alert configuration object
        victorops_url: url to victorops
        slack_url: url to slack api calls
        slack_token: the token used for slack api calls
        smtp_server: the server to send mail messages to
        sensu_endpoint: url the sensu check results are posted to
        uchiwa_url: url of the uchiwa dashboard used in alert links
logger (log object): does the logging
Returns: None
"""
# GOING TO USE THIS FOR TAGGING SOME METRICS ABOUT WHAT ALERT CHANNEL WAS
# USED
tag_dict = dict()
tag_dict['alert'] = alert_config['id']
is_custom_alert_routing = has_custom_alert_routing(alert_config)
if is_custom_alert_routing:
alert_routing = alert_config.get('alert_routing_lookup', {})
alert_config['alerts'] = alert_routing.get(
alert[3], alert_config['alerts']['lookup']['default'])
    # once we move all alerts into Sensu, we don't need to do this
if 'filters' in alert_config:
logger.info(
"alert_status : {}, alert_config: {}".format(
alert[2], alert_config))
if 'slack_subdue' in alert_config['filters'] and alert[2] in (
1, 2) and alert_config['filters']['slack_subdue']:
            # unless the alert is critical we don't send it
            logger.info("Removed slack, alert_config: {}".format(alert_config))
alert_config['alerts'].pop('slack', None)
if ('victorops_subdue' in alert_config['filters'] and
alert[2] in (1, 2) and
alert_config['filters']['victorops_subdue']):
            # unless the alert is critical we don't send it
alert_config['alerts'].pop('vo', None)
logger.info("Removed vo, alert_config: {}".format(alert_config))
# ====================
# VICTOROPS HANDLING
# ====================
    if ('vo' in alert_config['alerts'] and
            not is_suppressed(alert_config, alert[3])):
for notify in alert_config['alerts']['vo']:
payload = dict(entity_id=alert[0],
message_type=alert_status[alert[2]],
state_message=alert[1])
r = None
try:
r = requests.post(
victorops_url + notify,
data=json.dumps(payload),
                    headers={
                        "Content-type": "application/json"})
assert r.status_code == 200
# Record a VO alert sent event
tag_dict['alert_channel_type'] = "VictorOps"
tag_dict['who'] = "vo:{}".format(notify)
send_stat("alert_channel", 1, tag_dict)
# logger.info("TestInfo: {} alert for {}".format(alert_status(alert[2]), alert[0]))
except AssertionError:
logger.error(
"Post to VO failed for {}\n{}:\t{}".format(
alert_config['id'], r.status_code, r.text))
except Exception as e:
logger.error("Unhandled exception for alert_id:{} "
"when posting to VO: {}".format(
alert_config['id'], str(e)))
# ====================
# EMAIL HANDLING
# ====================
    if 'email' in alert_config['alerts'] and alert[2] in (0, 1, 3):
msg = MIMEText(alert[1])
msg['Subject'] = '{} Status: {}'.format(
alert[0], alert_status[alert[2]])
msg['From'] = 'aom@qualtrics.com'
msg['To'] = ','.join(
[x + "@qualtrics.com" for x in alert_config['alerts']['email']])
try:
s = smtplib.SMTP(smtp_server)
s.send_message(msg)
s.quit()
# Record an Email alert sent event
tag_dict['alert_channel_type'] = "Email"
tag_dict['who'] = "email:{}".format(msg['To'])
send_stat("alert_channel", 1, tag_dict)
# logger.info("TestInfo: {} alert for {}".format(alert_status(alert[2]), alert[0]))
except Exception as e:
logger.error(
"Unhandled exception when sending mail for {} to {}\n{}".format(
alert_config['id'], smtp_server, str(e)))
# ====================
# SENSU HANDLING
# ====================
if 'sensu' in alert_config['alerts']:
# Dictionary with static values for Sensu
sensu_dict = {
'source': 'AOM',
'refresh': 3600,
'occurrences': 1,
'name': alert_config['id']+'__'+alert[4]}
# if alert[3]:
# logger.info(alert)
# sensu_dict['name'] = '_'.join(
# [alert_config['id']] + sorted(list(alert[3])))
if 'refresh' in alert_config:
sensu_dict['refresh'] = alert_config['refresh']
sensu_dict['interval'] = alert_config['interval']
sensu_dict['handlers'] = []
sensu_dict['dashboard'] = alert_config['url']
if 'dependencies' in alert_config['alerts']['sensu'].keys():
sensu_dict['dependencies'] = (alert_config['alerts']
['sensu']['dependencies'])
        if ('victorops' in alert_config['alerts']['sensu'].keys() and
                not is_suppressed(alert_config, alert[3])):
sensu_dict['handlers'].append("victorops")
sensu_dict['routing_key'] = (alert_config['alerts']
['sensu']['victorops'])
# # Leave this here until we have email support in Sensu
# if 'email' in alert_config['alerts']['sensu'].keys():
# sensu_dict['handlers'].append("email")
# # verify this option
# sensu_dict['email'] = alert_config['alerts']['sensu']['email']
if 'slack' in alert_config['alerts']['sensu'].keys():
sensu_dict['handlers'].append("slack")
sensu_dict['slack_channel'] = (
alert_config['alerts']['sensu']['slack'])
# Format alert message
sensu_dict['dashboard'] = (
"<{}|here> , Uchiwa: <{}?check={}|here> ".format(
alert_config['url'], uchiwa_url, alert_config['id']))
if 'jira' in alert_config['alerts']['sensu'].keys():
sensu_dict['handlers'].append("jira")
sensu_dict.update(alert_config['alerts']['sensu']['jira'])
if 'filters' in alert_config:
sensu_dict['filters'] = alert_config['filters']
        # MAP AOM ALARM LEVELS TO SENSU STATUS: 0 = OK, 1 = WARNING, 2 = CRITICAL
sensu_status = {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2}
sensu_dict['status'] = sensu_status[alert[2]]
sensu_dict['output'] = alert[1]
r = None
try:
user = os.environ['API_USER']
passwd = os.environ['API_PASS']
r = requests.post(
sensu_endpoint,
json.dumps(sensu_dict),
auth=(
user,
passwd))
assert r.status_code == 202
except AssertionError:
logger.error(
"Post to Sensu failed {}\n{}:\t{}".format(
alert_config['id'],
r.status_code,
r.text))
except Exception as e:
logger.error("Unhandled exception for alert_id:{} "
"when posting to Sensu: {}".format(
alert_config['id'], str(e)))
# ====================
# SLACK HANDLING - all Slack alerts will go through Sensu
# ====================
    if 'slack' in alert_config['alerts'] and alert[2] in (0, 1, 3):
refresh = alert_config.get('refresh', 3600)
dashboard = alert_config.get('url', '')
sensu_status = {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2}
sensu_dict2 = {'handlers': ['slack'],
'interval': alert_config['interval'],
'source': 'AOM',
'refresh': refresh,
'occurrences': 1,
'name': alert_config['id']+'__'+alert[4],
'dashboard': dashboard,
'status': sensu_status[alert[2]],
'output': alert[1]}
if is_custom_alert_routing:
sensu_dict2['name'] = '_'.join(
[alert_config['id']] + list(alert[3]))
sensu_dict2['dashboard'] = (
"<{}|here> , Uchiwa: <{}?check={}|here> ".format(
alert_config['url'], uchiwa_url, alert_config['id']))
for channel in alert_config['alerts']['slack']:
sensu_dict2['slack_channel'] = channel
r = None
try:
user = os.environ['API_USER']
passwd = os.environ['API_PASS']
r = requests.post(
sensu_endpoint,
json.dumps(sensu_dict2),
auth=(
user,
passwd))
assert r.status_code == 202
except AssertionError:
logger.error(
"Post to Sensu failed {}\n{}:\t{}".format(
alert_config['id'], r.status_code, r.text))
except Exception as e:
logger.error("Unhandled exception for alert_id:{} when posting"
"to Sensu: {}".format(alert_config['id'], str(e)))
# payload = dict(token=slack_token, channel=channel,
# text="{} Status: {}".format(alert[1], alert_status[alert[2]]))
# r = None
# try:
# r = requests.post(slack_url, data=payload)
# assert r.status_code == 200
# # Record an Slack alert sent event
# tag_dict['alert_channel_type'] = "Slack"
# tag_dict['who'] = "slack:{}".format(channel)
# send_stat("alert_channel", 1, tag_dict)
# # logger.info("TestInfo: {} alert for {}".format(alert_status(alert[2]), alert[0]))
# except AssertionError:
# logger.error("Post to Slack failed for {}\n{}:\t{}".format(alert_config['id'], r.status_code, r.text))
# except Exception as e:
# logger.error("Unhandled exception for alert_id:{} when posting to Slack: {}".format(alert_config['id'],
# str(e)))
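# ====================
# ROUGH SHAPE OF THE CHECK RESULT POSTED TO SENSU ABOVE (ILLUSTRATIVE ONLY;
# THE NAME, CHANNEL AND OUTPUT ARE MADE UP). status IS THE MAPPED 0/1/2
# VALUE AND output IS THE BODY BUILT IN build_alert_message.
#
# {
#     "source": "AOM",
#     "name": "checkout_error_rate__0a1b2c3d4e",
#     "interval": 60,
#     "refresh": 3600,
#     "occurrences": 1,
#     "handlers": ["slack"],
#     "slack_channel": "#team-alerts",
#     "status": 2,
#     "output": "Metric: checkout_error_rate for ORD1\n12.50 >= 10\n<url>",
# }
# ====================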
def send_metrics(alert, value, result, gaugename='stats'):
"""
Sends the results from the alert check to statsd
    Args:
        alert: The alert config object that holds the alert['tags'] value.
        value: The value we want to send as a gauge.
        result: The result object from making the call. Use the data in this
            object to tag the metric.
        gaugename: The name of the gauge metric we send (default 'stats').
    Returns: None
"""
# GROUP ALL THE ALERTS TOGETHER SO THAT PEEPS CAN FILTER OUT BY TAG THEIR
# SPECIFIC ALERTS
    result_tags = list(itertools.chain(
        *[result['tags'][x] for x in alert['tags']]))
    tag_dict = dict()
    for tag_name, tag_value in zip(alert['tags'], result_tags):
        tag_dict[tag_name] = tag_value
tag_dict['alert'] = alert['id']
# SEND THE METRIC
send_stat(gaugename, value, tag_dict)
def send_stat(gaugename, value, tag_dict, statprefix='aom'):
"""Sends stats value to statsd"""
client = StatsClient('telegraf', 8125, statprefix)
# SUBMIT STATS
client.gauge(gaugename, value, tags=tag_dict)
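# ====================
# EXAMPLE USE OF send_stat (ILLUSTRATIVE ONLY; THE ALERT ID IS MADE UP).
# EVERY GAUGE GOES TO THE 'telegraf' STATSD LISTENER UNDER THE 'aom' PREFIX
# WITH THE SUPPLIED TAGS ATTACHED.
#
# send_stat("check_run", 1, {'id': 'checkout_error_rate'})
# send_stat("upper_critical_threshold", 10, {'id': 'checkout_error_rate'})
# ====================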
def has_custom_alert_routing(alert_config):
"""Checks if alert has custom routing"""
return 'lookup' in alert_config['alerts']
def get_alert_tags(alert_config, query_result):
"""Retrieves custom tags from alert"""
query_tags = {}
for tag in alert_config['alerts']['lookup']['tags']:
if (alert_config.get('query_type') == 'prometheus' and
'metric' in query_result and
tag in query_result['metric']):
query_tags[tag] = query_result['metric'][tag]
        elif ('tags' in query_result and tag in query_result['tags']
                and query_result['tags'][tag]):
            query_tags[tag] = query_result['tags'][tag][0]
return query_tags
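# ====================
# ILLUSTRATIVE SHAPE OF A CUSTOM-ROUTING ('lookup') ALERTS BLOCK, INFERRED
# FROM has_custom_alert_routing / get_alert_tags / send_alerts. THE CHANNEL
# NAMES BELOW ARE MADE UP.
#
# 'alerts': {
#     'lookup': {
#         'tags': ['dc'],                          # TAGS USED TO PICK A ROUTE
#         'default': {'slack': ['#team-alerts']},  # FALLBACK ROUTING
#     },
# },
# ====================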
class SuppressedException(Exception):
    """Raised when an alert check is skipped because it is suppressed."""