# QVolution2019.2/AoM_Service/library/serviceapp/service.py
""" Alert On Metrics functions"""
import copy
import itertools
import json
import os
import random
import smtplib
from email.mime.text import MIMEText
from socket import gaierror
from time import sleep
from hashlib import md5
import requests
from statsd import StatsClient
from serviceapp.prom_api import PromAPI
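# alert_status is indexed by the per-tag alert state used throughout this
# module (see the state table in build_alert_message): 0 == recovery /
# no alert, 1-2 == new / existing warning, 3-5 == critical. e.g.
# alert_status[3] -> 'CRITICAL'.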
alert_status = [
'RECOVERY',
'WARNING',
'WARNING',
'CRITICAL',
'CRITICAL',
'CRITICAL']
def build_alert_message(alert, minvalue, maxvalue, result, logger,
availability, tag=None, alert_tags=None):
"""
Build the alert message
Args:
alert: the alert object that includes a tag definition
minvalue: the min value to test against the threshold
maxvalue: the max value to test against the threshold
result: the response back from kairosdb
logger (log object): does the logging
availability: if True, also emit the availability ('service_level') stat
tag: If passed in will use this value for the tag instead of
getting it from the result object
alert_tags: the tags corresponding to the result, used if an
alert has to be triggered and a custom routing per tag is configured
Returns:
Alert message string
"""
# DEFAULT TO MAX VALUE AS THE VALUE WE WILL ALERT ON. LOGIC BELOW
# MAY CHANGE THIS.
# value = maxvalue
# # HANDLE THE CASE WHERE SOMEONE HAS NOT SPECIFIED ANY TAGS IN THEIR QUERY
# # (USUALLY A GLOBAL ALL-DC QUERY)
# if tag is None and result is not None:
# tag = ', '.join(sorted(list(itertools.chain(
# *[result['tags'][x] for x in alert['tags']]))))
# tag_count = tag + "_count"
# WE WILL USE THIS ONE LATER FOR TRACKING OCCURRENCES OF KAIROSDB NOT
# RETURNING RESULTS
# tag_noresult = tag + "_noresult"
# if not tag:
# tag = 'instance'
# logger.debug("No tag specified for alert {}".format(alert['id']))
# INSTEAD OF TRYING TO HANDLE LOGIC WHERE THESE ARE NOT IN THE OBJECT, PUT
# THEM IN AS SOON AS THEY ARE CREATED SO THAT ON FIRST RUN AN ALERT HAS ALL
# THE ALERT['alert_tags'][TAG] AND ALERT['alert_tags'][TAG_COUNT] NEEDED
# if 'alert_tags' not in alert:
# alert['alert_tags'] = {}
# if tag not in alert['alert_tags']:
# alert['alert_tags'][tag] = 0
# if tag_count not in alert['alert_tags']:
# alert['alert_tags'][tag_count] = 0
# IF WE HIT THIS FUNCTION THEN WE ALWAYS SET (OR RESET) THIS NORESULT
# COUNTER TO 0 IE. IF WE ARE HERE IT IMPLIES WE HAVE A RESULT FROM
# KAIROSDB OR WE ARE AT THE END OF A LONG PERIOD OF NORESULTS WHERE WE ARE
# CLEARING EVERYTHING OUT ANYWAY
# alert['alert_tags'][tag_noresult] = 0
# # FIRST FIND OUT WHAT THRESHOLDS ARE SET AND HAVE BEEN BREACHED
# upper_critical_threshold = None
# upper_warning_threshold = None
# lower_warning_threshold = None
# lower_critical_threshold = None
# upper_threshold = None
# lower_threshold = None
# is_warning_alarm = False
# is_critical_alarm = False
# # UPPER
# upper_threshold_exists = False
# upper_warning_threshold_breached = False
# upper_critical_threshold_breached = False
# if 'warning_upper_threshold' in alert:
# upper_threshold_exists = True
# upper_warning_threshold = alert['warning_upper_threshold']
# upper_threshold = upper_warning_threshold
# if maxvalue >= upper_warning_threshold:
# upper_warning_threshold_breached = True
# is_warning_alarm = True
# if 'critical_upper_threshold' in alert:
# upper_critical_threshold = alert['critical_upper_threshold']
# if not upper_threshold_exists:
# upper_threshold = upper_critical_threshold
# upper_threshold_exists = True
# # IF CONFIG HAS A CRITICAL THRESHOLD SET AND WE PASS THAT THEN THAT IS
# # OUR THRESHOLD FOR ALERTING
# if maxvalue >= alert['critical_upper_threshold']:
# upper_threshold = upper_critical_threshold
# upper_critical_threshold_breached = True
# is_critical_alarm = True
# upper_threshold_breached = (upper_warning_threshold_breached
# or upper_critical_threshold_breached)
# # LOWER
# lower_threshold_exists = False
# lower_warning_threshold_breached = False
# lower_critical_threshold_breached = False
# if 'warning_lower_threshold' in alert:
# lower_threshold_exists = True
# lower_warning_threshold = alert['warning_lower_threshold']
# lower_threshold = lower_warning_threshold
# if minvalue <= lower_warning_threshold:
# lower_warning_threshold_breached = True
# is_warning_alarm = True
# if 'critical_lower_threshold' in alert:
# lower_critical_threshold = alert['critical_lower_threshold']
# if not lower_threshold_exists:
# lower_threshold = lower_critical_threshold
# lower_threshold_exists = True
# # IF CONFIG HAS A CRITICAL THRESHOLD SET AND WE PASS THAT THEN THAT IS
# # OUR THRESHOLD FOR ALERTING
# if minvalue <= lower_critical_threshold:
# lower_threshold = lower_critical_threshold
# lower_critical_threshold_breached = True
# is_critical_alarm = True
# lower_threshold_breached = (lower_warning_threshold_breached or
# lower_critical_threshold_breached)
# # THIS HAS TO MEAN THERE IS A PROBLEM WITH THE ALERT CONFIG
# if lower_threshold is None and upper_threshold is None:
# logger.debug(
# "ERROR: alert {} does not have any thresholds set on {}".format(
# alert['id'], tag))
# # ON TO OCCURRENCES
# if 'occurrences_threshold' in alert:
# occurrences_threshold = alert['occurrences_threshold']
# else:
# occurrences_threshold = 1
# alert_entity = "Metric: {} for {}".format(alert['id'], tag)
# if 'url' not in alert:
# alert['url'] = os.environ['AOM_GRAFANA_URL'] + str(alert['id'])
# ====================
# PREPARE ALERT BODY STRING AND SET THE VALUE WE WILL USE TO ALERT WITH
# ====================
# alert_body = ''
# if upper_threshold_breached:
# alert_body = "{}\n{:.2f} >= {}\n{}".format(
# alert_entity, value, upper_threshold, alert['url'])
# if lower_threshold_breached:
# value = minvalue
# alert_body = "{}\n{:.2f} <= {}\n{}".format(
# alert_entity, value, lower_threshold, alert['url'])
# SEND SOME STATS OUT AT THIS POINT AS WE KNOW WHERE WE ARE NOW. SEND THE
# THRESHOLDS TOO SO THEY CAN BE GRAPHED
### BREEL TODO ###
# if result is not None:
# send_metrics(alert, value, result)
# if 'critical_upper_threshold' in alert:
# send_stat('upper_critical_threshold', upper_critical_threshold,
# {'id': alert['id']})
# if 'warning_upper_threshold' in alert:
# send_stat('upper_warning_threshold', upper_warning_threshold,
# {'id': alert['id']})
# if 'critical_lower_threshold' in alert:
# send_stat('lower_critical_threshold', lower_critical_threshold,
# {'id': alert['id']})
# if 'warning_lower_threshold' in alert:
# send_stat('lower_warning_threshold', lower_warning_threshold,
# {'id': alert['id']})
# ====================
# APPLY OUR LOGIC TO MAKE SOME DECISIONS
# ====================
#current_alert_status = alert_status[0]
#if not lower_threshold_breached and not upper_threshold_breached:
# # if result is not None:
# # if lower_threshold_exists and not upper_threshold_exists:
# # alert_body = "{}\n{:.2f} > {}\n{}".format(
# # alert_entity, value, lower_threshold, alert['url'])
# # logger.debug("GOOD: alert {} is higher than lower threshold {}"
# # "for value {} on tag {}".format(
# # alert['id'], lower_threshold, value, tag))
# # if upper_threshold_exists and not lower_threshold_exists:
# # alert_body = "{}\n{:.2f} < {}\n{}".format(
# # alert_entity, value, upper_threshold, alert['url'])
# # logger.debug("GOOD: alert {} is below the upper threshold {} "
# # "for value {} on tag {}".format(
# # alert['id'], upper_threshold, value, tag))
# # if upper_threshold_exists and lower_threshold_exists:
# # alert_body = "{}\n{} < {:.2f} < {}\n{}".format(
# # alert_entity, lower_threshold, value, upper_threshold,
# # alert['url'])
# # logger.debug("GOOD: alert {} is between thresholds {} and {} "
# # "for value {} on tag {}".format(
# # alert['id'], upper_threshold, lower_threshold,
# # value, tag))
# # CHECK AND SEE IF TAG LOGIC IS SET, IE. WE WERE PREVIOUSLY IN ALARM
# # STATE
# #if alert['alert_tags'][tag] > 0:
# # if result is not None:
# # send_metrics(alert, 1, result, current_alert_status)
# # logger.info(
# # "TestInfo: RECOVERY: Clearing values for [{}] - {}".format(
# # alert['id'], tag))
# # if result is None:
# # alert_body = ("{} RECOVERY due to no results found from "
# # "KairosDB query. Recommend you manually validate"
# # "recovery.\n{}").format(
# # alert_entity, alert['url'])
# # alert['alert_tags'][tag] = 0
# # alert['alert_tags'][tag_count] = 0
# # if availability:
# # logger.info("Sending availability stat 1")
# # send_metrics(alert, 1, result, 'service_level')
# #else:
# # # WE RETURN NONE IF NO ALERT (EITHER RECOVERY OR WARNING OR
# # # CRITICAL) NEEDS TO BE FIRED
# # alert['alert_tags'][tag_count] = 0
# # if availability:
# # logger.info("Sending availability stat 1")
# # send_metrics(alert, 1, result, 'service_level')
# # return None
#else:
### BREEL WORKING HERE ###
# ====================
# SET KEY / VALUE FOR TAG ON ALERT
# 0 == No Alert
# 1 == Warning
# 2 == Existing Warning Alert
# 3 == New Critical
# 4+ == Existing Critical Alert
# ====================
# CHECK IF TAG_COUNT HAS BEEN SET, IF NOT SET IT, IF SO INCREMENT IT
# alert['alert_tags'][tag_count] += 1
# ALERT WON'T FIRE UNLESS THE TAG_COUNT HAS REACHED THE OCCURRENCES
# THRESHOLD (EITHER 1 OR WHATEVER WAS SET). ONCE THE ALERT HAS EXCEEDED
# OCCURRENCES, RETURN IT
# TODO this doesn't belong in Alert.py
#if alert['alert_tags'][tag_count] >= occurrences_threshold:
# # >= 4 MEANS THIS IS A KNOWN CRITICAL, SO NO-OP
# if alert['alert_tags'][tag] < 4:
# if is_warning_alarm and not is_critical_alarm:
# # THIS HANDLES GOING STRAIGHT FROM NORMAL TO WARNING LEVEL
# if alert['alert_tags'][tag] == 0:
# # NEW WARNING
# alert['alert_tags'][tag] = 1
# logger.info("TestInfo: WARNING (NEW): {} - {}".format(
# alert['id'], tag))
# else:
# # EXISTING WARNING
# alert['alert_tags'][tag] = 2
# logger.info("TestInfo: WARNING (EXISTING): {} - {}".format(
# alert['id'], tag))
# if is_critical_alarm:
# # THIS HANDLES GOING FROM WARNING LEVEL TO CRITICAL LEVEL
# if (alert['alert_tags'][tag] == 1 or
# alert['alert_tags'][tag] == 2):
# alert['alert_tags'][tag] = 3
# logger.info("TestInfo: CRITICAL (WAS WARNING): {} - {}".format(
# alert['id'], tag))
# else:
# # THIS HANDLES GOING STRAIGHT FROM NORMAL TO CRITICAL
# # LEVEL
# if alert['alert_tags'][tag] < 3:
# # NEW CRITICAL
# alert['alert_tags'][tag] = 3
# logger.info("TestInfo: CRITICAL (NEW): {} - {}".format(
# alert['id'], tag))
# else:
# # EXISTING CRITICAL
# alert['alert_tags'][tag] = 4
# logger.info("TestInfo: CRITICAL (EXISTING): {} - {}".format(
# alert['id'], tag))
# RECORD THE FACT THAT SOMETHING IS STILL IN ALARM STATE IN METRICS
# EVEN IF NOT ACTIVELY ALERTING ON IT
# #if is_critical_alarm:
# #current_alert_status = alert_status[3]
# #send_metrics(alert, 2, result, current_alert_status)
# #if availability:
# # logger.info("Sending availability stat 0")
# # send_metrics(alert, 0, result, 'service_level')
# #if is_warning_alarm and not is_critical_alarm:
# #current_alert_status = alert_status[1]
# #send_metrics(alert, 1, result, current_alert_status)
# #if availability:
# # logger.info("Sending availability stat 1")
# # send_metrics(alert, 1, result, 'service_level')
# logger.debug("{} alert for value {} of {} for tag {} has occurred "
# "{} times. Threshold is >= {} times.".format(
# current_alert_status,
# value,
# alert['id'],
# tag,
# alert['alert_tags'][tag_count],
# occurrences_threshold))
# else:
# # WE RETURN NONE IF NO ALERT (EITHER RECOVERY OR WARNING OR
# # CRITICAL) NEEDS TO BE FIRED
# logger.debug("Value {} of {} for tag {} has occurred {} time(s) < "
# "threshold of {}".format(
# value,
# alert['id'],
# tag,
# alert['alert_tags'][tag_count],
# occurrences_threshold))
# if availability:
# logger.info("Sending availability stat")
# send_metrics(alert, 1, result, 'service_level')
# return None
#logger.debug(
# "Alert {}->[{}]->{}, Occurrences={}".format(
# alert['id'], tag, current_alert_status,
# alert['alert_tags'][tag_count]))
#return alert_entity, alert_body, alert['alert_tags'][tag], alert_tags, md5(tag.encode('utf-8')).hexdigest()[:10]
def check_kairosdb_alert(
alert_config,
service_config,
logger,
production_mode=True):
"""
Args:
alert_config (dict): Config of the alert to run
service_config (dict): Holds things like urls, tokens and other things
logger (log object): does the logging
Returns:
None
"""
availability = False
# SLEEP A RANDOM TIME BETWEEN 0 AND INTERVAL SO THAT ALL ALERTS DON'T
# START AT THE SAME TIME
wait_time = random.randint(0, alert_config['interval'])
logger.info(
"ALERT_CONFIG: {}\tsleep: {}".format(
alert_config['id'],
wait_time))
sleep(wait_time)
# For metrics with availability set to true, we default the interval to 5
# mins due to Grafana limitations
if 'availability' in alert_config and alert_config['availability']:
availability = True
# ====================
# EACH CHECK JUST LOOPS
# ====================
ret = None
while True:
try:
send_stat("check_run", 1, {'id': alert_config['id']})
# BUILD URL FOR KAIROSDB METRICS AND QUERY FOR RESULTS
query_url = os.path.join(
service_config['kairosdb_url'],
"api/v1/datapoints/query")
ret = requests.post(
query_url,
data=json.dumps(
alert_config['query']),
timeout=service_config['timeout'])
assert ret.status_code == 200
# GOT DATA BACK, NOW TO COMPARE IT TO THE THRESHOLD
results = ret.json()['queries'][0]['results']
logger.debug(
"Got back {} results for alert {}".format(
len(results), alert_config['id']))
log_alert_results(results, alert_config, logger)
alert_list = []
# LOOP THROUGH ALL THE RESULTS
for r in results:
alert_tags = (get_alert_tags(alert_config, r)
if has_custom_alert_routing(alert_config) else None)
# OUR QUERY RETURNED SOME VALUES - FIND MIN AND MAX VALUES
# THEREIN AND EXAMINE FOR FAILURE
if r['values']:
minvalue = min([x[1] for x in r['values']])
maxvalue = max([x[1] for x in r['values']])
# SEND VALUES TO BUILD_ALERT_MESSAGE, WHICH RETURNS NONE OR
# AN OBJECT
alert_list.append(
build_alert_message(
alert_config,
minvalue,
maxvalue,
r,
logger,
availability,
alert_tags=alert_tags))
# THIS MEANS OUR KAIROS QUERY RETURNED NOTHING. COULD BE NETWORK
# ISSUES. WE WILL TOLERATE THIS FOR X OCCURRENCES. (X=10)
# AFTER X OCCURRENCES OF KAIROS NOT RETURNING DATA WE WILL CLEAR
# AOM'S BRAIN FOR THIS ALERT ID AND TAG COMBINATION TO AVOID A
# LATER OCCURRENCE CAUSING A PREMATURE ALERT.
# A NO-OP IF NO HISTORY.
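# The per-tag bookkeeping lives in alert_config['alert_tags'] as three
# keys per tag, e.g. (hypothetical tag 'dc1'):
#   {'dc1': 3, 'dc1_count': 5, 'dc1_noresult': 0}
# where 'dc1' holds the alert state, 'dc1_count' the consecutive breach
# count and 'dc1_noresult' the consecutive empty-result count.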
elif 'alert_tags' in alert_config:
for key in alert_config['alert_tags']:
if ('count' not in key and 'noresult' not in key and
alert_config['alert_tags'][key] > 0):
key_noresult = key + "_noresult"
key_count = key + "_count"
if alert_config['alert_tags'][key_noresult] > 10:
logger.info("{} occurrences of no results back "
"for {}, clear out counts for tag '{}'".format(
alert_config['alert_tags'][key_noresult],
alert_config['id'], key))
alert_list.append(
build_alert_message(
alert_config,
0,
0,
None,
logger,
availability,
key,
alert_tags=alert_tags))
alert_config['alert_tags'][key] = 0
alert_config['alert_tags'][key_count] = 0
alert_config['alert_tags'][key_noresult] = 0
else:
alert_config['alert_tags'][key_noresult] += 1
logger.info("{} occurrences of no results back "
"for {}, tag '{}'".format(
alert_config['alert_tags'][key_noresult],
alert_config['id'], key))
# SEND ALL ALERTS FOUND TO THE ALERT HANDLERS THAT ARE NOT NONE
for alert in [x for x in alert_list if x is not None]:
if production_mode:
send_alerts(
alert,
copy.deepcopy(alert_config),
service_config['victorops_url'],
service_config['slack_url'],
service_config['slack_token'],
service_config['smtp_server'],
service_config['sensu_endpoint'],
service_config['uchiwa_url'],
logger)
else:
logger.info(
"Sending alert for: {}".format(
alert_config.get('id')))
# HANDLE THE UNEXPECTED
except TimeoutError:
logger.error("Query [{}] took to long to run".format(
alert_config['id']))
except AssertionError:
logger.error(
"KairsoDB query failed: {}\n"
"HTTP status code:\t{}\n"
"Error Message:\t{}\nQuery:\n"
"{}".format(
ret.url,
ret.status_code,
ret.text,
alert_config['query']))
except gaierror:
logger.error(
"Unable to connect to smtp server: {}".format(
service_config['smtp_server']))
except Exception as e:
logger.error(
"Unhandled exception {} on alert: {}".format(
str(e), alert_config['id']))
finally:
sleep(alert_config['interval'])
def check_prometheus_alert(
alert_config,
service_config,
logger,
production_mode=True):
"""
Args:
alert_config (dict): Config of the alert to run
service_config (dict): Holds things like urls, tokens and other things
logger (log object): does the logging
Returns:
None
"""
# SLEEP A RANDOM TIME BETWEEN 0 AND INTERVAL SO THAT ALL ALERTS DON'T
# START AT THE SAME TIME
wait_time = random.randint(0, alert_config['interval'])
logger.info(
"ALERT_CONFIG: {}\tsleep: {}".format(
alert_config['id'],
wait_time))
sleep(wait_time)
# For metrics with availability set to true, we default the interval to 5
# mins due to Grafana limitations
availability = bool(alert_config.get('availability'))
# ====================
# EACH CHECK JUST LOOPS
# ====================
ret = None
while True:
try:
send_stat("check_run", 1, {'id': alert_config['id']})
prom_api = PromAPI(endpoint=alert_config['prometheus_url'])
ret = prom_api.query_range(
query=alert_config['query'],
start=alert_config['start_time'],
end=alert_config['end_time'],
duration=alert_config['interval'])
assert ret['status'] == 'success'
# GOT DATA BACK, NOW TO COMPARE IT TO THE THRESHOLD
results = ret['data']['result']
logger.debug(
"Got back {} results for alert {}".format(
len(results), alert_config['id']))
log_alert_results(results, alert_config, logger)
alert_list = []
# LOOP THROUGH ALL THE RESULTS
for r in results:
alert_tags = (get_alert_tags(alert_config, r) if
has_custom_alert_routing(alert_config) else None)
# REARRANGE RESULT TO MORE CLOSELY MATCH KAIROSDB RESULT
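# e.g. a Prometheus result with metric {'instance': 'host1:9100'} gets
# tags {'instance': ['host1:9100']} (hypothetical label and value)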
r['tags'] = {key: [value]
for (key, value) in r['metric'].items()}
# OUR QUERY RETURNED SOME VALUES - FIND MIN AND MAX VALUES
# THEREIN AND EXAMINE FOR FAILURE
if r['values']:
raw_values = [value for _, value in r['values']]
min_value = float(min(raw_values))
max_value = float(max(raw_values))
# SEND VALUES TO BUILD_ALERT_MESSAGE, WHICH RETURNS NONE OR
# AN OBJECT
alert_list.append(
build_alert_message(
alert_config,
min_value,
max_value,
r,
logger,
availability,
alert_tags=alert_tags))
# THIS MEANS OUR QUERY RETURNED NOTHING. COULD BE NETWORK ISSUES
# WE WILL TOLERATE THIS FOR X OCCURRENCES. (X=10)
# AFTER X OCCURRENCES OF NOT RETURNING DATA WE WILL CLEAR AOM'S
# BRAIN FOR THIS ALERT ID AND TAG COMBINATION TO AVOID A LATER
# OCCURRENCE CAUSING A PREMATURE ALERT. A NO-OP IF NO HISTORY.
elif 'alert_tags' in alert_config:
for key in alert_config['alert_tags']:
if ('count' not in key and 'noresult' not in key and
alert_config['alert_tags'][key] > 0):
key_noresult = key + "_noresult"
key_count = key + "_count"
if alert_config['alert_tags'][key_noresult] > 10:
logger.info("{} occurrences of no results back "
"for {}, clear out counts for tag '{}'".format(
alert_config['alert_tags'][key_noresult],
alert_config['id'], key))
alert_list.append(
build_alert_message(
alert_config,
0,
0,
None,
logger,
availability,
key,
alert_tags=alert_tags))
alert_config['alert_tags'][key] = 0
alert_config['alert_tags'][key_count] = 0
alert_config['alert_tags'][key_noresult] = 0
else:
alert_config['alert_tags'][key_noresult] += 1
logger.info("{} occurrences of no results back "
"for {}, tag '{}'".format(
alert_config['alert_tags'][key_noresult],
alert_config['id'], key))
# SEND ALL ALERTS FOUND TO THE ALERT HANDLERS THAT ARE NOT NONE
for alert in [x for x in alert_list if x is not None]:
if production_mode:
send_alerts(
alert,
copy.deepcopy(alert_config),
service_config['victorops_url'],
service_config['slack_url'],
service_config['slack_token'],
service_config['smtp_server'],
service_config['sensu_endpoint'],
service_config['uchiwa_url'],
logger)
else:
logger.info(
"Sending alert {}".format(
alert_config.get('id')))
# HANDLE THE UNEXPECTED
except TimeoutError:
logger.error(
"Query [{}] took to long to run".format(
alert_config['id']))
except AssertionError:
logger.error(
"Prometheus query failed:\n"
"Status:\t{}\n"
"Error Type:\t{}\n"
"Error Message:\t{}\n"
"Query:\n{}".format(
ret['status'],
ret['errorType'],
ret['error'],
alert_config['query']))
except gaierror:
logger.error(
"Unable to connect to smtp server: {}".format(
service_config['smtp_server']))
except Exception as e:
logger.error(
"Unhandled exception {} on alert: {}".format(
str(e), alert_config['id']))
finally:
sleep(alert_config['interval'])
# LOG ALERT RESULTS SO WE CAN DEBUG IF NEEDED
def log_alert_results(results, alert_config, logger):
"""
Logs each result for the alert in alert_config to the logger for
debugging
Args:
results: the list of results returned by the KairosDB or Prometheus
query
alert_config: config object of the alert
logger (log object): does the logging
Returns:
None, logs to logger
"""
for v in results:
logger.debug("{} - Result: {}".format(alert_config['id'], v))
def send_alerts(
alert,
alert_config,
victorops_url,
slack_url,
slack_token,
smtp_server,
sensu_endpoint,
uchiwa_url,
logger):
"""
Sends out the alerts to VO, Email, and/or Slack
Args:
alert: the alert tuple:
alert[0] == subject, alert[1] == body, alert[2] == alert status
(index into alert_status), alert[3] == alert_tags, alert[4] == md5sum
alert_config: the alert configuration object
victorops_url: url to victorops
slack_url: url to slack api calls
slack_token: the Slack API token
smtp_server: The server to send mail messages to
sensu_endpoint: the Sensu results endpoint alerts are posted to
uchiwa_url: the Uchiwa dashboard url used to build alert links
logger (log object): does the logging
Returns: None
"""
# GOING TO USE THIS FOR TAGGING SOME METRICS ABOUT WHAT ALERT CHANNEL WAS
# USED
tag_dict = dict()
tag_dict['alert'] = alert_config['id']
is_custom_alert_routing = has_custom_alert_routing(alert_config)
if is_custom_alert_routing:
alert_routing = alert_config.get('alert_routing_lookup', {})
alert_config['alerts'] = alert_routing.get(
alert[3], alert_config['alerts']['lookup']['default'])
# once we move all alerts into Sensu, we don't need to do this
if 'filters' in alert_config:
logger.info(
"alert_status : {}, alert_config: {}".format(
alert[2], alert_config))
if 'slack_subdue' in alert_config['filters'] and alert[2] in (
1, 2) and alert_config['filters']['slack_subdue']:
# unless the alert is critical we don't send it
logger.info("Removed slack, alert_config: {}".format(alert_config))
alert_config['alerts'].pop('slack', None)
if ('victorops_subdue' in alert_config['filters'] and
alert[2] in (1, 2) and
alert_config['filters']['victorops_subdue']):
# unless the alert is critical we don't send it
alert_config['alerts'].pop('vo', None)
logger.info("Removed vo, alert_config: {}".format(alert_config))
# ====================
# VICTOROPS HANDLING
# ====================
if 'vo' in alert_config['alerts']:
for notify in alert_config['alerts']['vo']:
payload = dict(entity_id=alert[0],
message_type=alert_status[alert[2]],
state_message=alert[1])
r = None
try:
r = requests.post(
victorops_url + notify,
data=json.dumps(payload),
headers={
"Content-type": "application-json"})
assert r.status_code == 200
# Record a VO alert sent event
tag_dict['alert_channel_type'] = "VictorOps"
tag_dict['who'] = "vo:{}".format(notify)
send_stat("alert_channel", 1, tag_dict)
# logger.info("TestInfo: {} alert for {}".format(alert_status(alert[2]), alert[0]))
except AssertionError:
logger.error(
"Post to VO failed for {}\n{}:\t{}".format(
alert_config['id'], r.status_code, r.text))
except Exception as e:
logger.error("Unhandled exception for alert_id:{} "
"when posting to VO: {}".format(
alert_config['id'], str(e)))
# ====================
# EMAIL HANDLING
# ====================
if 'email' in alert_config['alerts'] and alert[2] in (0, 1, 3):
msg = MIMEText(alert[1])
msg['Subject'] = '{} Status: {}'.format(
alert[0], alert_status[alert[2]])
msg['From'] = 'aom@qualtrics.com'
msg['To'] = ','.join(
[x + "@qualtrics.com" for x in alert_config['alerts']['email']])
try:
s = smtplib.SMTP(smtp_server)
s.send_message(msg)
s.quit()
# Record an Email alert sent event
tag_dict['alert_channel_type'] = "Email"
tag_dict['who'] = "email:{}".format(msg['To'])
send_stat("alert_channel", 1, tag_dict)
# logger.info("TestInfo: {} alert for {}".format(alert_status(alert[2]), alert[0]))
except Exception as e:
logger.error(
"Unhandled exception when sending mail for {} to {}\n{}".format(
alert_config['id'], smtp_server, str(e)))
# ====================
# SENSU HANDLING
# ====================
if 'sensu' in alert_config['alerts']:
# Dictionary with static values for Sensu
sensu_dict = {
'source': 'AOM',
'refresh': 3600,
'occurrences': 1,
'name': alert_config['id']+'__'+alert[4]}
# if alert[3]:
# logger.info(alert)
# sensu_dict['name'] = '_'.join(
# [alert_config['id']] + sorted(list(alert[3])))
if 'refresh' in alert_config:
sensu_dict['refresh'] = alert_config['refresh']
sensu_dict['interval'] = alert_config['interval']
sensu_dict['handlers'] = []
sensu_dict['dashboard'] = alert_config['url']
if 'dependencies' in alert_config['alerts']['sensu'].keys():
sensu_dict['dependencies'] = (alert_config['alerts']
['sensu']['dependencies'])
if 'victorops' in alert_config['alerts']['sensu'].keys():
sensu_dict['handlers'].append("victorops")
sensu_dict['routing_key'] = (alert_config['alerts']
['sensu']['victorops'])
# # Leave this here until we have email support in Sensu
# if 'email' in alert_config['alerts']['sensu'].keys():
# sensu_dict['handlers'].append("email")
# # verify this option
# sensu_dict['email'] = alert_config['alerts']['sensu']['email']
if 'slack' in alert_config['alerts']['sensu'].keys():
sensu_dict['handlers'].append("slack")
sensu_dict['slack_channel'] = (
alert_config['alerts']['sensu']['slack'])
# Format alert message
sensu_dict['dashboard'] = (
"<{}|here> , Uchiwa: <{}?check={}|here> ".format(
alert_config['url'], uchiwa_url, alert_config['id']))
if 'jira' in alert_config['alerts']['sensu'].keys():
sensu_dict['handlers'].append("jira")
sensu_dict.update(alert_config['alerts']['sensu']['jira'])
if 'filters' in alert_config:
sensu_dict['filters'] = alert_config['filters']
# Map AOM alert state (0-5) to Sensu status: 0 = OK, 1 = WARNING, 2 = CRITICAL
sensu_status = {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2}
sensu_dict['status'] = sensu_status[alert[2]]
sensu_dict['output'] = alert[1]
r = None
try:
user = os.environ['API_USER']
passwd = os.environ['API_PASS']
r = requests.post(
sensu_endpoint,
json.dumps(sensu_dict),
auth=(
user,
passwd))
assert r.status_code == 202
except AssertionError:
logger.error(
"Post to Sensu failed {}\n{}:\t{}".format(
alert_config['id'],
r.status_code,
r.text))
except Exception as e:
logger.error("Unhandled exception for alert_id:{} "
"when posting to Sensu: {}".format(
alert_config['id'], str(e)))
# ====================
# SLACK HANDLING - all Slack alerts will go through Sensu
# ====================
if 'slack' in alert_config['alerts'] and alert[2] in (0, 1, 3):
refresh = alert_config.get('refresh', 3600)
dashboard = alert_config.get('url', '')
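# Map AOM alert state (0-5) to Sensu status: 0 = OK, 1 = WARNING, 2 = CRITICAL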
sensu_status = {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2}
sensu_dict2 = {'handlers': ['slack'],
'interval': alert_config['interval'],
'source': 'AOM',
'refresh': refresh,
'occurrences': 1,
'name': alert_config['id']+'__'+alert[4],
'dashboard': dashboard,
'status': sensu_status[alert[2]],
'output': alert[1]}
if is_custom_alert_routing:
sensu_dict2['name'] = '_'.join(
[alert_config['id']] + list(alert[3]))
sensu_dict2['dashboard'] = (
"<{}|here> , Uchiwa: <{}?check={}|here> ".format(
alert_config['url'], uchiwa_url, alert_config['id']))
for channel in alert_config['alerts']['slack']:
sensu_dict2['slack_channel'] = channel
r = None
try:
user = os.environ['API_USER']
passwd = os.environ['API_PASS']
r = requests.post(
sensu_endpoint,
json.dumps(sensu_dict2),
auth=(
user,
passwd))
assert r.status_code == 202
except AssertionError:
logger.error(
"Post to Sensu failed {}\n{}:\t{}".format(
alert_config['id'], r.status_code, r.text))
except Exception as e:
logger.error("Unhandled exception for alert_id:{} when posting"
"to Sensu: {}".format(alert_config['id'], str(e)))
# payload = dict(token=slack_token, channel=channel,
# text="{} Status: {}".format(alert[1], alert_status[alert[2]]))
# r = None
# try:
# r = requests.post(slack_url, data=payload)
# assert r.status_code == 200
# # Record an Slack alert sent event
# tag_dict['alert_channel_type'] = "Slack"
# tag_dict['who'] = "slack:{}".format(channel)
# send_stat("alert_channel", 1, tag_dict)
# # logger.info("TestInfo: {} alert for {}".format(alert_status(alert[2]), alert[0]))
# except AssertionError:
# logger.error("Post to Slack failed for {}\n{}:\t{}".format(alert_config['id'], r.status_code, r.text))
# except Exception as e:
# logger.error("Unhandled exception for alert_id:{} when posting to Slack: {}".format(alert_config['id'],
# str(e)))
def send_metrics(alert, value, result, gaugename='stats'):
"""
Sends the results from the alert check to statsd
Args:
alert: The alert config object that holds the alert['tags'] list.
value: The value we want to send as a gauge.
result: The result object from making the call. Its tag data is used
to tag the metric.
gaugename: The name of the gauge metric we send.
Returns: None
"""
# GROUP ALL THE ALERTS TOGETHER SO THAT PEEPS CAN FILTER OUT BY TAG THEIR
# SPECIFIC ALERTS
result_tags = list(itertools.chain(
*[result['tags'][x] for x in alert['tags']]))
tag_dict = dict()
for x in range(len(alert['tags'])):
tag_dict[alert['tags'][x]] = result_tags[x]
tag_dict['alert'] = alert['id']
# SEND THE METRIC
send_stat(gaugename, value, tag_dict)
def send_stat(gaugename, value, tag_dict, statprefix='aom'):
"""Sends stats value to statsd"""
client = StatsClient('telegraf', 8125, statprefix)
# SUBMIT STATS
client.gauge(gaugename, value, tags=tag_dict)
def has_custom_alert_routing(alert_config):
"""Checks if alert has custom routing"""
return 'lookup' in alert_config['alerts']
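# A hypothetical sketch of the custom-routing config these helpers read.
# The 'lookup' and 'alert_routing_lookup' keys come from their use in
# send_alerts and get_alert_tags; tag names and channels are made up:
# alert_config['alerts']['lookup'] = {
#     'tags': ['datacenter'],                    # result tags that key the lookup
#     'default': {'slack': ['#team-alerts']},    # fallback channels
# }
# alert_config['alert_routing_lookup'] = {
#     ('dc1',): {'vo': ['dc1-routing-key']},
# }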
def get_alert_tags(alert_config, query_result):
"""Retrieves custom tags from alert"""
query_tags = []
for tag in alert_config['alerts']['lookup']['tags']:
if (alert_config.get('query_type') == 'prometheus' and
'metric' in query_result and
tag in query_result['metric']):
query_tags.append(query_result['metric'][tag])
elif ('tags' in query_result and tag in query_result['tags']
and query_result['tags'][tag]):
query_tags.append(query_result['tags'][tag][0])
return tuple(query_tags)