""" Alert On Metrics functions"""
|
|
|
|
import copy
|
|
import itertools
|
|
import json
|
|
import os
|
|
import random
|
|
import smtplib
|
|
from email.mime.text import MIMEText
|
|
from socket import gaierror
|
|
from time import sleep
|
|
from hashlib import md5
|
|
import requests
|
|
from statsd import StatsClient
|
|
from serviceapp.prom_api import PromAPI
|
|
|
|
alert_status = [
|
|
'RECOVERY',
|
|
'WARNING',
|
|
'WARNING',
|
|
'CRITICAL',
|
|
'CRITICAL',
|
|
'CRITICAL']
|
|
|
|
|
|
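
# NOTE (illustrative): the integer level kept per tag in alert['alert_tags']
# indexes into alert_status above, e.g. alert_status[1] == 'WARNING' and
# alert_status[3] == 'CRITICAL'. The levels follow the state machine sketched
# in build_alert_message below: 0 = no alert, 1 = new warning,
# 2 = existing warning, 3 = new critical, 4 = existing critical.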


def build_alert_message(alert, minvalue, maxvalue, result, logger,
                        availability, tag=None, alert_tags=None):
    """
    Build the alert message.

    Args:
        alert: the alert object that includes a tag definition
        minvalue: the min value to test against the threshold
        maxvalue: the max value to test against the threshold
        result: the response back from KairosDB
        logger (log object): does the logging
        availability: if True, also send an availability (service_level) stat
        tag: if passed in, use this value for the tag instead of
            deriving it from the result object
        alert_tags: the tags corresponding to the result; used if an alert
            has to be triggered and custom routing per tag is configured

    Returns:
        The alert tuple to send, or None if no alert needs to fire
    """
    # DEFAULT TO MAX VALUE AS THE VALUE WE WILL ALERT ON. LOGIC BELOW
    # MAY CHANGE THIS.
    # value = maxvalue
    # # HANDLE THE CASE WHERE SOMEONE HAS NOT SPECIFIED ANY TAGS IN THEIR QUERY
    # # (USUALLY A GLOBAL ALL-DC QUERY)
    # if tag is None and result is not None:
    #     tag = ', '.join(sorted(list(itertools.chain(
    #         *[result['tags'][x] for x in alert['tags']]))))
    # tag_count = tag + "_count"
    # WE WILL USE THIS ONE LATER FOR TRACKING OCCURRENCES OF KAIROSDB NOT
    # RETURNING RESULTS
    # tag_noresult = tag + "_noresult"
    # if not tag:
    #     tag = 'instance'
    #     logger.debug("No tag specified for alert {}".format(alert['id']))
    # INSTEAD OF TRYING TO HANDLE LOGIC WHERE THESE ARE NOT IN THE OBJECT, PUT
    # THEM IN AS SOON AS THEY ARE CREATED SO THAT ON FIRST RUN AN ALERT HAS ALL
    # THE ALERT['alert_tags'][TAG] AND ALERT['alert_tags'][TAG_COUNT] NEEDED
    # if 'alert_tags' not in alert:
    #     alert['alert_tags'] = {}
    # if tag not in alert['alert_tags']:
    #     alert['alert_tags'][tag] = 0
    # if tag_count not in alert['alert_tags']:
    #     alert['alert_tags'][tag_count] = 0
    # IF WE HIT THIS FUNCTION THEN WE ALWAYS SET (OR RESET) THIS NORESULT
    # COUNTER TO 0, IE. IF WE ARE HERE IT IMPLIES WE HAVE A RESULT FROM
    # KAIROSDB OR WE ARE AT THE END OF A LONG PERIOD OF NORESULTS WHERE WE ARE
    # CLEARING EVERYTHING OUT ANYWAY
    # alert['alert_tags'][tag_noresult] = 0

    # # FIRST FIND OUT WHAT THRESHOLDS ARE SET AND HAVE BEEN BREACHED
    # upper_critical_threshold = None
    # upper_warning_threshold = None
    # lower_warning_threshold = None
    # lower_critical_threshold = None
    # upper_threshold = None
    # lower_threshold = None
    # is_warning_alarm = False
    # is_critical_alarm = False

    # # UPPER
    # upper_threshold_exists = False
    # upper_warning_threshold_breached = False
    # upper_critical_threshold_breached = False
    # if 'warning_upper_threshold' in alert:
    #     upper_threshold_exists = True
    #     upper_warning_threshold = alert['warning_upper_threshold']
    #     upper_threshold = upper_warning_threshold
    #     if maxvalue >= upper_warning_threshold:
    #         upper_warning_threshold_breached = True
    #         is_warning_alarm = True
    # if 'critical_upper_threshold' in alert:
    #     upper_critical_threshold = alert['critical_upper_threshold']
    #     if not upper_threshold_exists:
    #         upper_threshold = upper_critical_threshold
    #         upper_threshold_exists = True
    #     # IF CONFIG HAS A CRITICAL THRESHOLD SET AND WE PASS THAT THEN THAT
    #     # IS OUR THRESHOLD FOR ALERTING
    #     if maxvalue >= alert['critical_upper_threshold']:
    #         upper_threshold = upper_critical_threshold
    #         upper_critical_threshold_breached = True
    #         is_critical_alarm = True
    # upper_threshold_breached = (upper_warning_threshold_breached
    #                             or upper_critical_threshold_breached)

    # # LOWER
    # lower_threshold_exists = False
    # lower_warning_threshold_breached = False
    # lower_critical_threshold_breached = False
    # if 'warning_lower_threshold' in alert:
    #     lower_threshold_exists = True
    #     lower_warning_threshold = alert['warning_lower_threshold']
    #     lower_threshold = lower_warning_threshold
    #     if minvalue <= lower_warning_threshold:
    #         lower_warning_threshold_breached = True
    #         is_warning_alarm = True
    # if 'critical_lower_threshold' in alert:
    #     lower_critical_threshold = alert['critical_lower_threshold']
    #     if not lower_threshold_exists:
    #         lower_threshold = lower_critical_threshold
    #         lower_threshold_exists = True
    #     # IF CONFIG HAS A CRITICAL THRESHOLD SET AND WE PASS THAT THEN THAT
    #     # IS OUR THRESHOLD FOR ALERTING
    #     if minvalue <= lower_critical_threshold:
    #         lower_threshold = lower_critical_threshold
    #         lower_critical_threshold_breached = True
    #         is_critical_alarm = True
    # lower_threshold_breached = (lower_warning_threshold_breached or
    #                             lower_critical_threshold_breached)

    # # THIS HAS TO MEAN THERE IS A PROBLEM WITH THE ALERT CONFIG
    # if lower_threshold is None and upper_threshold is None:
    #     logger.debug(
    #         "ERROR: alert {} does not have any thresholds set on {}".format(
    #             alert['id'], tag))

    # # ON TO OCCURRENCES
    # if 'occurrences_threshold' in alert:
    #     occurrences_threshold = alert['occurrences_threshold']
    # else:
    #     occurrences_threshold = 1

    # alert_entity = "Metric: {} for {}".format(alert['id'], tag)

    # if 'url' not in alert:
    #     alert['url'] = os.environ['AOM_GRAFANA_URL'] + str(alert['id'])

    # ====================
    # PREPARE ALERT BODY STRING AND SET THE VALUE WE WILL USE TO ALERT WITH
    # ====================
    # alert_body = ''
    # if upper_threshold_breached:
    #     alert_body = "{}\n{:.2f} >= {}\n{}".format(
    #         alert_entity, value, upper_threshold, alert['url'])
    # if lower_threshold_breached:
    #     value = minvalue
    #     alert_body = "{}\n{:.2f} <= {}\n{}".format(
    #         alert_entity, value, lower_threshold, alert['url'])

    # SEND SOME STATS OUT AT THIS POINT AS WE KNOW WHERE WE ARE NOW. SEND THE
    # THRESHOLDS TOO SO THEY CAN BE GRAPHED
    ### BREEL TODO ###
    # if result is not None:
    #     send_metrics(alert, value, result)
    # if 'critical_upper_threshold' in alert:
    #     send_stat('upper_critical_threshold', upper_critical_threshold,
    #               {'id': alert['id']})
    # if 'warning_upper_threshold' in alert:
    #     send_stat('upper_warning_threshold', upper_warning_threshold,
    #               {'id': alert['id']})
    # if 'critical_lower_threshold' in alert:
    #     send_stat('lower_critical_threshold', lower_critical_threshold,
    #               {'id': alert['id']})
    # if 'warning_lower_threshold' in alert:
    #     send_stat('lower_warning_threshold', lower_warning_threshold,
    #               {'id': alert['id']})
    # ====================
    # APPLY OUR LOGIC TO MAKE SOME DECISIONS
    # ====================
    # current_alert_status = alert_status[0]
    # if not lower_threshold_breached and not upper_threshold_breached:
    #     # if result is not None:
    #     #     if lower_threshold_exists and not upper_threshold_exists:
    #     #         alert_body = "{}\n{:.2f} > {}\n{}".format(
    #     #             alert_entity, value, lower_threshold, alert['url'])
    #     #         logger.debug("GOOD: alert {} is higher than lower "
    #     #                      "threshold {} for value {} on tag {}".format(
    #     #                          alert['id'], lower_threshold, value, tag))
    #     #     if upper_threshold_exists and not lower_threshold_exists:
    #     #         alert_body = "{}\n{:.2f} < {}\n{}".format(
    #     #             alert_entity, value, upper_threshold, alert['url'])
    #     #         logger.debug("GOOD: alert {} is below the upper threshold "
    #     #                      "{} for value {} on tag {}".format(
    #     #                          alert['id'], upper_threshold, value, tag))
    #     #     if upper_threshold_exists and lower_threshold_exists:
    #     #         alert_body = "{}\n{} < {:.2f} < {}\n{}".format(
    #     #             alert_entity, lower_threshold, value, upper_threshold,
    #     #             alert['url'])
    #     #         logger.debug("GOOD: alert {} is between thresholds {} and "
    #     #                      "{} for value {} on tag {}".format(
    #     #                          alert['id'], upper_threshold,
    #     #                          lower_threshold, value, tag))
    #     # CHECK AND SEE IF TAG LOGIC IS SET, IE. WE WERE PREVIOUSLY IN ALARM
    #     # STATE
    #     # if alert['alert_tags'][tag] > 0:
    #     #     if result is not None:
    #     #         send_metrics(alert, 1, result, current_alert_status)
    #     #     logger.info(
    #     #         "TestInfo: RECOVERY: Clearing values for [{}] - {}".format(
    #     #             alert['id'], tag))
    #     #     if result is None:
    #     #         alert_body = ("{} RECOVERY due to no results found from "
    #     #                       "KairosDB query. Recommend you manually "
    #     #                       "validate recovery.\n{}").format(
    #     #                           alert_entity, alert['url'])
    #     #     alert['alert_tags'][tag] = 0
    #     #     alert['alert_tags'][tag_count] = 0
    #     #     if availability:
    #     #         logger.info("Sending availability stat 1")
    #     #         send_metrics(alert, 1, result, 'service_level')
    #     # else:
    #     #     # WE RETURN NONE IF NO ALERT (EITHER RECOVERY OR WARNING OR
    #     #     # CRITICAL) NEEDS TO BE FIRED
    #     #     alert['alert_tags'][tag_count] = 0
    #     #     if availability:
    #     #         logger.info("Sending availability stat 1")
    #     #         send_metrics(alert, 1, result, 'service_level')
    #     #     return None
    # else:
    ### BREEL WORKING HERE ###
    # ====================
    # SET KEY / VALUE FOR TAG ON ALERT
    # 0 == No Alert
    # 1 == Warning
    # 2 == Existing Warning Alert
    # 3 == New Critical
    # 4+ == Existing Critical Alert
    # ====================
    # CHECK IF TAG_COUNT HAS BEEN SET; IF NOT, SET IT, OTHERWISE INCREMENT IT
    # alert['alert_tags'][tag_count] += 1

    # ALERT WON'T FIRE UNLESS THE TAG_COUNT IS MORE THAN THE OCCURRENCES,
    # THAT BEING EITHER 1 OR WHATEVER WAS SET. ALERT HAS EXCEEDED
    # OCCURRENCES SO RETURN IT
    # TODO this doesn't belong in Alert.py
    # if alert['alert_tags'][tag_count] >= occurrences_threshold:
    #     # >= 4 MEANS THIS IS A KNOWN CRITICAL, SO NO-OP
    #     if alert['alert_tags'][tag] < 4:
    #         if is_warning_alarm and not is_critical_alarm:
    #             # THIS HANDLES GOING STRAIGHT FROM NORMAL TO WARNING LEVEL
    #             if alert['alert_tags'][tag] == 0:
    #                 # NEW WARNING
    #                 alert['alert_tags'][tag] = 1
    #                 logger.info("TestInfo: WARNING (NEW): {} - {}".format(
    #                     alert['id'], tag))
    #             else:
    #                 # EXISTING WARNING
    #                 alert['alert_tags'][tag] = 2
    #                 logger.info("TestInfo: WARNING (EXISTING): "
    #                             "{} - {}".format(alert['id'], tag))
    #         if is_critical_alarm:
    #             # THIS HANDLES GOING FROM WARNING LEVEL TO CRITICAL LEVEL
    #             if (alert['alert_tags'][tag] == 1 or
    #                     alert['alert_tags'][tag] == 2):
    #                 alert['alert_tags'][tag] = 3
    #                 logger.info("TestInfo: CRITICAL (WAS WARNING): "
    #                             "{} - {}".format(alert['id'], tag))
    #             else:
    #                 # THIS HANDLES GOING STRAIGHT FROM NORMAL TO CRITICAL
    #                 # LEVEL
    #                 if alert['alert_tags'][tag] < 3:
    #                     # NEW CRITICAL
    #                     alert['alert_tags'][tag] = 3
    #                     logger.info("TestInfo: CRITICAL (NEW): "
    #                                 "{} - {}".format(alert['id'], tag))
    #                 else:
    #                     # EXISTING CRITICAL
    #                     alert['alert_tags'][tag] = 4
    #                     logger.info("TestInfo: CRITICAL (EXISTING): "
    #                                 "{} - {}".format(alert['id'], tag))
    #     # RECORD THE FACT THAT SOMETHING IS STILL IN ALARM STATE IN METRICS
    #     # EVEN IF NOT ACTIVELY ALERTING ON IT
    #     # if is_critical_alarm:
    #     #     current_alert_status = alert_status[3]
    #     #     send_metrics(alert, 2, result, current_alert_status)
    #     #     if availability:
    #     #         logger.info("Sending availability stat 0")
    #     #         send_metrics(alert, 0, result, 'service_level')
    #     # if is_warning_alarm and not is_critical_alarm:
    #     #     current_alert_status = alert_status[1]
    #     #     send_metrics(alert, 1, result, current_alert_status)
    #     #     if availability:
    #     #         logger.info("Sending availability stat 1")
    #     #         send_metrics(alert, 1, result, 'service_level')
    #     logger.debug("{} alert for value {} of {} for tag {} has occurred "
    #                  "{} times. Threshold is >= {} times.".format(
    #                      current_alert_status, value, alert['id'], tag,
    #                      alert['alert_tags'][tag_count],
    #                      occurrences_threshold))
    # else:
    #     # WE RETURN NONE IF NO ALERT (EITHER RECOVERY OR WARNING OR
    #     # CRITICAL) NEEDS TO BE FIRED
    #     logger.debug("Value {} of {} for tag {} has occurred {} time(s) < "
    #                  "threshold of {}".format(
    #                      value, alert['id'], tag,
    #                      alert['alert_tags'][tag_count],
    #                      occurrences_threshold))
    #     if availability:
    #         logger.info("Sending availability stat")
    #         send_metrics(alert, 1, result, 'service_level')
    #     return None

    # logger.debug(
    #     "Alert {}->[{}]->{}, Occurrences={}".format(
    #         alert['id'], tag, current_alert_status,
    #         alert['alert_tags'][tag_count]))
    # return (alert_entity, alert_body, alert['alert_tags'][tag], alert_tags,
    #         md5(tag.encode('utf-8')).hexdigest()[:10])
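
# NOTE (illustrative): with its body commented out above, build_alert_message
# currently always returns None. The commented-out return shows the tuple
# shape its callers (via send_alerts) expect once it is re-enabled, roughly:
#
#     ("Metric: my_alert for iad1",         # alert[0]: entity / subject
#      "Metric: my_alert for iad1\n95.00 >= 90\n<grafana url>",  # alert[1]
#      3,                                    # alert[2]: level into alert_status
#      ('iad1',),                            # alert[3]: alert_tags for routing
#      "0f3d8a1b2c")                         # alert[4]: first 10 chars of tag md5
#
# The alert id, tag and values above are invented for the example.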


def check_kairosdb_alert(
        alert_config,
        service_config,
        logger,
        production_mode=True):
    """
    Args:
        alert_config (dict): Config of the alert to run
        service_config (dict): Holds things like urls, tokens and other things
        logger (log object): does the logging

    Returns:
        None
    """
    availability = False
    # SLEEP A RANDOM TIME BETWEEN 0 AND INTERVAL SO THAT ALL ALERTS DON'T
    # START AT THE SAME TIME
    wait_time = random.randint(0, alert_config['interval'])
    logger.info(
        "ALERT_CONFIG: {}\tsleep: {}".format(alert_config['id'], wait_time))
    sleep(wait_time)

    # For metrics with availability set to true, we default the interval to 5
    # mins due to Grafana limitations
    if 'availability' in alert_config and alert_config['availability']:
        availability = True

    # ====================
    # EACH CHECK JUST LOOPS
    # ====================
    ret = None
    while True:
        try:
            send_stat("check_run", 1, {'id': alert_config['id']})
            # BUILD URL FOR KAIROSDB METRICS AND QUERY FOR RESULTS
            query_url = (service_config['kairosdb_url'] +
                         "api/v1/datapoints/query")
            ret = requests.post(
                query_url,
                data=json.dumps(alert_config['query']),
                timeout=service_config['timeout'])
            assert ret.status_code == 200

            # GOT DATA BACK, NOW TO COMPARE IT TO THE THRESHOLD
            results = ret.json()['queries'][0]['results']
            logger.debug("Got back {} results for alert {}".format(
                len(results), alert_config['id']))
            log_alert_results(results, alert_config, logger)
            alert_list = []

            # LOOP THROUGH ALL THE RESULTS
            for r in results:
                alert_tags = (get_alert_tags(alert_config, r)
                              if has_custom_alert_routing(alert_config)
                              else None)

                # OUR QUERY RETURNED SOME VALUES - FIND MIN AND MAX VALUES
                # THEREIN AND EXAMINE FOR FAILURE
                if r['values']:
                    minvalue = min([x[1] for x in r['values']])
                    maxvalue = max([x[1] for x in r['values']])
                    # SEND VALUES TO BUILD_ALERT_MESSAGE, WHICH RETURNS NONE
                    # OR AN OBJECT
                    alert_list.append(build_alert_message(
                        alert_config, minvalue, maxvalue, r, logger,
                        availability, alert_tags=alert_tags))

                # THIS MEANS OUR KAIROS QUERY RETURNED NOTHING. COULD BE
                # NETWORK ISSUES. WE WILL TOLERATE THIS FOR X OCCURRENCES.
                # (X=10) AFTER X OCCURRENCES OF KAIROS NOT RETURNING DATA WE
                # WILL CLEAR AOM'S BRAIN FOR THIS ALERT ID AND TAG COMBINATION
                # TO AVOID A LATER OCCURRENCE CAUSING A PREMATURE ALERT.
                # A NO-OP IF NO HISTORY.
                elif 'alert_tags' in alert_config:
                    for key in alert_config['alert_tags']:
                        if ('count' not in key and 'noresult' not in key and
                                alert_config['alert_tags'][key] > 0):
                            key_noresult = key + "_noresult"
                            key_count = key + "_count"
                            if alert_config['alert_tags'][key_noresult] > 10:
                                logger.info(
                                    "{} occurrences of no results back for "
                                    "{}, clear out counts for tag '{}'".format(
                                        alert_config['alert_tags'][key_noresult],
                                        alert_config['id'], key))
                                alert_list.append(build_alert_message(
                                    alert_config, 0, 0, None, logger,
                                    availability, key,
                                    alert_tags=alert_tags))
                                alert_config['alert_tags'][key] = 0
                                alert_config['alert_tags'][key_count] = 0
                                alert_config['alert_tags'][key_noresult] = 0
                            else:
                                alert_config['alert_tags'][key_noresult] += 1
                                logger.info(
                                    "{} occurrences of no results back for "
                                    "{}, tag '{}'".format(
                                        alert_config['alert_tags'][key_noresult],
                                        alert_config['id'], key))

            # SEND ALL ALERTS FOUND TO THE ALERT HANDLERS THAT ARE NOT NONE
            for alert in [x for x in alert_list if x is not None]:
                if production_mode:
                    send_alerts(
                        alert,
                        copy.deepcopy(alert_config),
                        service_config['victorops_url'],
                        service_config['slack_url'],
                        service_config['slack_token'],
                        service_config['smtp_server'],
                        service_config['sensu_endpoint'],
                        service_config['uchiwa_url'],
                        logger)
                else:
                    logger.info(
                        "Sending alert for: {}".format(alert_config.get('id')))

        # HANDLE THE UNEXPECTED
        except (TimeoutError, requests.exceptions.Timeout):
            # requests raises its own Timeout rather than the builtin
            # TimeoutError, so catch both here
            logger.error("Query [{}] took too long to run".format(
                alert_config['id']))
        except AssertionError:
            logger.error(
                "KairosDB query failed: {}\n"
                "HTTP status code:\t{}\n"
                "Error Message:\t{}\nQuery:\n"
                "{}".format(
                    ret.url,
                    ret.status_code,
                    ret.text,
                    alert_config['query']))
        except gaierror:
            logger.error("Unable to connect to smtp server: {}".format(
                service_config['smtp_server']))
        except Exception as e:
            logger.error("Unhandled exception {} on alert: {}".format(
                str(e), alert_config['id']))
        finally:
            sleep(alert_config['interval'])
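
# Illustrative sketch of the inputs check_kairosdb_alert expects, based on the
# keys accessed above; the values shown are invented placeholders:
#
#     alert_config = {
#         'id': 'my_alert',
#         'interval': 60,            # seconds between check runs
#         'availability': True,      # optional; also emit a service_level stat
#         'query': {...},            # KairosDB datapoints query body
#     }
#     service_config = {
#         'kairosdb_url': 'http://kairosdb:8080/',   # must end with '/'
#         'timeout': 30,
#         'victorops_url': '...', 'slack_url': '...', 'slack_token': '...',
#         'smtp_server': '...', 'sensu_endpoint': '...', 'uchiwa_url': '...',
#     }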


def check_prometheus_alert(
        alert_config,
        service_config,
        logger,
        production_mode=True):
    """
    Args:
        alert_config (dict): Config of the alert to run
        service_config (dict): Holds things like urls, tokens and other things
        logger (log object): does the logging

    Returns:
        None
    """
    # SLEEP A RANDOM TIME BETWEEN 0 AND INTERVAL SO THAT ALL ALERTS DON'T
    # START AT THE SAME TIME
    wait_time = random.randint(0, alert_config['interval'])
    logger.info(
        "ALERT_CONFIG: {}\tsleep: {}".format(alert_config['id'], wait_time))
    sleep(wait_time)

    # For metrics with availability set to true, we default the interval to 5
    # mins due to Grafana limitations
    availability = bool(alert_config.get('availability'))

    # ====================
    # EACH CHECK JUST LOOPS
    # ====================
    ret = None
    while True:
        try:
            send_stat("check_run", 1, {'id': alert_config['id']})
            prom_api = PromAPI(endpoint=alert_config['prometheus_url'])
            ret = prom_api.query_range(
                query=alert_config['query'],
                start=alert_config['start_time'],
                end=alert_config['end_time'],
                duration=alert_config['interval'])

            assert ret['status'] == 'success'

            # GOT DATA BACK, NOW TO COMPARE IT TO THE THRESHOLD
            results = ret['data']['result']
            logger.debug("Got back {} results for alert {}".format(
                len(results), alert_config['id']))
            log_alert_results(results, alert_config, logger)
            alert_list = []

            # LOOP THROUGH ALL THE RESULTS
            for r in results:
                alert_tags = (get_alert_tags(alert_config, r)
                              if has_custom_alert_routing(alert_config)
                              else None)

                # REARRANGE RESULT TO MORE CLOSELY MATCH A KAIROSDB RESULT
                r['tags'] = {key: [value]
                             for (key, value) in r['metric'].items()}

                # OUR QUERY RETURNED SOME VALUES - FIND MIN AND MAX VALUES
                # THEREIN AND EXAMINE FOR FAILURE
                if r['values']:
                    # Prometheus returns sample values as strings, so convert
                    # them before taking min/max
                    raw_values = [float(value) for _, value in r['values']]
                    min_value = min(raw_values)
                    max_value = max(raw_values)
                    # SEND VALUES TO BUILD_ALERT_MESSAGE, WHICH RETURNS NONE
                    # OR AN OBJECT
                    alert_list.append(build_alert_message(
                        alert_config, min_value, max_value, r, logger,
                        availability, alert_tags=alert_tags))

                # THIS MEANS OUR QUERY RETURNED NOTHING. COULD BE NETWORK
                # ISSUES. WE WILL TOLERATE THIS FOR X OCCURRENCES. (X=10)
                # AFTER X OCCURRENCES OF NOT RETURNING DATA WE WILL CLEAR
                # AOM'S BRAIN FOR THIS ALERT ID AND TAG COMBINATION TO AVOID A
                # LATER OCCURRENCE CAUSING A PREMATURE ALERT. A NO-OP IF NO
                # HISTORY.
                elif 'alert_tags' in alert_config:
                    for key in alert_config['alert_tags']:
                        if ('count' not in key and 'noresult' not in key and
                                alert_config['alert_tags'][key] > 0):
                            key_noresult = key + "_noresult"
                            key_count = key + "_count"
                            if alert_config['alert_tags'][key_noresult] > 10:
                                logger.info(
                                    "{} occurrences of no results back for "
                                    "{}, clear out counts for tag '{}'".format(
                                        alert_config['alert_tags'][key_noresult],
                                        alert_config['id'], key))
                                alert_list.append(build_alert_message(
                                    alert_config, 0, 0, None, logger,
                                    availability, key,
                                    alert_tags=alert_tags))
                                alert_config['alert_tags'][key] = 0
                                alert_config['alert_tags'][key_count] = 0
                                alert_config['alert_tags'][key_noresult] = 0
                            else:
                                alert_config['alert_tags'][key_noresult] += 1
                                logger.info(
                                    "{} occurrences of no results back for "
                                    "{}, tag '{}'".format(
                                        alert_config['alert_tags'][key_noresult],
                                        alert_config['id'], key))

            # SEND ALL ALERTS FOUND TO THE ALERT HANDLERS THAT ARE NOT NONE
            for alert in [x for x in alert_list if x is not None]:
                if production_mode:
                    send_alerts(
                        alert,
                        copy.deepcopy(alert_config),
                        service_config['victorops_url'],
                        service_config['slack_url'],
                        service_config['slack_token'],
                        service_config['smtp_server'],
                        service_config['sensu_endpoint'],
                        service_config['uchiwa_url'],
                        logger)
                else:
                    logger.info(
                        "Sending alert {}".format(alert_config.get('id')))

        # HANDLE THE UNEXPECTED
        except TimeoutError:
            logger.error("Query [{}] took too long to run".format(
                alert_config['id']))
        except AssertionError:
            logger.error(
                "Prometheus query failed:\n"
                "Status:\t{}\n"
                "Error Type:\t{}\n"
                "Error Message:\t{}\n"
                "Query:\n{}".format(
                    ret['status'],
                    ret['errorType'],
                    ret['error'],
                    alert_config['query']))
        except gaierror:
            logger.error("Unable to connect to smtp server: {}".format(
                service_config['smtp_server']))
        except Exception as e:
            logger.error("Unhandled exception {} on alert: {}".format(
                str(e), alert_config['id']))
        finally:
            sleep(alert_config['interval'])
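
# Illustrative sketch of the reshaping done above for a single Prometheus
# range-query result so it can be consumed like a KairosDB result (label and
# sample values are invented):
#
#     r = {'metric': {'dc': 'iad1', 'service': 'api'},
#          'values': [[1510000000, '0.93'], [1510000060, '0.95']]}
#
#     # after the dict comprehension:
#     r['tags'] == {'dc': ['iad1'], 'service': ['api']}
#
# The sample values arrive as strings, which is why they are converted to
# float before min()/max() are taken.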


# LOG ALERT RESULTS SO WE CAN DEBUG IF NEEDED
def log_alert_results(results, alert_config, logger):
    """
    Logs the results, broken out by tag provided in the alert_config, to the
    logger for debugging.

    Args:
        results: the results list returned by the KairosDB or Prometheus query
        alert_config: config object of the alert
        logger (log object): does the logging

    Returns:
        None, logs to logger
    """
    for v in results:
        logger.debug("{} - Result: {}".format(alert_config['id'], v))


def send_alerts(
        alert,
        alert_config,
        victorops_url,
        slack_url,
        slack_token,
        smtp_server,
        sensu_endpoint,
        uchiwa_url,
        logger):
    """
    Sends out the alerts to VictorOps, Email, Sensu and/or Slack.

    Args:
        alert: the alert tuple:
            alert[0] == subject/entity, alert[1] == body,
            alert[2] == status level (index into alert_status),
            alert[3] == alert_tags, alert[4] == md5sum of the tag
        alert_config: the alert configuration object
        victorops_url: url to victorops
        slack_url: url to slack api calls
        slack_token: the token for the alert
        smtp_server: the server to send mail messages to
        sensu_endpoint: url that Sensu check results are posted to
        uchiwa_url: url of the Uchiwa dashboard, used in alert links
        logger (log object): does the logging

    Returns: None
    """
    # GOING TO USE THIS FOR TAGGING SOME METRICS ABOUT WHAT ALERT CHANNEL WAS
    # USED
    tag_dict = dict()
    tag_dict['alert'] = alert_config['id']

    is_custom_alert_routing = has_custom_alert_routing(alert_config)
    if is_custom_alert_routing:
        alert_routing = alert_config.get('alert_routing_lookup', {})
        alert_config['alerts'] = alert_routing.get(
            alert[3], alert_config['alerts']['lookup']['default'])

    # once we move all alerts into sensu, we don't need to do this
    if 'filters' in alert_config:
        logger.info("alert_status : {}, alert_config: {}".format(
            alert[2], alert_config))
        if ('slack_subdue' in alert_config['filters'] and
                alert[2] in (1, 2) and
                alert_config['filters']['slack_subdue']):
            # unless the alert is critical we don't send it
            logger.info("Removed slack, alert_config: {}".format(alert_config))
            alert_config['alerts'].pop('slack', None)
        if ('victorops_subdue' in alert_config['filters'] and
                alert[2] in (1, 2) and
                alert_config['filters']['victorops_subdue']):
            # unless the alert is critical we don't send it
            alert_config['alerts'].pop('vo', None)
            logger.info("Removed vo, alert_config: {}".format(alert_config))

    # ====================
    # VICTOROPS HANDLING
    # ====================
    if 'vo' in alert_config['alerts']:
        for notify in alert_config['alerts']['vo']:
            payload = dict(entity_id=alert[0],
                           message_type=alert_status[alert[2]],
                           state_message=alert[1])
            r = None
            try:
                r = requests.post(
                    victorops_url + notify,
                    data=json.dumps(payload),
                    headers={"Content-Type": "application/json"})
                assert r.status_code == 200
                # Record a VO alert sent event
                tag_dict['alert_channel_type'] = "VictorOps"
                tag_dict['who'] = "vo:{}".format(notify)
                send_stat("alert_channel", 1, tag_dict)
                # logger.info("TestInfo: {} alert for {}".format(
                #     alert_status[alert[2]], alert[0]))
            except AssertionError:
                logger.error("Post to VO failed for {}\n{}:\t{}".format(
                    alert_config['id'], r.status_code, r.text))
            except Exception as e:
                logger.error("Unhandled exception for alert_id:{} "
                             "when posting to VO: {}".format(
                                 alert_config['id'], str(e)))

    # ====================
    # EMAIL HANDLING
    # ====================
    if 'email' in alert_config['alerts'] and alert[2] in (0, 1, 3):
        msg = MIMEText(alert[1])
        msg['Subject'] = '{} Status: {}'.format(
            alert[0], alert_status[alert[2]])
        msg['From'] = 'aom@qualtrics.com'
        msg['To'] = ','.join(
            [x + "@qualtrics.com" for x in alert_config['alerts']['email']])
        try:
            s = smtplib.SMTP(smtp_server)
            s.send_message(msg)
            s.quit()
            # Record an Email alert sent event
            tag_dict['alert_channel_type'] = "Email"
            tag_dict['who'] = "email:{}".format(msg['To'])
            send_stat("alert_channel", 1, tag_dict)
            # logger.info("TestInfo: {} alert for {}".format(
            #     alert_status[alert[2]], alert[0]))
        except Exception as e:
            logger.error(
                "Unhandled exception when sending mail for {} to {}\n{}".format(
                    alert_config['id'], smtp_server, str(e)))

    # ====================
    # SENSU HANDLING
    # ====================
    if 'sensu' in alert_config['alerts']:
        # Dictionary with static values for Sensu
        sensu_dict = {
            'source': 'AOM',
            'refresh': 3600,
            'occurrences': 1,
            'name': alert_config['id'] + '__' + alert[4]}
        # if alert[3]:
        #     logger.info(alert)
        #     sensu_dict['name'] = '_'.join(
        #         [alert_config['id']] + sorted(list(alert[3])))
        if 'refresh' in alert_config:
            sensu_dict['refresh'] = alert_config['refresh']
        sensu_dict['interval'] = alert_config['interval']
        sensu_dict['handlers'] = []
        sensu_dict['dashboard'] = alert_config['url']
        if 'dependencies' in alert_config['alerts']['sensu']:
            sensu_dict['dependencies'] = (
                alert_config['alerts']['sensu']['dependencies'])
        if 'victorops' in alert_config['alerts']['sensu']:
            sensu_dict['handlers'].append("victorops")
            sensu_dict['routing_key'] = (
                alert_config['alerts']['sensu']['victorops'])
        # # Leave this here until we have email support in Sensu
        # if 'email' in alert_config['alerts']['sensu']:
        #     sensu_dict['handlers'].append("email")
        #     # verify this option
        #     sensu_dict['email'] = alert_config['alerts']['sensu']['email']
        if 'slack' in alert_config['alerts']['sensu']:
            sensu_dict['handlers'].append("slack")
            sensu_dict['slack_channel'] = (
                alert_config['alerts']['sensu']['slack'])
            # Format alert message
            sensu_dict['dashboard'] = (
                "<{}|here> , Uchiwa: <{}?check={}|here> ".format(
                    alert_config['url'], uchiwa_url, alert_config['id']))
        if 'jira' in alert_config['alerts']['sensu']:
            sensu_dict['handlers'].append("jira")
            sensu_dict.update(alert_config['alerts']['sensu']['jira'])
        if 'filters' in alert_config:
            sensu_dict['filters'] = alert_config['filters']
        # 0 = OK, 1 = WARNING, 2 = CRITICAL
        sensu_status = {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2}
        sensu_dict['status'] = sensu_status[alert[2]]
        sensu_dict['output'] = alert[1]

        r = None
        try:
            user = os.environ['API_USER']
            passwd = os.environ['API_PASS']
            r = requests.post(
                sensu_endpoint,
                json.dumps(sensu_dict),
                auth=(user, passwd))
            assert r.status_code == 202
        except AssertionError:
            logger.error("Post to Sensu failed {}\n{}:\t{}".format(
                alert_config['id'], r.status_code, r.text))
        except Exception as e:
            logger.error("Unhandled exception for alert_id:{} "
                         "when posting to Sensu: {}".format(
                             alert_config['id'], str(e)))

    # ====================
    # SLACK HANDLING - all Slack alerts will go through Sensu
    # ====================
    if 'slack' in alert_config['alerts'] and alert[2] in (0, 1, 3):
        refresh = alert_config.get('refresh', 3600)
        dashboard = alert_config.get('url', '')
        sensu_status = {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2}
        sensu_dict2 = {'handlers': ['slack'],
                       'interval': alert_config['interval'],
                       'source': 'AOM',
                       'refresh': refresh,
                       'occurrences': 1,
                       'name': alert_config['id'] + '__' + alert[4],
                       'dashboard': dashboard,
                       'status': sensu_status[alert[2]],
                       'output': alert[1]}
        if is_custom_alert_routing:
            sensu_dict2['name'] = '_'.join(
                [alert_config['id']] + list(alert[3]))
            sensu_dict2['dashboard'] = (
                "<{}|here> , Uchiwa: <{}?check={}|here> ".format(
                    alert_config['url'], uchiwa_url, alert_config['id']))
        for channel in alert_config['alerts']['slack']:
            sensu_dict2['slack_channel'] = channel
            r = None
            try:
                user = os.environ['API_USER']
                passwd = os.environ['API_PASS']
                r = requests.post(
                    sensu_endpoint,
                    json.dumps(sensu_dict2),
                    auth=(user, passwd))
                assert r.status_code == 202
            except AssertionError:
                logger.error("Post to Sensu failed {}\n{}:\t{}".format(
                    alert_config['id'], r.status_code, r.text))
            except Exception as e:
                logger.error("Unhandled exception for alert_id:{} when "
                             "posting to Sensu: {}".format(
                                 alert_config['id'], str(e)))

            # payload = dict(token=slack_token, channel=channel,
            #                text="{} Status: {}".format(
            #                    alert[1], alert_status[alert[2]]))
            # r = None
            # try:
            #     r = requests.post(slack_url, data=payload)
            #     assert r.status_code == 200
            #     # Record a Slack alert sent event
            #     tag_dict['alert_channel_type'] = "Slack"
            #     tag_dict['who'] = "slack:{}".format(channel)
            #     send_stat("alert_channel", 1, tag_dict)
            #     # logger.info("TestInfo: {} alert for {}".format(
            #     #     alert_status[alert[2]], alert[0]))
            # except AssertionError:
            #     logger.error("Post to Slack failed for {}\n{}:\t{}".format(
            #         alert_config['id'], r.status_code, r.text))
            # except Exception as e:
            #     logger.error("Unhandled exception for alert_id:{} when "
            #                  "posting to Slack: {}".format(
            #                      alert_config['id'], str(e)))
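
# Illustrative sketch of the payload send_alerts posts to sensu_endpoint for a
# Sensu-routed alert (field values are invented; see sensu_dict above):
#
#     {
#         "source": "AOM",
#         "name": "my_alert__0f3d8a1b2c",   # alert id + '__' + tag md5 prefix
#         "interval": 60,
#         "refresh": 3600,
#         "occurrences": 1,
#         "handlers": ["victorops", "slack"],
#         "routing_key": "my-team",         # when a victorops handler is set
#         "slack_channel": "#my-alerts",    # when a slack handler is set
#         "dashboard": "<grafana url|here> , Uchiwa: <uchiwa url?check=my_alert|here> ",
#         "status": 2,                      # 0 = OK, 1 = WARNING, 2 = CRITICAL
#         "output": "Metric: my_alert for iad1\n95.00 >= 90\n..."
#     }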


def send_metrics(alert, value, result, gaugename='stats'):
    """
    Sends the results from the alert check to statsd.

    Args:
        alert: the alert config object that holds the alert['tags'] and
            alert['id'] values
        value: the value we want to send as a gauge
        result: the result object from making the call; the data in this
            object is used to tag the metric
        gaugename: the name of the gauge metric we send

    Returns: None
    """
    # GROUP ALL THE ALERTS TOGETHER SO THAT PEOPLE CAN FILTER OUT THEIR
    # SPECIFIC ALERTS BY TAG
    result_tags = list(itertools.chain(
        *[result['tags'][x] for x in alert['tags']]))
    tag_dict = dict()
    for x in range(len(alert['tags'])):
        tag_dict[alert['tags'][x]] = result_tags[x]
    tag_dict['alert'] = alert['id']

    # SEND THE METRIC
    send_stat(gaugename, value, tag_dict)
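
# Illustrative example of the tag dict send_metrics builds (values invented):
# with alert = {'id': 'my_alert', 'tags': ['dc', 'service']} and
# result['tags'] == {'dc': ['iad1'], 'service': ['api']}, the gauge is sent
# with tag_dict == {'dc': 'iad1', 'service': 'api', 'alert': 'my_alert'}.
# The positional pairing above assumes each configured tag has exactly one
# value in the result.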


def send_stat(gaugename, value, tag_dict, statprefix='aom'):
    """Sends a gauge value to statsd."""
    client = StatsClient('telegraf', 8125, statprefix)

    # SUBMIT STATS
    client.gauge(gaugename, value, tags=tag_dict)
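
# Illustrative usage (mirrors the calls made elsewhere in this module):
#
#     send_stat('check_run', 1, {'id': 'my_alert'})
#
# The gauge name is prefixed with 'aom' via the StatsClient prefix argument,
# and the host 'telegraf' / port 8125 are hard-coded above. Passing tags= to
# gauge() assumes a tag-aware statsd client (e.g. a Telegraf-style fork of the
# statsd package), not the plain upstream statsd client.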


def has_custom_alert_routing(alert_config):
    """Checks if the alert has custom routing configured."""
    return 'lookup' in alert_config['alerts']


def get_alert_tags(alert_config, query_result):
    """Retrieves the custom routing tags from a query result."""
    query_tags = []
    for tag in alert_config['alerts']['lookup']['tags']:
        if (alert_config.get('query_type') == 'prometheus' and
                'metric' in query_result and
                tag in query_result['metric']):
            query_tags.append(query_result['metric'][tag])
        elif ('tags' in query_result and tag in query_result['tags']
                and query_result['tags'][tag]):
            query_tags.append(query_result['tags'][tag][0])
    return tuple(query_tags)
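
# Illustrative sketch of the custom-routing ('lookup') configuration these two
# helpers and send_alerts work against (keys inferred from the code above;
# values are invented):
#
#     alert_config['alerts'] = {
#         'lookup': {
#             'tags': ['dc'],                        # tags forming the routing key
#             'default': {'slack': ['#my-alerts']},  # fallback alert channels
#         },
#     }
#     alert_config['alert_routing_lookup'] = {
#         ('iad1',): {'vo': ['my-team']},  # keyed by the tuple get_alert_tags returns
#     }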