""" Alert On Metrics functions""" import copy import itertools import json import os import random import smtplib from email.mime.text import MIMEText from socket import gaierror from time import sleep from hashlib import md5 import requests from statsd import StatsClient import redis alert_status = [ 'RECOVERY', 'WARNING', 'WARNING', 'CRITICAL', 'CRITICAL', 'CRITICAL'] def build_alert_message(alert, minvalue, maxvalue, result, logger, availability, tag=None, alert_tags=None): """ Build the alert message Args: alert: the alert object that includes a tag definition minvalue: the min value to test against the threshold maxvalue: the max value to test against the threshold result: the response back from kairosdb logger (log object): does the logging availability: Send availability stat 1 tag: If passed in will use this value for the tag instead of getting it from the result object alert_tags: the tags corresponding to the result, used if an alert has to be triggered and a custom routing per tag is configured Returns: Alert message string """ # DEFAULT TO MAX VALUE AS THE VALUE WE WILL ALERT ON. LOGIC BELOW # MAY CHANGE THIS. value = maxvalue # HANDLE THE CASE WHERE SOMEONE HAS NOT SPECIFIED ANY TAGS IN THEIR QUERY # (USUALLY A GLOBAL ALL-DC QUERY) if tag is None and result is not None: tag = ', '.join(sorted(list(itertools.chain( *[result['tags'][x] for x in alert['tags']])))) tag_count = tag + "_count" # WE WILL USE THIS ONE LATER FOR TRACKING OCCURRENCES OF KAIROSDB NOT # RETURNING RESULTS tag_noresult = tag + "_noresult" if not tag: tag = 'instance' logger.debug("No tag specified for alert {}".format(alert['id'])) # INSTEAD OF TRYING TO HANDLE LOGIC WHERE THESE ARE NOT IN THE OBJECT, PUT # THEM IN AS SOON AS THEY ARE CREATED SO THAT ON FIRST RUN AN ALERT HAS ALL # THE ALERT['alert_tags'][TAG] AND ALERT['alert_tags'][TAG_COUNT] NEEDED if 'alert_tags' not in alert: alert['alert_tags'] = {} if tag not in alert['alert_tags']: alert['alert_tags'][tag] = 0 if tag_count not in alert['alert_tags']: alert['alert_tags'][tag_count] = 0 # IF WE HIT THIS FUNCTION THEN WE ALWAYS SET (OR RESET) THIS NORESULT # COUNTER TO 0 IE. 

    # IF WE HIT THIS FUNCTION THEN WE ALWAYS SET (OR RESET) THIS NORESULT
    # COUNTER TO 0, IE. IF WE ARE HERE IT IMPLIES WE HAVE A RESULT FROM
    # KAIROSDB OR WE ARE AT THE END OF A LONG PERIOD OF NORESULTS WHERE WE ARE
    # CLEARING EVERYTHING OUT ANYWAY
    alert['alert_tags'][tag_noresult] = 0

    # FIRST FIND OUT WHAT THRESHOLDS ARE SET AND HAVE BEEN BREACHED
    upper_critical_threshold = None
    upper_warning_threshold = None
    lower_warning_threshold = None
    lower_critical_threshold = None
    upper_threshold = None
    lower_threshold = None
    is_warning_alarm = False
    is_critical_alarm = False

    # UPPER
    upper_threshold_exists = False
    upper_warning_threshold_breached = False
    upper_critical_threshold_breached = False
    if 'warning_upper_threshold' in alert:
        upper_threshold_exists = True
        upper_warning_threshold = alert['warning_upper_threshold']
        upper_threshold = upper_warning_threshold
        if maxvalue >= upper_warning_threshold:
            upper_warning_threshold_breached = True
            is_warning_alarm = True
    if 'critical_upper_threshold' in alert:
        upper_critical_threshold = alert['critical_upper_threshold']
        if not upper_threshold_exists:
            upper_threshold = upper_critical_threshold
            upper_threshold_exists = True
        # IF CONFIG HAS A CRITICAL THRESHOLD SET AND WE PASS THAT THEN THAT IS
        # OUR THRESHOLD FOR ALERTING
        if maxvalue >= upper_critical_threshold:
            upper_threshold = upper_critical_threshold
            upper_critical_threshold_breached = True
            is_critical_alarm = True
    upper_threshold_breached = (upper_warning_threshold_breached or
                                upper_critical_threshold_breached)

    # LOWER
    lower_threshold_exists = False
    lower_warning_threshold_breached = False
    lower_critical_threshold_breached = False
    if 'warning_lower_threshold' in alert:
        lower_threshold_exists = True
        lower_warning_threshold = alert['warning_lower_threshold']
        lower_threshold = lower_warning_threshold
        if minvalue <= lower_warning_threshold:
            lower_warning_threshold_breached = True
            is_warning_alarm = True
    if 'critical_lower_threshold' in alert:
        lower_critical_threshold = alert['critical_lower_threshold']
        if not lower_threshold_exists:
            lower_threshold = lower_critical_threshold
            lower_threshold_exists = True
        # IF CONFIG HAS A CRITICAL THRESHOLD SET AND WE PASS THAT THEN THAT IS
        # OUR THRESHOLD FOR ALERTING
        if minvalue <= lower_critical_threshold:
            lower_threshold = lower_critical_threshold
            lower_critical_threshold_breached = True
            is_critical_alarm = True
    lower_threshold_breached = (lower_warning_threshold_breached or
                                lower_critical_threshold_breached)

    # THIS HAS TO MEAN THERE IS A PROBLEM WITH THE ALERT CONFIG
    if lower_threshold is None and upper_threshold is None:
        logger.debug(
            "ERROR: alert {} does not have any thresholds set on {}".format(
                alert['id'], tag))

    # ON TO OCCURRENCES
    if 'occurrences_threshold' in alert:
        occurrences_threshold = alert['occurrences_threshold']
    else:
        occurrences_threshold = 1

    alert_entity = "Metric: {} for {}".format(alert['id'], tag)
    if 'url' not in alert:
        alert['url'] = os.environ['AOM_GRAFANA_URL'] + str(alert['id'])

    # ====================
    # PREPARE ALERT BODY STRING AND SET THE VALUE WE WILL USE TO ALERT WITH
    # ====================
    alert_body = ''
    if upper_threshold_breached:
        alert_body = "{}\n{:.2f} >= {}\n{}".format(
            alert_entity, value, upper_threshold, alert['url'])
    if lower_threshold_breached:
        value = minvalue
        alert_body = "{}\n{:.2f} <= {}\n{}".format(
            alert_entity, value, lower_threshold, alert['url'])
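
    # For a hypothetical alert the resulting alert_body reads roughly:
    #   Metric: api_error_rate for dc1
    #   12.00 >= 10
    #   https://grafana.example.com/api_error_rate
    # (the id, tag, values and URL here are illustrative only).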

    # SEND SOME STATS OUT AT THIS POINT AS WE KNOW WHERE WE ARE NOW. SEND THE
    # THRESHOLDS TOO SO THEY CAN BE GRAPHED
    if result is not None:
        send_metrics(alert, value, result)
        if 'critical_upper_threshold' in alert:
            send_stat('upper_critical_threshold', upper_critical_threshold,
                      {'id': alert['id']})
        if 'warning_upper_threshold' in alert:
            send_stat('upper_warning_threshold', upper_warning_threshold,
                      {'id': alert['id']})
        if 'critical_lower_threshold' in alert:
            send_stat('lower_critical_threshold', lower_critical_threshold,
                      {'id': alert['id']})
        if 'warning_lower_threshold' in alert:
            send_stat('lower_warning_threshold', lower_warning_threshold,
                      {'id': alert['id']})

    # NO RESULT OVERRIDES ALL
    if result is None:
        lower_threshold_breached = False
        upper_threshold_breached = False

    # ====================
    # APPLY OUR LOGIC TO MAKE SOME DECISIONS
    # ====================
    current_alert_status = alert_status[0]
    if not lower_threshold_breached and not upper_threshold_breached:
        if result is not None:
            if lower_threshold_exists and not upper_threshold_exists:
                alert_body = "{}\n{:.2f} > {}\n{}".format(
                    alert_entity, value, lower_threshold, alert['url'])
                logger.debug("GOOD: alert {} is higher than lower threshold "
                             "{} for value {} on tag {}".format(
                                 alert['id'], lower_threshold, value, tag))
            if upper_threshold_exists and not lower_threshold_exists:
                alert_body = "{}\n{:.2f} < {}\n{}".format(
                    alert_entity, value, upper_threshold, alert['url'])
                logger.debug("GOOD: alert {} is below the upper threshold {} "
                             "for value {} on tag {}".format(
                                 alert['id'], upper_threshold, value, tag))
            if upper_threshold_exists and lower_threshold_exists:
                alert_body = "{}\n{} < {:.2f} < {}\n{}".format(
                    alert_entity, lower_threshold, value, upper_threshold,
                    alert['url'])
                logger.debug("GOOD: alert {} is between thresholds {} and {} "
                             "for value {} on tag {}".format(
                                 alert['id'], upper_threshold,
                                 lower_threshold, value, tag))

        # CHECK AND SEE IF TAG LOGIC IS SET, IE. WE WERE PREVIOUSLY IN ALARM
        # STATE
        if alert['alert_tags'][tag] > 0:
            if result is not None:
                send_metrics(alert, 1, result, current_alert_status)
            logger.info(
                "TestInfo: RECOVERY: Clearing values for [{}] - {}".format(
                    alert['id'], tag))
            if result is None:
                alert_body = ("{} RECOVERY due to no results found from "
                              "KairosDB query. Recommend you manually "
                              "validate recovery.\n{}").format(
                                  alert_entity, alert['url'])
            alert['alert_tags'][tag] = 0
            alert['alert_tags'][tag_count] = 0
            if availability:
                logger.info("Sending availability stat 1")
                send_metrics(alert, 1, result, 'service_level')
        else:
            # WE RETURN NONE IF NO ALERT (EITHER RECOVERY OR WARNING OR
            # CRITICAL) NEEDS TO BE FIRED
            alert['alert_tags'][tag_count] = 0
            if availability:
                logger.info("Sending availability stat 1")
                send_metrics(alert, 1, result, 'service_level')
            return None
    else:
        # ====================
        # SET KEY / VALUE FOR TAG ON ALERT
        # 0 == No Alert
        # 1 == Warning
        # 2 == Existing Warning Alert
        # 3 == New Critical
        # 4+ == Existing Critical Alert
        # ====================
        # INCREMENT THE OCCURRENCE COUNT FOR THIS TAG
        alert['alert_tags'][tag_count] += 1

        # THE ALERT WON'T FIRE UNLESS TAG_COUNT HAS REACHED THE OCCURRENCES
        # THRESHOLD (EITHER 1 OR WHATEVER WAS CONFIGURED). ONCE IT HAS, WE
        # RETURN THE ALERT.
        if alert['alert_tags'][tag_count] >= occurrences_threshold:
            # >= 4 MEANS THIS IS A KNOWN CRITICAL, SO NO-OP
            if alert['alert_tags'][tag] < 4:
                if is_warning_alarm and not is_critical_alarm:
                    # THIS HANDLES GOING STRAIGHT FROM NORMAL TO WARNING LEVEL
                    if alert['alert_tags'][tag] == 0:
                        # NEW WARNING
                        alert['alert_tags'][tag] = 1
                        logger.info("TestInfo: WARNING (NEW): {} - {}".format(
                            alert['id'], tag))
                    else:
                        # EXISTING WARNING
                        alert['alert_tags'][tag] = 2
                        logger.info(
                            "TestInfo: WARNING (EXISTING): {} - {}".format(
                                alert['id'], tag))
                if is_critical_alarm:
                    # THIS HANDLES GOING FROM WARNING LEVEL TO CRITICAL LEVEL
                    if (alert['alert_tags'][tag] == 1 or
                            alert['alert_tags'][tag] == 2):
                        alert['alert_tags'][tag] = 3
                        logger.info(
                            "TestInfo: CRITICAL (WAS WARNING): {} - {}".format(
                                alert['id'], tag))
                    else:
                        # THIS HANDLES GOING STRAIGHT FROM NORMAL TO CRITICAL
                        # LEVEL
                        if alert['alert_tags'][tag] < 3:
                            # NEW CRITICAL
                            alert['alert_tags'][tag] = 3
                            logger.info(
                                "TestInfo: CRITICAL (NEW): {} - {}".format(
                                    alert['id'], tag))
                        else:
                            # EXISTING CRITICAL
                            alert['alert_tags'][tag] = 4
                            logger.info(
                                "TestInfo: CRITICAL (EXISTING): {} - {}".format(
                                    alert['id'], tag))
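
            # A hypothetical walk-through of the state values above for one
            # tag: a breach of only the warning threshold moves 0 -> 1 and a
            # repeat moves it to 2; a critical breach moves 0/1/2 -> 3 and a
            # repeat moves 3 -> 4, after which the state is left alone (the
            # < 4 check above) until a recovery resets it to 0.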

            # RECORD THE FACT THAT SOMETHING IS STILL IN ALARM STATE IN METRICS
            # EVEN IF NOT ACTIVELY ALERTING ON IT
            if is_critical_alarm:
                current_alert_status = alert_status[3]
                send_metrics(alert, 2, result, current_alert_status)
                if availability:
                    logger.info("Sending availability stat 0")
                    send_metrics(alert, 0, result, 'service_level')
            if is_warning_alarm and not is_critical_alarm:
                current_alert_status = alert_status[1]
                send_metrics(alert, 1, result, current_alert_status)
                if availability:
                    logger.info("Sending availability stat 1")
                    send_metrics(alert, 1, result, 'service_level')
            logger.debug("{} alert for value {} of {} for tag {} has occurred "
                         "{} times. Threshold is >= {} times.".format(
                             current_alert_status, value, alert['id'], tag,
                             alert['alert_tags'][tag_count],
                             occurrences_threshold))
        else:
            # WE RETURN NONE IF NO ALERT (EITHER RECOVERY OR WARNING OR
            # CRITICAL) NEEDS TO BE FIRED
            logger.debug("Value {} of {} for tag {} has occurred {} time(s) < "
                         "threshold of {}".format(
                             value, alert['id'], tag,
                             alert['alert_tags'][tag_count],
                             occurrences_threshold))
            if availability:
                logger.info("Sending availability stat")
                send_metrics(alert, 1, result, 'service_level')
            return None

    logger.debug(
        "Alert {}->[{}]->{}, Occurrences={}".format(
            alert['id'], tag, current_alert_status,
            alert['alert_tags'][tag_count]))
    return (alert_entity, alert_body, alert['alert_tags'][tag], alert_tags,
            md5(tag.encode('utf-8')).hexdigest()[:10])


def check_kairosdb_alert(
        alert_config, service_config, logger, production_mode=True):
    """ Runs the KairosDB check loop for a single alert config

    Args:
        alert_config (dict): Config of the alert to run
        service_config (dict): Holds things like urls, tokens and other things
        logger (log object): does the logging
        production_mode (bool): when False, log instead of sending alerts

    Returns:
        None
    """
    availability = False
    # SLEEP A RANDOM TIME BETWEEN 0 AND INTERVAL SO THAT ALL ALERTS DON'T
    # START AT THE SAME TIME
    wait_time = random.randint(0, alert_config['interval'])
    logger.info(
        "ALERT_CONFIG: {}\tsleep: {}".format(
            alert_config['id'], wait_time))
    sleep(wait_time)

    # For metrics with availability set to true, we default the interval to 5
    # mins due to Grafana limitations
    if 'availability' in alert_config and alert_config['availability']:
        availability = True

    # ====================
    # EACH CHECK JUST LOOPS
    # ====================
    ret = None
    while True:
        try:
            send_stat("check_run", 1, {'id': alert_config['id']})
            # BUILD URL FOR KAIROSDB METRICS AND QUERY FOR RESULTS
            query_url = (service_config['kairosdb_url'] +
                         "api/v1/datapoints/query")
            ret = requests.post(
                query_url,
                data=json.dumps(alert_config['query']),
                timeout=service_config['timeout'])
            assert ret.status_code == 200

            # GOT DATA BACK, NOW TO COMPARE IT TO THE THRESHOLD
            results = ret.json()['queries'][0]['results']
            logger.debug(
                "Got back {} results for alert {}".format(
                    len(results), alert_config['id']))
            log_alert_results(results, alert_config, logger)
            alert_list = []
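
            # For reference, each KairosDB result in `results` is expected to
            # look roughly like (illustrative values):
            #   {'name': 'api.error.rate',
            #    'tags': {'dc': ['dc1']},
            #    'values': [[1500000000000, 0.2], [1500000060000, 0.4]]}
            # i.e. 'values' is a list of [timestamp, value] pairs and 'tags'
            # maps each tag name to a list of values.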
            # LOOP THROUGH ALL THE RESULTS
            for r in results:
                alert_tags = (get_alert_tags(alert_config, r)
                              if has_custom_alert_routing(alert_config)
                              else None)
                # OUR QUERY RETURNED SOME VALUES - FIND MIN AND MAX VALUES
                # THEREIN AND EXAMINE FOR FAILURE
                if r['values']:
                    minvalue = min([x[1] for x in r['values']])
                    maxvalue = max([x[1] for x in r['values']])
                    # SEND VALUES TO BUILD_ALERT_MESSAGE, WHICH RETURNS NONE
                    # OR AN OBJECT
                    alert_list.append(
                        build_alert_message(
                            alert_config, minvalue, maxvalue, r, logger,
                            availability, alert_tags=alert_tags))
                # THIS MEANS OUR KAIROS QUERY RETURNED NOTHING. COULD BE
                # NETWORK ISSUES. WE WILL TOLERATE THIS FOR X OCCURRENCES.
                # (X=10) AFTER X OCCURRENCES OF KAIROS NOT RETURNING DATA WE
                # WILL CLEAR AOM'S BRAIN FOR THIS ALERT ID AND TAG COMBINATION
                # TO AVOID A LATER OCCURRENCE CAUSING A PREMATURE ALERT.
                # A NO-OP IF NO HISTORY.
                elif 'alert_tags' in alert_config:
                    for key in alert_config['alert_tags']:
                        if ('count' not in key and 'noresult' not in key and
                                alert_config['alert_tags'][key] > 0):
                            key_noresult = key + "_noresult"
                            key_count = key + "_count"
                            if alert_config['alert_tags'][key_noresult] > 10:
                                logger.info(
                                    "{} occurrences of no results back for "
                                    "{}, clear out counts for tag '{}'".format(
                                        alert_config['alert_tags'][key_noresult],
                                        alert_config['id'], key))
                                alert_list.append(
                                    build_alert_message(
                                        alert_config, 0, 0, None, logger,
                                        availability, key,
                                        alert_tags=alert_tags))
                                alert_config['alert_tags'][key] = 0
                                alert_config['alert_tags'][key_count] = 0
                                alert_config['alert_tags'][key_noresult] = 0
                            else:
                                alert_config['alert_tags'][key_noresult] += 1
                                logger.info(
                                    "{} occurrences of no results back for "
                                    "{}, tag '{}'".format(
                                        alert_config['alert_tags'][key_noresult],
                                        alert_config['id'], key))

            # SEND ALL ALERTS FOUND TO THE ALERT HANDLERS THAT ARE NOT NONE
            alert_list = [x for x in alert_list if x is not None]
            set_firing(alert_config['id'], alert_list)
            clear_suppressed(alert_config,
                             [alert[3] for alert in alert_list])
            for alert in alert_list:
                if production_mode:
                    send_alerts(
                        alert,
                        copy.deepcopy(alert_config),
                        service_config['victorops_url'],
                        service_config['slack_url'],
                        service_config['slack_token'],
                        service_config['smtp_server'],
                        service_config['sensu_endpoint'],
                        service_config['uchiwa_url'],
                        logger)
                else:
                    logger.info(
                        "Sending alert for: {}".format(
                            alert_config.get('id')))

        # HANDLE THE UNEXPECTED
        except (TimeoutError, requests.exceptions.Timeout):
            logger.error("Query [{}] took too long to run".format(
                alert_config['id']))
        except AssertionError:
            logger.error(
                "KairosDB query failed: {}\n"
                "HTTP status code:\t{}\n"
                "Error Message:\t{}\nQuery:\n"
                "{}".format(
                    ret.url, ret.status_code, ret.text,
                    alert_config['query']))
        except gaierror:
            logger.error(
                "Unable to connect to smtp server: {}".format(
                    service_config['smtp_server']))
        except SuppressedException as e:
            logger.warning(
                "Skipping alert check {} as it's suppressed: {}".format(
                    alert_config['id'], e))
        except Exception as e:
            logger.error(
                "Unhandled exception {} on alert: {}".format(
                    str(e), alert_config['id']))
        finally:
            sleep(alert_config['interval'])


def stringify_alert_tags(alert_tags):
    if not alert_tags:
        return "-"
    for i in alert_tags:
        if i.lower() == "dc" or i.lower() == "datacenter":
            return str(alert_tags[i])
    return "-"


def is_suppressed(alert_config, alert_tags):
    ret = False
    for dependency in alert_config['resolvedDependencies'].getDependencies():
        if get_firing(dependency, alert_tags):
            ret = True
    if not is_within_threshold(alert_config, alert_tags):
        ret = False
    print("is_suppressed(", alert_config['id'], alert_tags, ") =", ret)
    return ret


def get_firing(id, alert_tags):
    firing = "{}\\{}".format(id, stringify_alert_tags(alert_tags))
    try:
        client = get_redis_client()
        status = client.get(firing)
        if status and "decode" in dir(status):
            status = status.decode()
        status = status if status else "ok"
        return status != "ok"
    except Exception as e:
        print(e)
        return False


def get_redis_client():
    try:
        return redis.Redis()
    except Exception:
        return MockRedis()


class MockRedis():
    """Fallback no-op client used when a real Redis connection is unavailable"""

    def __init__(self):
        return

    def get(self, key):
        return None

    def set(self, key, value):
        return

    def delete(self, key):
        return

    def keys(self, pattern="*"):
        return []
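

# The firing state the helpers below keep in Redis is a simple key scheme:
# "<alert_id>\<dc tag>" holds "bad" while that alert/tag combination is
# firing and the key is deleted once it clears. For example (hypothetical
# ids), set_is_firing would store "api_error_rate\dc1" = "bad", and
# get_firing("api_error_rate", alert_tags) reports True for as long as that
# key holds anything other than "ok".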
def set_firing(id, active_fires):
    prefix = "{}\\".format(id)
    previously_firing = list_firing(prefix)
    should_fire = []
    for active_fire in active_fires:
        alert_tags = active_fire[3]
        key = "{}{}".format(prefix, stringify_alert_tags(alert_tags))
        should_fire.append(key)
    for i in previously_firing:
        if i not in should_fire:
            set_not_firing(i)
    for i in should_fire:
        if i not in previously_firing:
            set_is_firing(i)


def list_firing(prefix):
    try:
        client = get_redis_client()
        resp = client.keys(prefix + "*")
        return [k.decode() if isinstance(k, bytes) else k for k in resp]
    except Exception:
        return []


def set_not_firing(id):
    client = get_redis_client()
    client.delete(id)


def set_is_firing(id):
    client = get_redis_client()
    client.set(id, "bad")


def is_within_threshold(alert_config, alert_tags):
    count = inc_suppressed(alert_config, alert_tags)
    threshold = (alert_config['suppressed_occurrences_threshold']
                 if 'suppressed_occurrences_threshold' in alert_config
                 else 9000000000000)
    return count < threshold


def inc_suppressed(alert_config, alert_tags):
    key = stringify_alert_tags(alert_tags)
    if 'suppressed_occurrences' not in alert_config:
        alert_config['suppressed_occurrences'] = {}
    if key not in alert_config['suppressed_occurrences']:
        alert_config['suppressed_occurrences'][key] = 0
    alert_config['suppressed_occurrences'][key] += 1
    return alert_config['suppressed_occurrences'][key]


def clear_suppressed(alert_config, all_alert_tags):
    for alert_tags in all_alert_tags:
        key = stringify_alert_tags(alert_tags)
        if 'suppressed_occurrences' not in alert_config:
            continue
        if key not in alert_config['suppressed_occurrences']:
            continue
        del alert_config['suppressed_occurrences'][key]


def check_prometheus_alert(
        alert_config, service_config, logger, production_mode=True):
    """ Runs the Prometheus check loop for a single alert config

    Args:
        alert_config (dict): Config of the alert to run
        service_config (dict): Holds things like urls, tokens and other things
        logger (log object): does the logging
        production_mode (bool): when False, log instead of sending alerts

    Returns:
        None
    """
    from library.prom_api import PromAPI

    # SLEEP A RANDOM TIME BETWEEN 0 AND INTERVAL SO THAT ALL ALERTS DON'T
    # START AT THE SAME TIME
    num_dependencies = len(
        alert_config['resolvedDependencies'].getDependencies())
    wait_time = random.randint(
        num_dependencies, alert_config['interval'] + num_dependencies)
    logger.info(
        "ALERT_CONFIG: {}\tsleep: {}".format(
            alert_config['id'], wait_time))
    sleep(wait_time)

    # For metrics with availability set to true, we default the interval to 5
    # mins due to Grafana limitations
    availability = bool(alert_config.get('availability'))

    # ====================
    # EACH CHECK JUST LOOPS
    # ====================
    ret = None
    while True:
        try:
            send_stat("check_run", 1, {'id': alert_config['id']})
            prom_api = PromAPI(endpoint=alert_config['prometheus_url'])
            ret = prom_api.query_range(
                query=alert_config['query'],
                start=alert_config['start_time'],
                end=alert_config['end_time'],
                duration=alert_config['interval'])
            assert ret['status'] == 'success'

            # GOT DATA BACK, NOW TO COMPARE IT TO THE THRESHOLD
            results = ret['data']['result']
            logger.debug(
                "Got back {} results for alert {}".format(
                    len(results), alert_config['id']))
            log_alert_results(results, alert_config, logger)
            alert_list = []
            # LOOP THROUGH ALL THE RESULTS
            for r in results:
                alert_tags = (get_alert_tags(alert_config, r)
                              if has_custom_alert_routing(alert_config)
                              else None)
                # REARRANGE RESULT TO MORE CLOSELY MATCH KAIROSDB RESULT
                r['tags'] = {key: [value]
                             for (key, value) in r['metric'].items()}
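                # For reference, a Prometheus range-query result entry looks
                # roughly like (illustrative values):
                #   {'metric': {'dc': 'dc1', 'job': 'api'},
                #    'values': [[1500000000, '0.2'], [1500000060, '0.4']]}
                # so the rearrangement above yields the KairosDB-style
                #   r['tags'] == {'dc': ['dc1'], 'job': ['api']}
                # and the string sample values are cast to float below before
                # the min/max comparison.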
                # OUR QUERY RETURNED SOME VALUES - FIND MIN AND MAX VALUES
                # THEREIN AND EXAMINE FOR FAILURE
                if r['values']:
                    # PROMETHEUS RETURNS SAMPLE VALUES AS STRINGS, SO CAST TO
                    # FLOAT BEFORE COMPARING
                    raw_values = [float(value) for _, value in r['values']]
                    min_value = min(raw_values)
                    max_value = max(raw_values)
                    # SEND VALUES TO BUILD_ALERT_MESSAGE, WHICH RETURNS NONE
                    # OR AN OBJECT
                    alert_list.append(
                        build_alert_message(
                            alert_config, min_value, max_value, r, logger,
                            availability, alert_tags=alert_tags))
                # THIS MEANS OUR QUERY RETURNED NOTHING. COULD BE NETWORK
                # ISSUES. WE WILL TOLERATE THIS FOR X OCCURRENCES. (X=10)
                # AFTER X OCCURRENCES OF NOT RETURNING DATA WE WILL CLEAR
                # AOM'S BRAIN FOR THIS ALERT ID AND TAG COMBINATION TO AVOID A
                # LATER OCCURRENCE CAUSING A PREMATURE ALERT. A NO-OP IF NO
                # HISTORY.
                elif 'alert_tags' in alert_config:
                    for key in alert_config['alert_tags']:
                        if ('count' not in key and 'noresult' not in key and
                                alert_config['alert_tags'][key] > 0):
                            key_noresult = key + "_noresult"
                            key_count = key + "_count"
                            if alert_config['alert_tags'][key_noresult] > 10:
                                logger.info(
                                    "{} occurrences of no results back for "
                                    "{}, clear out counts for tag '{}'".format(
                                        alert_config['alert_tags'][key_noresult],
                                        alert_config['id'], key))
                                alert_list.append(
                                    build_alert_message(
                                        alert_config, 0, 0, None, logger,
                                        availability, key,
                                        alert_tags=alert_tags))
                                alert_config['alert_tags'][key] = 0
                                alert_config['alert_tags'][key_count] = 0
                                alert_config['alert_tags'][key_noresult] = 0
                            else:
                                alert_config['alert_tags'][key_noresult] += 1
                                logger.info(
                                    "{} occurrences of no results back for "
                                    "{}, tag '{}'".format(
                                        alert_config['alert_tags'][key_noresult],
                                        alert_config['id'], key))

            # SEND ALL ALERTS FOUND TO THE ALERT HANDLERS THAT ARE NOT NONE
            alert_list = [x for x in alert_list if x is not None]
            set_firing(alert_config['id'], alert_list)
            clear_suppressed(alert_config,
                             [alert[3] for alert in alert_list])
            for alert in alert_list:
                if production_mode:
                    send_alerts(
                        alert,
                        copy.deepcopy(alert_config),
                        service_config['victorops_url'],
                        service_config['slack_url'],
                        service_config['slack_token'],
                        service_config['smtp_server'],
                        service_config['sensu_endpoint'],
                        service_config['uchiwa_url'],
                        logger)
                else:
                    logger.info(
                        "Sending alert {}".format(
                            alert_config.get('id')))

        # HANDLE THE UNEXPECTED
        except TimeoutError:
            logger.error(
                "Query [{}] took too long to run".format(
                    alert_config['id']))
        except AssertionError:
            logger.error(
                "Prometheus query failed:\n"
                "Status:\t{}\n"
                "Error Type:\t{}\n"
                "Error Message:\t{}\n"
                "Query:\n{}".format(
                    ret['status'], ret['errorType'], ret['error'],
                    alert_config['query']))
        except gaierror:
            logger.error(
                "Unable to connect to smtp server: {}".format(
                    service_config['smtp_server']))
        except Exception as e:
            logger.error(
                "Unhandled exception {} on alert: {}".format(
                    str(e), alert_config['id']))
        finally:
            sleep(alert_config['interval'])


# LOG ALERT RESULTS SO WE CAN DEBUG IF NEEDED
def log_alert_results(results, alert_config, logger):
    """ Logs the results broken out by tag provided in the alert_config to the
        logger for debugging

    Args:
        results: the results object returned from the call to kairosdb, of
            just the results
        alert_config: config object of the alert
        logger (log object): does the logging

    Returns:
        None, logs to logger
    """
    for v in results:
        logger.debug("{} - Result: {}".format(alert_config['id'], v))


def send_alerts(
        alert, alert_config, victorops_url, slack_url, slack_token,
        smtp_server, sensu_endpoint, uchiwa_url, logger):
    """ Sends out the alerts to VO, Email, and/or Slack

    Args:
        alert: the alert tuple:
            alert[0] == subject
            alert[1] == body
            alert[2] == tag alarm state (see build_alert_message)
            alert[3] == alert_tags
            alert[4] == md5sum
        alert_config: the alert configuration object
        victorops_url: url to victorops
        slack_url: url to slack api calls
        slack_token: the token for the alert
        smtp_server: The server to send mail messages to
        sensu_endpoint: url of the Sensu results endpoint
        uchiwa_url: url of the Uchiwa dashboard
        logger (log object): does the logging

    Returns:
        None
    """
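    # An illustrative example of the tuple produced by build_alert_message
    # (all values hypothetical):
    #   ("Metric: api_error_rate for dc1",                      # alert[0]
    #    "Metric: api_error_rate for dc1\n12.00 >= 10\n<url>",  # alert[1]
    #    3,                                    # alert[2] tag alarm state
    #    {"dc": "dc1"},                        # alert[3] tags, None if unused
    #    "1a2b3c4d5e")                         # alert[4] short md5 of the tag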
    # GOING TO USE THIS FOR TAGGING SOME METRICS ABOUT WHAT ALERT CHANNEL WAS
    # USED
    tag_dict = dict()
    tag_dict['alert'] = alert_config['id']

    is_custom_alert_routing = has_custom_alert_routing(alert_config)
    if is_custom_alert_routing:
        alert_routing = alert_config.get('alert_routing_lookup', {})
        alert_config['alerts'] = alert_routing.get(
            alert[3], alert_config['alerts']['lookup']['default'])

    # once we move all alerts into sensu, we don't need to do this
    if 'filters' in alert_config:
        logger.info(
            "alert_status : {}, alert_config: {}".format(
                alert[2], alert_config))
        if ('slack_subdue' in alert_config['filters'] and
                alert[2] in (1, 2) and
                alert_config['filters']['slack_subdue']):
            # unless the alert is critical we don't send it
            logger.info(
                "Removed slack, alert_config: {}".format(alert_config))
            alert_config['alerts'].pop('slack', None)
        if ('victorops_subdue' in alert_config['filters'] and
                alert[2] in (1, 2) and
                alert_config['filters']['victorops_subdue']):
            # unless the alert is critical we don't send it
            alert_config['alerts'].pop('vo', None)
            logger.info("Removed vo, alert_config: {}".format(alert_config))

    # ====================
    # VICTOROPS HANDLING
    # ====================
    if ('vo' in alert_config['alerts'] and
            not is_suppressed(alert_config, alert[3])):
        for notify in alert_config['alerts']['vo']:
            payload = dict(entity_id=alert[0],
                           message_type=alert_status[alert[2]],
                           state_message=alert[1])
            r = None
            try:
                r = requests.post(
                    victorops_url + notify,
                    data=json.dumps(payload),
                    headers={"Content-type": "application/json"})
                assert r.status_code == 200
                # Record a VO alert sent event
                tag_dict['alert_channel_type'] = "VictorOps"
                tag_dict['who'] = "vo:{}".format(notify)
                send_stat("alert_channel", 1, tag_dict)
                # logger.info("TestInfo: {} alert for {}".format(
                #     alert_status[alert[2]], alert[0]))
            except AssertionError:
                logger.error(
                    "Post to VO failed for {}\n{}:\t{}".format(
                        alert_config['id'], r.status_code, r.text))
            except Exception as e:
                logger.error("Unhandled exception for alert_id:{} "
                             "when posting to VO: {}".format(
                                 alert_config['id'], str(e)))

    # ====================
    # EMAIL HANDLING
    # ====================
    if 'email' in alert_config['alerts'] and (
            alert[2] == 0 or alert[2] == 1 or alert[2] == 3):
        msg = MIMEText(alert[1])
        msg['Subject'] = '{} Status: {}'.format(
            alert[0], alert_status[alert[2]])
        msg['From'] = 'aom@qualtrics.com'
        msg['To'] = ','.join(
            [x + "@qualtrics.com" for x in alert_config['alerts']['email']])
        try:
            s = smtplib.SMTP(smtp_server)
            s.send_message(msg)
            s.quit()
            # Record an Email alert sent event
            tag_dict['alert_channel_type'] = "Email"
            tag_dict['who'] = "email:{}".format(msg['To'])
            send_stat("alert_channel", 1, tag_dict)
            # logger.info("TestInfo: {} alert for {}".format(
            #     alert_status[alert[2]], alert[0]))
        except Exception as e:
            logger.error(
                "Unhandled exception when sending mail for {} to {}\n{}".format(
                    alert_config['id'], smtp_server, str(e)))

    # ====================
    # SENSU HANDLING
    # ====================
    if 'sensu' in alert_config['alerts']:
        # Dictionary with static values for Sensu
        sensu_dict = {
            'source': 'AOM',
            'refresh': 3600,
            'occurrences': 1,
            'name': alert_config['id'] + '__' + alert[4]}
        # if alert[3]:
        #     logger.info(alert)
        #     sensu_dict['name'] = '_'.join(
        #         [alert_config['id']] + sorted(list(alert[3])))
        if 'refresh' in alert_config:
            sensu_dict['refresh'] = alert_config['refresh']
        sensu_dict['interval'] = alert_config['interval']
        sensu_dict['handlers'] = []
        sensu_dict['dashboard'] = alert_config['url']
        if 'dependencies' in alert_config['alerts']['sensu'].keys():
            sensu_dict['dependencies'] = (
                alert_config['alerts']['sensu']['dependencies'])
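
        # The is_suppressed() check below skips paging when a dependency is
        # already firing: e.g. (hypothetical ids) if this alert lists
        # 'api_network' in resolvedDependencies and Redis currently marks
        # 'api_network\dc1' as firing for the same dc tag, the VictorOps
        # handler is not attached (subject to the
        # suppressed_occurrences_threshold cap in is_within_threshold).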
        if ('victorops' in alert_config['alerts']['sensu'].keys() and
                not is_suppressed(alert_config, alert[3])):
            sensu_dict['handlers'].append("victorops")
            sensu_dict['routing_key'] = (
                alert_config['alerts']['sensu']['victorops'])
        # # Leave this here until we have email support in Sensu
        # if 'email' in alert_config['alerts']['sensu'].keys():
        #     sensu_dict['handlers'].append("email")
        #     # verify this option
        #     sensu_dict['email'] = alert_config['alerts']['sensu']['email']
        if 'slack' in alert_config['alerts']['sensu'].keys():
            sensu_dict['handlers'].append("slack")
            sensu_dict['slack_channel'] = (
                alert_config['alerts']['sensu']['slack'])
            # Format alert message
            sensu_dict['dashboard'] = (
                "<{}|here> , Uchiwa: <{}?check={}|here> ".format(
                    alert_config['url'], uchiwa_url, alert_config['id']))
        if 'jira' in alert_config['alerts']['sensu'].keys():
            sensu_dict['handlers'].append("jira")
            sensu_dict.update(alert_config['alerts']['sensu']['jira'])
        if 'filters' in alert_config:
            sensu_dict['filters'] = alert_config['filters']

        # 0 = OK, 1 = WARNING, 2 = CRITICAL
        sensu_status = {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2}
        sensu_dict['status'] = sensu_status[alert[2]]
        sensu_dict['output'] = alert[1]
        r = None
        try:
            user = os.environ['API_USER']
            passwd = os.environ['API_PASS']
            r = requests.post(
                sensu_endpoint,
                json.dumps(sensu_dict),
                auth=(user, passwd))
            assert r.status_code == 202
        except AssertionError:
            logger.error(
                "Post to Sensu failed {}\n{}:\t{}".format(
                    alert_config['id'], r.status_code, r.text))
        except Exception as e:
            logger.error("Unhandled exception for alert_id:{} "
                         "when posting to Sensu: {}".format(
                             alert_config['id'], str(e)))

    # ====================
    # SLACK HANDLING - all Slack alerts will go through Sensu
    # ====================
    if 'slack' in alert_config['alerts'] and (
            alert[2] == 0 or alert[2] == 1 or alert[2] == 3):
        refresh = alert_config.get('refresh', 3600)
        dashboard = alert_config.get('url', '')
        sensu_status = {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2}
        sensu_dict2 = {'handlers': ['slack'],
                       'interval': alert_config['interval'],
                       'source': 'AOM',
                       'refresh': refresh,
                       'occurrences': 1,
                       'name': alert_config['id'] + '__' + alert[4],
                       'dashboard': dashboard,
                       'status': sensu_status[alert[2]],
                       'output': alert[1]}
        if is_custom_alert_routing:
            sensu_dict2['name'] = '_'.join(
                [alert_config['id']] + list(alert[3]))
        sensu_dict2['dashboard'] = (
            "<{}|here> , Uchiwa: <{}?check={}|here> ".format(
                alert_config['url'], uchiwa_url, alert_config['id']))
        for channel in alert_config['alerts']['slack']:
            sensu_dict2['slack_channel'] = channel
            r = None
            try:
                user = os.environ['API_USER']
                passwd = os.environ['API_PASS']
                r = requests.post(
                    sensu_endpoint,
                    json.dumps(sensu_dict2),
                    auth=(user, passwd))
                assert r.status_code == 202
            except AssertionError:
                logger.error(
                    "Post to Sensu failed {}\n{}:\t{}".format(
                        alert_config['id'], r.status_code, r.text))
            except Exception as e:
                logger.error("Unhandled exception for alert_id:{} when "
                             "posting to Sensu: {}".format(
                                 alert_config['id'], str(e)))
            # payload = dict(
            #     token=slack_token, channel=channel,
            #     text="{} Status: {}".format(alert[1],
            #                                 alert_status[alert[2]]))
            # r = None
            # try:
            #     r = requests.post(slack_url, data=payload)
            #     assert r.status_code == 200
            #     # Record a Slack alert sent event
            #     tag_dict['alert_channel_type'] = "Slack"
            #     tag_dict['who'] = "slack:{}".format(channel)
            #     send_stat("alert_channel", 1, tag_dict)
            #     # logger.info("TestInfo: {} alert for {}".format(
            #     #     alert_status[alert[2]], alert[0]))
            # except AssertionError:
            #     logger.error("Post to Slack failed for {}\n{}:\t{}".format(
            #         alert_config['id'], r.status_code, r.text))
            # except Exception as e:
            #     logger.error("Unhandled exception for alert_id:{} when "
            #                  "posting to Slack: {}".format(
            #                      alert_config['id'], str(e)))
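

# The helpers below push gauges through statsd/telegraf. As an illustrative
# example, send_metrics(alert, 2, result, 'CRITICAL') for an alert configured
# with tags=['dc'] ends up as a gauge named 'aom.CRITICAL' with value 2 and
# the tag set {'dc': 'dc1', 'alert': '<alert id>'}, the 'dc' value coming
# from the query result.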
def send_metrics(alert, value, result, gaugename='stats'):
    """ Sends the results from the alert check to statsd

    Args:
        alert: The Alert config object that holds the alert['tag'] value.
        value: The value we want to send as a gauge.
        result: The result object from making the call. Use the data in this
            object to tag the metric.
        gaugename: The name of the gauge metric we send.

    Returns:
        None
    """
    # GROUP ALL THE ALERTS TOGETHER SO THAT PEEPS CAN FILTER OUT BY TAG THEIR
    # SPECIFIC ALERTS
    result_tags = list(itertools.chain(
        *[result['tags'][x] for x in alert['tags']]))
    tag_dict = dict()
    for x in range(len(alert['tags'])):
        tag_dict[alert['tags'][x]] = result_tags[x]
    tag_dict['alert'] = alert['id']
    # SEND THE METRIC
    send_stat(gaugename, value, tag_dict)


def send_stat(gaugename, value, tag_dict, statprefix='aom'):
    """Sends stats value to statsd"""
    client = StatsClient('telegraf', 8125, statprefix)
    # SUBMIT STATS
    client.gauge(gaugename, value, tags=tag_dict)


def has_custom_alert_routing(alert_config):
    """Checks if alert has custom routing"""
    return 'lookup' in alert_config['alerts']


def get_alert_tags(alert_config, query_result):
    """Retrieves custom tags from alert"""
    query_tags = {}
    for tag in alert_config['alerts']['lookup']['tags']:
        if (alert_config.get('query_type') == 'prometheus' and
                'metric' in query_result and
                tag in query_result['metric']):
            query_tags[tag] = query_result['metric'][tag]
        elif ('tags' in query_result and
              tag in query_result['tags'] and
              query_result['tags'][tag]):
            query_tags[tag] = query_result['tags'][tag][0]
    return query_tags


class SuppressedException(Exception):
    pass