QVolution2019.2/sleeper_agents_aom_engine/serviceapp/service.py

""" Alert On Metrics functions"""

import copy
import itertools
import json
import os
import random
import smtplib
from email.mime.text import MIMEText
from socket import gaierror
from time import sleep
from hashlib import md5
import requests
from statsd import StatsClient
import redis

alert_status = [
    'RECOVERY',
    'WARNING',
    'WARNING',
    'CRITICAL',
    'CRITICAL',
    'CRITICAL']

def build_alert_message(alert, minvalue, maxvalue, result, logger,
                        availability, tag=None, alert_tags=None):
    """
    Build the alert message
    Args:
        alert: the alert object that includes a tag definition
        minvalue: the min value to test against the threshold
        maxvalue: the max value to test against the threshold
        result: the response back from kairosdb
        logger (log object): does the logging
        availability: Send availability stat 1
        tag: If passed in will use this value for the tag instead of
        getting it from the result object
        alert_tags: the tags corresponding to the result, used if an
        alert has to be triggered and a custom routing per tag is configured
    Returns:
        Alert message string
    """
    # DEFAULT TO MAX VALUE AS THE VALUE WE WILL ALERT ON. LOGIC BELOW
    # MAY CHANGE THIS.
    value = maxvalue
    # HANDLE THE CASE WHERE SOMEONE HAS NOT SPECIFIED ANY TAGS IN THEIR QUERY
    # (USUALLY A GLOBAL ALL-DC QUERY)
    if tag is None and result is not None:
        tag = ', '.join(sorted(list(itertools.chain(
            *[result['tags'][x] for x in alert['tags']]))))
    tag_count = tag + "_count"
    # WE WILL USE THIS ONE LATER FOR TRACKING OCCURRENCES OF KAIROSDB NOT
    # RETURNING RESULTS
    tag_noresult = tag + "_noresult"
    if not tag:
        tag = 'instance'
        logger.debug("No tag specified for alert {}".format(alert['id']))
    # INSTEAD OF TRYING TO HANDLE LOGIC WHERE THESE ARE NOT IN THE OBJECT, PUT
    # THEM IN AS SOON AS THEY ARE CREATED SO THAT ON FIRST RUN AN ALERT HAS ALL
    # THE ALERT['alert_tags'][TAG] AND ALERT['alert_tags'][TAG_COUNT] NEEDED
    if 'alert_tags' not in alert:
        alert['alert_tags'] = {}
    if tag not in alert['alert_tags']:
        alert['alert_tags'][tag] = 0
    if tag_count not in alert['alert_tags']:
        alert['alert_tags'][tag_count] = 0
    # IF WE HIT THIS FUNCTION THEN WE ALWAYS SET (OR RESET) THIS NORESULT
    # COUNTER TO 0 IE. IF WE ARE HERE IT IMPLIES WE HAVE A RESULT FROM
    # KAIROSDB OR WE ARE AT THE END OF A LONG PERIOD OF NORESULTS WHERE WE ARE
    # CLEARING EVERYTHING OUT ANYWAY
    alert['alert_tags'][tag_noresult] = 0

    # FIRST FIND OUT WHAT THRESHOLDS ARE SET AND HAVE BEEN BREACHED
    upper_critical_threshold = None
    upper_warning_threshold = None
    lower_warning_threshold = None
    lower_critical_threshold = None
    upper_threshold = None
    lower_threshold = None
    is_warning_alarm = False
    is_critical_alarm = False

    # UPPER
    upper_threshold_exists = False
    upper_warning_threshold_breached = False
    upper_critical_threshold_breached = False
    if 'warning_upper_threshold' in alert:
        upper_threshold_exists = True
        upper_warning_threshold = alert['warning_upper_threshold']
        upper_threshold = upper_warning_threshold
        if maxvalue >= upper_warning_threshold:
            upper_warning_threshold_breached = True
            is_warning_alarm = True
    if 'critical_upper_threshold' in alert:
        upper_critical_threshold = alert['critical_upper_threshold']
        if not upper_threshold_exists:
            upper_threshold = upper_critical_threshold
        upper_threshold_exists = True
        # IF CONFIG HAS A CRITICAL THRESHOLD SET AND WE PASS THAT THEN THAT IS
        # OUR THRESHOLD FOR ALERTING
        if maxvalue >= alert['critical_upper_threshold']:
            upper_threshold = upper_critical_threshold
            upper_critical_threshold_breached = True
            is_critical_alarm = True
    upper_threshold_breached = (upper_warning_threshold_breached
                                or upper_critical_threshold_breached)

    # LOWER
    lower_threshold_exists = False
    lower_warning_threshold_breached = False
    lower_critical_threshold_breached = False
    if 'warning_lower_threshold' in alert:
        lower_threshold_exists = True
        lower_warning_threshold = alert['warning_lower_threshold']
        lower_threshold = lower_warning_threshold
        if minvalue <= lower_warning_threshold:
            lower_warning_threshold_breached = True
            is_warning_alarm = True
    if 'critical_lower_threshold' in alert:
        lower_critical_threshold = alert['critical_lower_threshold']
        if not lower_threshold_exists:
            lower_threshold = lower_critical_threshold
        lower_threshold_exists = True
        # IF CONFIG HAS A CRITICAL THRESHOLD SET AND WE PASS THAT THEN THAT AS
        # OUR THRESHOLD FOR ALERTING
        if minvalue <= lower_critical_threshold:
            lower_threshold = lower_critical_threshold
            lower_critical_threshold_breached = True
            is_critical_alarm = True
    lower_threshold_breached = (lower_warning_threshold_breached or
                                lower_critical_threshold_breached)

    # THIS HAS TO MEAN THERE IS A PROBLEM WITH THE ALERT CONFIG
    if lower_threshold is None and upper_threshold is None:
        logger.debug(
            "ERROR: alert {} does not have any thresholds set on {}".format(
                alert['id'], tag))

    # ON TO OCCURRENCES
    if 'occurrences_threshold' in alert:
        occurrences_threshold = alert['occurrences_threshold']
    else:
        occurrences_threshold = 1

    alert_entity = "Metric: {} for {}".format(alert['id'], tag)

    if 'url' not in alert:
        alert['url'] = os.environ['AOM_GRAFANA_URL'] + str(alert['id'])

    # ====================
    # PREPARE ALERT BODY STRING AND SET THE VALUE WE WILL USE TO ALERT WITH
    # ====================
    alert_body = ''
    if upper_threshold_breached:
        alert_body = "{}\n{:.2f} >= {}\n{}".format(
            alert_entity, value, upper_threshold, alert['url'])
    if lower_threshold_breached:
        value = minvalue
        alert_body = "{}\n{:.2f} <= {}\n{}".format(
            alert_entity, value, lower_threshold, alert['url'])

    # SEND SOME STATS OUT AT THIS POINT AS WE KNOW WHERE WE ARE NOW. SEND THE
    # THRESHOLDS TOO SO THEY CAN BE GRAPHED
    if result is not None:
        send_metrics(alert, value, result)
        if 'critical_upper_threshold' in alert:
            send_stat('upper_critical_threshold', upper_critical_threshold,
                      {'id': alert['id']})
        if 'warning_upper_threshold' in alert:
            send_stat('upper_warning_threshold', upper_warning_threshold,
                      {'id': alert['id']})
        if 'critical_lower_threshold' in alert:
            send_stat('lower_critical_threshold', lower_critical_threshold,
                      {'id': alert['id']})
        if 'warning_lower_threshold' in alert:
            send_stat('lower_warning_threshold', lower_warning_threshold,
                      {'id': alert['id']})

    # NO RESULT OVERRIDES ALL
    if result is None:
        lower_threshold_breached = False
        upper_threshold_breached = False

    # ====================
    # APPLY OUR LOGIC TO MAKE SOME DECISIONS
    # ====================
    current_alert_status = alert_status[0]
    if not lower_threshold_breached and not upper_threshold_breached:
        if result is not None:
            if lower_threshold_exists and not upper_threshold_exists:
                alert_body = "{}\n{:.2f} > {}\n{}".format(
                    alert_entity, value, lower_threshold, alert['url'])
                logger.debug("GOOD: alert {} is higher than lower threshold {}"
                             "for value {} on tag {}".format(
                                 alert['id'], lower_threshold, value, tag))
            if upper_threshold_exists and not lower_threshold_exists:
                alert_body = "{}\n{:.2f} < {}\n{}".format(
                    alert_entity, value, upper_threshold, alert['url'])
                logger.debug("GOOD: alert {} is below the upper threshold {} "
                             "for value {} on tag {}".format(
                                 alert['id'], upper_threshold, value, tag))
            if upper_threshold_exists and lower_threshold_exists:
                alert_body = "{}\n{} < {:.2f} < {}\n{}".format(
                    alert_entity, lower_threshold, value, upper_threshold,
                    alert['url'])
                logger.debug("GOOD: alert {} is between thresholds {} and {} "
                             "for value {} on tag {}".format(
                                 alert['id'], upper_threshold, lower_threshold,
                                 value, tag))
        # CHECK AND SEE IF TAG LOGIC IS SET, IE. WE WERE PREVIOUSLY IN ALARM
        # STATE
        if alert['alert_tags'][tag] > 0:
            if result is not None:
                send_metrics(alert, 1, result, current_alert_status)
            logger.info(
                "TestInfo: RECOVERY: Clearing values for [{}] - {}".format(
                    alert['id'], tag))
            if result is None:
                alert_body = ("{} RECOVERY due to no results found from "
                              "KairosDB query. Recommend you manually validate"
                              "recovery.\n{}").format(
                                  alert_entity, alert['url'])
            alert['alert_tags'][tag] = 0
            alert['alert_tags'][tag_count] = 0
            if availability:
                logger.info("Sending availability stat 1")
                send_metrics(alert, 1, result, 'service_level')
        else:
            # WE RETURN NONE IF NO ALERT (EITHER RECOVERY OR WARNING OR
            # CRITICAL) NEEDS TO BE FIRED
            alert['alert_tags'][tag_count] = 0
            if availability:
                logger.info("Sending availability stat 1")
                send_metrics(alert, 1, result, 'service_level')
            return None
    else:
        # ====================
        # SET KEY / VALUE FOR TAG ON ALERT
        # 0 == No Alert
        # 1 == Warning
        # 2 == Existing Warning Alert
        # 3 == New Critical
        # 4+ == Existing Critical Alert
        # ====================
        # CHECK IF TAG_COUNT HAS BEEN SET, IF NOT SET IT, IF SO INCREMENT IT
        alert['alert_tags'][tag_count] += 1

        # ALERT WONT FIRE UNLESS THE TAG_COUNT IS MORE THAN THE OCCURRENCES,
        # THAT BEING EITHER 1 OR WHATEVER WAS SET ALERT HAS EXCEEDED
        # OCCURRENCES SO RETURN IT
        if alert['alert_tags'][tag_count] >= occurrences_threshold:
            # >= 4 MEANS THIS IS A KNOWN CRITICAL, SO NO-OP
            if alert['alert_tags'][tag] < 4:
                if is_warning_alarm and not is_critical_alarm:
                    # THIS HANDLES GOING STRAIGHT FROM NORMAL TO WARNING LEVEL
                    if alert['alert_tags'][tag] == 0:
                        # NEW WARNING
                        alert['alert_tags'][tag] = 1
                        logger.info("TestInfo: WARNING (NEW): {} - {}".format(
                            alert['id'], tag))
                    else:
                        # EXISTING WARNING
                        alert['alert_tags'][tag] = 2
                        logger.info("TestInfo: WARNING (EXISTING): {} - {}".format(
                            alert['id'], tag))
                if is_critical_alarm:
                    # THIS HANDLES GOING FROM WARNING LEVEL TO CRITICAL LEVEL
                    if (alert['alert_tags'][tag] == 1 or
                            alert['alert_tags'][tag] == 2):
                        alert['alert_tags'][tag] = 3
                        logger.info("TestInfo: CRITICAL (WAS WARNING): {} - {}".format(
                            alert['id'], tag))
                    else:
                        # THIS HANDLES GOING STRAIGHT FROM NORMAL TO CRITICAL
                        # LEVEL
                        if alert['alert_tags'][tag] < 3:
                            # NEW CRITICAL
                            alert['alert_tags'][tag] = 3
                            logger.info("TestInfo: CRITICAL (NEW): {} - {}".format(
                                alert['id'], tag))
                        else:
                            # EXISTING CRITICAL
                            alert['alert_tags'][tag] = 4
                            logger.info("TestInfo: CRITICAL (EXISTING): {} - {}".format(
                                alert['id'], tag))
            # RECORD THE FACT THAT SOMETHING IS STILL IN ALARM STATE IN METRICS
            # EVEN IF NOT ACTIVELY ALERTING ON IT
            if is_critical_alarm:
                current_alert_status = alert_status[3]
                send_metrics(alert, 2, result, current_alert_status)
                if availability:
                    logger.info("Sending availability stat 0")
                    send_metrics(alert, 0, result, 'service_level')
            if is_warning_alarm and not is_critical_alarm:
                current_alert_status = alert_status[1]
                send_metrics(alert, 1, result, current_alert_status)
                if availability:
                    logger.info("Sending availability stat 1")
                    send_metrics(alert, 1, result, 'service_level')
            logger.debug("{} alert for value {} of {} for tag {} has occurred "
                         "{} times. Threshold is >= {} times.".format(
                             current_alert_status,
                             value,
                             alert['id'],
                             tag,
                             alert['alert_tags'][tag_count],
                             occurrences_threshold))
        else:
            # WE RETURN NONE IF NO ALERT (EITHER RECOVERY OR WARNING OR
            # CRITICAL) NEEDS TO BE FIRED
            logger.debug("Value {} of {} for tag {} has occurred {} time(s) < "
                         "threshold of {}".format(
                             value,
                             alert['id'],
                             tag,
                             alert['alert_tags'][tag_count],
                             occurrences_threshold))
            if availability:
                logger.info("Sending availability stat")
                send_metrics(alert, 1, result, 'service_level')
            return None

    logger.debug(
        "Alert {}->[{}]->{}, Occurrences={}".format(
            alert['id'], tag, current_alert_status,
            alert['alert_tags'][tag_count]))
    return alert_entity, alert_body, alert['alert_tags'][tag], alert_tags, md5(tag.encode('utf-8')).hexdigest()[:10]


def check_kairosdb_alert(
        alert_config,
        service_config,
        logger,
        production_mode=True):
    from library.prom_api import PromAPI
    """
    Args:
        alert_config (dict): Config of the alert to run
        service_config (dict): Holds things like urls, tokens and other things
        logger (log object): does the logging
    Returns:
        None
    """
    availability = False
    # SLEEP A RANDOM TIME BETWEEN 0 AND INTERVAL SO THAT ALL ALERTS DON'T
    # START AT THE SAME TIME
    wait_time = random.randint(0, alert_config['interval'])
    logger.info(
        "ALERT_CONFIG: {}\tsleep: {}".format(
            alert_config['id'],
            wait_time))
    sleep(wait_time)
    # For metrics with availability set to true, we default the interval to 5
    # mins due Grafana limitations
    if 'availability' in alert_config and alert_config['availability']:
        availability = True
    # ====================
    # EACH CHECK JUST LOOPS
    # ====================
    ret = None
    while True:
        try:
            send_stat("check_run", 1, {'id': alert_config['id']})
            # BUILD URL FOR KAIROSDB METRICS AND QUERY FOR RESULTS
            query_url = os.path.join(
                service_config['kairosdb_url'] +
                "api/v1/datapoints/query")
            ret = requests.post(
                query_url,
                data=json.dumps(
                    alert_config['query']),
                timeout=service_config['timeout'])
            assert ret.status_code == 200

            # GOT DATA BACK, NOW TO COMPARE IT TO THE THRESHOLD
            results = ret.json()['queries'][0]['results']
            logger.debug(
                "Got back {} results for alert {}".format(
                    len(results), alert_config['id']))
            log_alert_results(results, alert_config, logger)
            alert_list = []

            # LOOP THROUGH ALL THE RESULTS
            for r in results:
                alert_tags = (get_alert_tags(alert_config, r)
                              if has_custom_alert_routing(alert_config) else None)

                # OUR QUERY RETURNED SOME VALUES - FIND MIN AND MAX VALUES
                # THEREIN AND EXAMINE FOR FAILURE
                if r['values']:
                    minvalue = min([x[1] for x in r['values']])
                    maxvalue = max([x[1] for x in r['values']])
                    # SEND VALUES TO BUILD_ALERT_MESSAGE, WHICH RETURNS NONE OR
                    # AN OBJECT
                    alert_list.append(
                        build_alert_message(
                            alert_config,
                            minvalue,
                            maxvalue,
                            r,
                            logger,
                            availability,
                            alert_tags=alert_tags))

                # THIS MEANS OUR KAIROS QUERY RETURNED NOTHING. COULD BE NETWORK
                # ISSUES. WE WILL TOLERATE THIS FOR X OCCURRENCES. (X=10)
                # AFTER X OCCURRENCES OF KAIROS NOT RETURNING DATA WE WILL CLEAR
                # AOM'S BRAIN FOR THIS ALERT ID AND TAG COMBINATION TO AVOID A
                # LATER OCCURRENCE CAUSING A PREMATURE ALERT.
                # A NO-OP IF NO HISTORY.
                elif 'alert_tags' in alert_config:
                    for key in alert_config['alert_tags']:
                        if ('count' not in key and 'noresult' not in key and
                                alert_config['alert_tags'][key] > 0):
                            key_noresult = key + "_noresult"
                            key_count = key + "_count"
                            if alert_config['alert_tags'][key_noresult] > 10:
                                logger.info("{} occurrences of no results back "
                                            "for {}, clear out counts for tag '{}'".format(
                                                alert_config['alert_tags'][key_noresult],
                                                alert_config['id'], key))
                                alert_list.append(
                                    build_alert_message(
                                        alert_config,
                                        0,
                                        0,
                                        None,
                                        logger,
                                        availability,
                                        key,
                                        alert_tags=alert_tags))
                                alert_config['alert_tags'][key] = 0
                                alert_config['alert_tags'][key_count] = 0
                                alert_config['alert_tags'][key_noresult] = 0
                            else:
                                alert_config['alert_tags'][key_noresult] += 1
                                logger.info("{} occurrences of no results back "
                                            "for {}, tag '{}'".format(
                                                alert_config['alert_tags'][key_noresult],
                                                alert_config['id'], key))

            # SEND ALL ALERTS FOUND TO THE ALERT HANDLERS THAT ARE NOT NONE
            alert_list = [x for x in alert_list if not x is None]
            set_firing(alert_config['id'], alert_list)
            clear_suppressed(alert_config, [alert[3] for alert in alert_list])
            for alert in alert_list :
                if production_mode or True:
                    send_alerts(
                        alert,
                        copy.deepcopy(alert_config),
                        service_config['victorops_url'],
                        service_config['slack_url'],
                        service_config['slack_token'],
                        service_config['smtp_server'],
                        service_config['sensu_endpoint'],
                        service_config['uchiwa_url'],
                        logger)
                else:
                    logger.info(
                        "Sending alert for: {}".format(
                            alert_config.get('id')))

        # HANDLE THE UNEXPECTED
        except TimeoutError:
            logger.error("Query [{}] took to long to run".format(
                alert_config['id']))
        except AssertionError:
            logger.error(
                "KairsoDB query failed: {}\n"
                "HTTP status code:\t{}\n"
                "Error Message:\t{}\nQuery:\n"
                "{}".format(
                    ret.url,
                    ret.status_code,
                    ret.text,
                    alert_config['query']))
        except gaierror:
            logger.error(
                "Unable to connect to smtp server: {}".format(
                    service_config['smtp_server']))
        except SuppressedException as e :
            logger.warn(
               "Skipping alert check {} as it's suppressed: {}".format(
                  alert_config['id'],
                  e
               ))
        except Exception as e:
            logger.error(
                "Unhandled exception {} on alert: {}".format(
                    str(e), alert_config['id']))
        finally:
            sleep(alert_config['interval'])

def stringify_alert_tags(alert_tags) :
   if not alert_tags :
      return "-"
   for i in alert_tags :
      if i.lower() == "dc" or i.lower() == "datacenter" :
         return str(alert_tags[i])
   return "-"

def is_suppressed(
   alert_config,
   alert_tags) :
   ret = False
   for dependency in alert_config['resolvedDependencies'].getDependencies() :
      if get_firing(dependency, alert_tags) :
         ret = True
   if not is_within_threshold(alert_config, alert_tags) :
      ret = False
   print("is_suppressed(", alert_config['id'], alert_tags, ") =", ret)
   return ret

def get_firing(id, alert_tags) :
   firing = "{}\\{}".format(id, stringify_alert_tags(alert_tags))
   try :
      redis = get_redis_client()
      status = redis.get(firing)
      if status and "decode" in dir(status) :
         status = status.decode()
      status = status if status else "ok"
      return status != "ok"
   except Exception as e :
      print(e)
      return False

def get_redis_client() :
   try :
      redis = redis.Redis()
      return redis
   except Exception :
      return MockRedis()

class MockRedis():
   def __init__(self) :
      return
   def get(self, key) :
      return None
   def set(self, key, value) :
      return
   def delete(self, key) :
      return
   def call(self, *args, **kwargs) :
      return []

def set_firing(id, active_fires) :
   prefix = "{}\\".format(id)
   previously_firing = list_firing(prefix)
   should_fire = []
   for active_fire in active_fires :
      alert_tags = active_fire[3]
      key = "{}{}".format(prefix, stringify_alert_tags(alert_tags))
      should_fire.append(key)
   for i in previously_firing :
      if not i in should_fire :
         set_not_firing(i)
   for i in should_fire :
      if not i in previously_firing :
         set_is_firing(i)

def list_firing(prefix) :
   try :
      redis = get_redis_client()
      resp = redis.call("KEYS", prefix+"*")
      return resp
   except Exception as e :
      return []

def set_not_firing(id) :
   redis = get_redis_client()
   redis.delete(id)

def set_is_firing(id) :
   redis = get_redis_client()
   redis.set(id, "bad")

def is_within_threshold(alert_config, alert_tags) :
   count = inc_suppressed(alert_config, alert_tags)
   threshold = alert_config['suppressed_occurrences_threshold'] if 'suppressed_occurrences_threshold' in alert_config else 9000000000000
   return count < threshold

def inc_suppressed(alert_config, alert_tags) :
   key = stringify_alert_tags(alert_tags)
   if not 'suppressed_occurrences' in alert_config :
      alert_config['suppressed_occurrences'] = {}
   if not key in alert_config['suppressed_occurrences'] :
      alert_config['suppressed_occurrences'][key] = 0
   alert_config['suppressed_occurrences'][key] += 1
   return alert_config['suppressed_occurrences'][key]

def clear_suppressed(alert_config, all_alert_tags) :
   for alert_tags in all_alert_tags:
      key = stringify_alert_tags(alert_tags)
      if not 'suppressed_occurrences' in alert_config :
         continue
      if not key in alert_config['suppressed_occurrences'] :
         continue
      del(alert_config['suppressed_occurrences'][key])

def check_prometheus_alert(
        alert_config,
        service_config,
        logger,
        production_mode=True):
    from library.prom_api import PromAPI
    """
    Args:
        alert_config (dict): Config of the alert to run
        service_config (dict): Holds things like urls, tokens and other things
        logger (log object): does the logging
    Returns:
        None
    """
    # SLEEP A RANDOM TIME BETWEEN 0 AND INTERVAL SO THAT ALL ALERTS DON'T
    # START AT THE SAME TIME
    num_dependencies = len(alert_config['resolvedDependencies'].getDependencies())
    wait_time = random.randint(num_dependencies, alert_config['interval'] + num_dependencies)
    logger.info(
        "ALERT_CONFIG: {}\tsleep: {}".format(
            alert_config['id'],
            wait_time))
    sleep(wait_time)
    # For metrics with availability set to true, we default the interval to 5
    # mins due to Grafana limitations
    availability = bool(alert_config.get('availability'))

    # ====================
    # EACH CHECK JUST LOOPS
    # ====================
    ret = None
    while True:
        try:
            send_stat("check_run", 1, {'id': alert_config['id']})
            prom_api = PromAPI(endpoint=alert_config['prometheus_url'])
            ret = prom_api.query_range(
                query=alert_config['query'],
                start=alert_config['start_time'],
                end=alert_config['end_time'],
                duration=alert_config['interval'])

            assert ret['status'] == 'success'

            # GOT DATA BACK, NOW TO COMPARE IT TO THE THRESHOLD
            results = ret['data']['result']
            logger.debug(
                "Got back {} results for alert {}".format(
                    len(results), alert_config['id']))
            log_alert_results(results, alert_config, logger)
            alert_list = []

            # LOOP THROUGH ALL THE RESULTS
            for r in results:
                alert_tags = (get_alert_tags(alert_config, r) if
                              has_custom_alert_routing(alert_config) else None)

                # REARRANGE RESULT TO MORE CLOSELY MATCH KAIROSDB RESULT
                r['tags'] = {key: [value]
                             for (key, value) in r['metric'].items()}

                # OUR QUERY RETURNED SOME VALUES - FIND MIN AND MAX VALUES
                # THEREIN AND EXAMINE FOR FAILURE
                if r['values']:
                    raw_values = [value for _, value in r['values']]
                    min_value = float(min(raw_values))
                    max_value = float(max(raw_values))
                    # SEND VALUES TO BUILD_ALERT_MESSAGE, WHICH RETURNS NONE OR
                    # AN OBJECT
                    alert_list.append(
                        build_alert_message(
                            alert_config,
                            min_value,
                            max_value,
                            r,
                            logger,
                            availability,
                            alert_tags=alert_tags))

                # THIS MEANS OUR QUERY RETURNED NOTHING. COULD BE NETWORK ISSUES
                # WE WILL TOLERATE THIS FOR X OCCURRENCES. (X=10)
                # AFTER X OCCURRENCES OF NOT RETURNING DATA WE WILL CLEAR AOM'S
                # BRAIN FOR THIS ALERT ID AND TAG COMBINATION TO AVOID A LATER
                # OCCURRENCE CAUSING A PREMATURE ALERT. A NO-OP IF NO HISTORY.
                elif 'alert_tags' in alert_config:
                    for key in alert_config['alert_tags']:
                        if ('count' not in key and 'noresult' not in key and
                                alert_config['alert_tags'][key] > 0):
                            key_noresult = key + "_noresult"
                            key_count = key + "_count"
                            if alert_config['alert_tags'][key_noresult] > 10:
                                logger.info("{} occurrences of no results back "
                                            "for {}, clear out counts for tag '{}'".format(
                                                alert_config['alert_tags'][key_noresult],
                                                alert_config['id'], key))
                                alert_list.append(
                                    build_alert_message(
                                        alert_config,
                                        0,
                                        0,
                                        None,
                                        logger,
                                        availability,
                                        key,
                                        alert_tags=alert_tags))
                                alert_config['alert_tags'][key] = 0
                                alert_config['alert_tags'][key_count] = 0
                                alert_config['alert_tags'][key_noresult] = 0
                            else:
                                alert_config['alert_tags'][key_noresult] += 1
                                logger.info("{} occurrences of no results back "
                                            "for {}, tag '{}'".format(
                                                alert_config['alert_tags'][key_noresult],
                                                alert_config['id'], key))

            # SEND ALL ALERTS FOUND TO THE ALERT HANDLERS THAT ARE NOT NONE
            alert_list = [x for x in alert_list if not x is None]
            set_firing(alert_config['id'], alert_list)
            clear_suppressed(alert_config, [alert[3] for alert in alert_list])
            for alert in alert_list :
                if production_mode or True:
                    send_alerts(
                        alert,
                        copy.deepcopy(alert_config),
                        service_config['victorops_url'],
                        service_config['slack_url'],
                        service_config['slack_token'],
                        service_config['smtp_server'],
                        service_config['sensu_endpoint'],
                        service_config['uchiwa_url'],
                        logger)
                else:
                    logger.info(
                        "Sending alert {}".format(
                            alert_config.get('id')))

        # HANDLE THE UNEXPECTED
        except TimeoutError:
            logger.error(
                "Query [{}] took to long to run".format(
                    alert_config['id']))
        except AssertionError:
            logger.error(
                "Prometheus query failed:\n"
                "Status:\t{}\n"
                "Error Type:\t{}\n"
                "Error Message:\t{}\n"
                "Query:\n{}".format(
                    ret['status'],
                    ret['errorType'],
                    ret['error'],
                    alert_config['query']))
        except gaierror:
            logger.error(
                "Unable to connect to smtp server: {}".format(
                    service_config['smtp_server']))
        except Exception as e:
            logger.error(
                "Unhandled exception {} on alert: {}".format(
                    str(e), alert_config['id']))
        finally:
            sleep(alert_config['interval'])


# LOG ALERT RESULTS SO WE CAN DEBUG IF NEEDED
def log_alert_results(results, alert_config, logger):
    """
    Logs the results broken out by tag provided in the alert_config to the
    logger for debuging
    Args:
        results: the results object returned from the call to kairosdb, of just
        the results
        alert_config: config object of the alert
        logger (log object): does the logging
    Returns:
        None, logs to logger
    """

    for v in results:
        logger.debug("{} - Result: {}".format(alert_config['id'], v))


def send_alerts(
        alert,
        alert_config,
        victorops_url,
        slack_url,
        slack_token,
        smtp_server,
        sensu_endpoint,
        uchiwa_url,
        logger):
    """
    Sends out the alerts to VO, Email, and/or Slack
    Args:
        alert: the alert tuple:
            alert[0] == subject alert[1] == body alert[3] == alert_tags alert[4] == md5sum
        alert_config: the alert configuration object
        victorops_url: url to victorops
        slack_url: url to slack api calls
        slack_token: the token for the alert
        smtp_server: The server to send mail messages too
        sensu_endpoint:
        uchiwa_url:
        logger (log object): does the logging
    Returns: None
    """
    # GOING TO USE THIS FOR TAGGING SOME METRICS ABOUT WHAT ALERT CHANNEL WAS
    # USED
    tag_dict = dict()
    tag_dict['alert'] = alert_config['id']

    is_custom_alert_routing = has_custom_alert_routing(alert_config)
    if is_custom_alert_routing:
        alert_routing = alert_config.get('alert_routing_lookup', {})
        alert_config['alerts'] = alert_routing.get(
            alert[3], alert_config['alerts']['lookup']['default'])

    # once we move all alerts into sensu, we dont need to tho this
    if 'filters' in alert_config:
        logger.info(
            "alert_status : {}, alert_config: {}".format(
                alert[2], alert_config))
        if 'slack_subdue' in alert_config['filters'] and alert[2] in (
                1, 2) and alert_config['filters']['slack_subdue']:
            # unless the alert is critical we dont send it
            logger.info("Removed slack, alert_config: {}".format(alert_config))
            alert_config['alerts'].pop('slack', None)
        if ('victorops_subdue' in alert_config['filters'] and
                alert[2] in (1, 2) and
                alert_config['filters']['victorops_subdue']):
            # unless the alert is critical we dont send it
            alert_config['alerts'].pop('vo', None)
            logger.info("Removed vo, alert_config: {}".format(alert_config))

    # ====================
    # VICTOROPS HANDLING
    # ====================
    if 'vo' in alert_config['alerts'] and not is_suppressed(alert_config, alert[3]) :
        for notify in alert_config['alerts']['vo']:
            payload = dict(entity_id=alert[0],
                           message_type=alert_status[alert[2]],
                           state_message=alert[1])
            r = None
            try:
                r = requests.post(
                    victorops_url + notify,
                    data=json.dumps(payload),
                    headers={
                        "Content-type": "application-json"})
                assert r.status_code == 200
                # Record a VO alert sent event
                tag_dict['alert_channel_type'] = "VictorOps"
                tag_dict['who'] = "vo:{}".format(notify)
                send_stat("alert_channel", 1, tag_dict)
                # logger.info("TestInfo: {} alert for {}".format(alert_status(alert[2]), alert[0]))
            except AssertionError:
                logger.error(
                    "Post to VO failed for {}\n{}:\t{}".format(
                        alert_config['id'], r.status_code, r.text))
            except Exception as e:
                logger.error("Unhandled exception for alert_id:{} "
                             "when posting to VO: {}".format(
                                 alert_config['id'], str(e)))

    # ====================
    # EMAIL HANDLING
    # ====================
    if 'email' in alert_config['alerts'] and (
            alert[2] == 0 or alert[2] == 1 or alert[2] == 3):
        msg = MIMEText(alert[1])
        msg['Subject'] = '{} Status: {}'.format(
            alert[0], alert_status[alert[2]])
        msg['From'] = 'aom@qualtrics.com'
        msg['To'] = ','.join(
            [x + "@qualtrics.com" for x in alert_config['alerts']['email']])
        try:
            s = smtplib.SMTP(smtp_server)
            s.send_message(msg)
            s.quit()
            # Record an Email alert sent event
            tag_dict['alert_channel_type'] = "Email"
            tag_dict['who'] = "email:{}".format(msg['To'])
            send_stat("alert_channel", 1, tag_dict)
            # logger.info("TestInfo: {} alert for {}".format(alert_status(alert[2]), alert[0]))
        except Exception as e:
            logger.error(
                "Unhandled exception when sending mail for {} to {}\n{}".format(
                    alert_config['id'], smtp_server, str(e)))

    # ====================
    # SENSU HANDLING
    # ====================
    if 'sensu' in alert_config['alerts']:
        # Dictionary with static values for Sensu
        sensu_dict = {
            'source': 'AOM',
            'refresh': 3600,
            'occurrences': 1,
            'name': alert_config['id']+'__'+alert[4]}
        # if alert[3]:
        #     logger.info(alert)
        #     sensu_dict['name'] = '_'.join(
        #         [alert_config['id']] + sorted(list(alert[3])))
        if 'refresh' in alert_config:
            sensu_dict['refresh'] = alert_config['refresh']
        sensu_dict['interval'] = alert_config['interval']
        sensu_dict['handlers'] = []
        sensu_dict['dashboard'] = alert_config['url']
        if 'dependencies' in alert_config['alerts']['sensu'].keys():
            sensu_dict['dependencies'] = (alert_config['alerts']
                                          ['sensu']['dependencies'])
        if 'victorops' in alert_config['alerts']['sensu'].keys() and not is_suppressed(alert_config, alert[3]) :
            sensu_dict['handlers'].append("victorops")
            sensu_dict['routing_key'] = (alert_config['alerts']
                                         ['sensu']['victorops'])
        # # Leave this here until we have email support in Sensu
        # if 'email' in alert_config['alerts']['sensu'].keys():
        #     sensu_dict['handlers'].append("email")
        #     # verify this option
        #     sensu_dict['email'] = alert_config['alerts']['sensu']['email']
        if 'slack' in alert_config['alerts']['sensu'].keys():
            sensu_dict['handlers'].append("slack")
            sensu_dict['slack_channel'] = (
                alert_config['alerts']['sensu']['slack'])
            # Format alert message
            sensu_dict['dashboard'] = (
                "<{}|here> , Uchiwa: <{}?check={}|here> ".format(
                    alert_config['url'], uchiwa_url, alert_config['id']))
        if 'jira' in alert_config['alerts']['sensu'].keys():
            sensu_dict['handlers'].append("jira")
            sensu_dict.update(alert_config['alerts']['sensu']['jira'])
        if 'filters' in alert_config:
            sensu_dict['filters'] = alert_config['filters']
        # 0 = OK, 1 = WARNING, 2 = CRITICAL
        sensu_status = {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2}
        sensu_dict['status'] = sensu_status[alert[2]]
        sensu_dict['output'] = alert[1]

        r = None
        try:
            user = os.environ['API_USER']
            passwd = os.environ['API_PASS']
            r = requests.post(
                sensu_endpoint,
                json.dumps(sensu_dict),
                auth=(
                    user,
                    passwd))
            assert r.status_code == 202
        except AssertionError:
            logger.error(
                "Post to Sensu failed  {}\n{}:\t{}".format(
                    alert_config['id'],
                    r.status_code,
                    r.text))
        except Exception as e:
            logger.error("Unhandled exception for alert_id:{} "
                         "when posting to Sensu: {}".format(
                             alert_config['id'], str(e)))

    # ====================
    # SLACK HANDLING - all Slack alerts will go through Sensu
    # ====================
    if 'slack' in alert_config['alerts'] and (
            alert[2] == 0 or alert[2] == 1 or alert[2] == 3):
        refresh = alert_config.get('refresh', 3600)
        dashboard = alert_config.get('url', '')
        sensu_status = {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2}
        sensu_dict2 = {'handlers': ['slack'],
                       'interval': alert_config['interval'],
                       'source': 'AOM',
                       'refresh': refresh,
                       'occurrences': 1,
                       'name': alert_config['id']+'__'+alert[4],
                       'dashboard': dashboard,
                       'status': sensu_status[alert[2]],
                       'output': alert[1]}
        if is_custom_alert_routing:
            sensu_dict2['name'] = '_'.join(
                [alert_config['id']] + list(alert[3]))
        sensu_dict2['dashboard'] = (
            "<{}|here> , Uchiwa: <{}?check={}|here> ".format(
                alert_config['url'], uchiwa_url, alert_config['id']))
        for channel in alert_config['alerts']['slack']:
            sensu_dict2['slack_channel'] = channel
            r = None
            try:
                user = os.environ['API_USER']
                passwd = os.environ['API_PASS']
                r = requests.post(
                    sensu_endpoint,
                    json.dumps(sensu_dict2),
                    auth=(
                        user,
                        passwd))
                assert r.status_code == 202
            except AssertionError:
                logger.error(
                    "Post to Sensu failed  {}\n{}:\t{}".format(
                        alert_config['id'], r.status_code, r.text))
            except Exception as e:
                logger.error("Unhandled exception for alert_id:{} when posting"
                             "to Sensu: {}".format(alert_config['id'], str(e)))

# payload = dict(token=slack_token, channel=channel,
#                text="{} Status: {}".format(alert[1], alert_status[alert[2]]))
# r = None
# try:
#     r = requests.post(slack_url, data=payload)
#     assert r.status_code == 200
#     # Record an Slack alert sent event
#     tag_dict['alert_channel_type'] = "Slack"
#     tag_dict['who'] = "slack:{}".format(channel)
#     send_stat("alert_channel", 1, tag_dict)
#     # logger.info("TestInfo: {} alert for {}".format(alert_status(alert[2]), alert[0]))
# except AssertionError:
#     logger.error("Post to Slack failed for {}\n{}:\t{}".format(alert_config['id'], r.status_code, r.text))
# except Exception as e:
#     logger.error("Unhandled exception for alert_id:{} when posting to Slack: {}".format(alert_config['id'],
# str(e)))


def send_metrics(alert, value, result, gaugename='stats'):
    """
    Sends the results from the alert check to statsd
    Args:
        alert: The Alert config object that holds the alert['tag'] value.
        gaugename: The name of the gauge metric we send.
        value: The value we want to send as a gauge.
        result: The result object from making the call. Use the data in this
        object to tag the metric.
    Returns: None
    """
    # GROUP ALL THE ALERTS TOGETHER SO THAT PEEPS CAN FILTER OUT BY TAG THEIR
    # SPECIFIC ALERTS
    result_tags = list(itertools.chain(
        *[result['tags'][x] for x in alert['tags']]))
    tag_dict = dict()
    for x in range(len(alert['tags'])):
        tag_dict[alert['tags'][x]] = result_tags[x]
    tag_dict['alert'] = alert['id']

    # SEND THE METRIC
    send_stat(gaugename, value, tag_dict)


def send_stat(gaugename, value, tag_dict, statprefix='aom'):
    """Sends stats value to statsd"""
    client = StatsClient('telegraf', 8125, statprefix)

    # SUBMIT STATS
    client.gauge(gaugename, value, tags=tag_dict)


def has_custom_alert_routing(alert_config):
    """Checks if alert has custom routing"""
    return 'lookup' in alert_config['alerts']


def get_alert_tags(alert_config, query_result):
    """Retrieves custom tags from alert"""
    query_tags = {}
    for tag in alert_config['alerts']['lookup']['tags']:
        if (alert_config.get('query_type') == 'prometheus' and
                'metric' in query_result and
                tag in query_result['metric']):
            query_tags[tag] = query_result['metric'][tag]
        elif ('tags' in query_result and tag in query_result['tags']
              and query_result['tags'][tag]):
            query_tags[tag] = query_result['metric'][tag][0]
    return query_tags

class SuppressedException(Exception) :
   pass