QVolution2019.2/sleeper_agents_aom_engine/library/config.py

# config.py
"""Functions for loading alert configuration files"""
import glob
import os
import json
import hashlib
import yaml
import requests
import traceback

# import logging
# logger = logging.getLogger(__name__)

# Optional key in an alert config. When present, is_valid() requires its value
# to be a list; glob_the_configs() passes that list to AlertWithDependencies and
# validateDependencies() looks each entry up among the loaded alert ids.
DEPENDENCIES_KEY = 'dependencies'

class AlertWithDependencies:
    """Mutable record of an alert id, its dependency ids and a processed flag.

    The dependency list is owned by the instance: ids handed to the
    constructor or to addAllDependencies() are copied into it, and
    getDependencies() returns that same list object (not a copy).
    """

    def __init__(self, alertId, dependencies):
        """Store *alertId* and seed the dependency list.

        Args:
            alertId: identifier of the alert this record belongs to.
            dependencies: iterable of dependency ids, or None for no
                dependencies.
        """
        self.dependencies = []
        self.beenProcessed = False
        self.alertId = alertId
        self.addAllDependencies(dependencies)

    def addAllDependencies(self, moreDependencies):
        """Append every id in *moreDependencies*; None is silently ignored."""
        if moreDependencies is None:
            return
        self.dependencies.extend(moreDependencies)

    def getDependencies(self):
        """Return the internal (live) list of dependency ids."""
        return self.dependencies

    def getAlertId(self):
        """Return the alert id given at construction time."""
        return self.alertId

    def hasBeenProcessed(self):
        """Return True once visit() has been called on this record."""
        return self.beenProcessed

    def visit(self):
        """Mark this record as processed."""
        self.beenProcessed = True

def md5(fname):
    """Return the hex MD5 digest of the contents of the file at *fname*.

    The file is opened in binary mode and consumed in 4096-byte reads so
    the whole file never has to be held in memory at once.
    """
    digest = hashlib.md5()
    with open(fname, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:
                # An empty read signals end of file.
                break
            digest.update(block)
    return digest.hexdigest()


def get_healthy_nodes_and_index(consul_url, hostname, logger):
    """Find the healthy AOM nodes registered in Consul and this host's slot.

    Queries the Consul catalog for the 'alert-on-metrics' service, then the
    health endpoint of every returned node, keeping the nodes whose
    'check_healthcheck_alert-on-metrics_alert-on-metrics' check is 'passing'.

    Args:
        consul_url (string): base URL of the Consul HTTP API.
        hostname (string): name of the current host as registered in Consul.
        logger: object exposing ``error(msg)``.

    Returns:
        tuple: ``(host_index, healthy_count)`` where ``host_index`` is the
        position of *hostname* in the sorted list of healthy nodes, or -1
        when the host is not in that list or Consul timed out, and
        ``healthy_count`` is the number of healthy nodes collected.

    Raises:
        AssertionError: if Consul answers either request with a non-200
            status. Other request errors (e.g. connection failures) propagate.
    """
    # Bound before the try so the return below is safe even when the very
    # first request times out.
    host_index = -1
    healthy_nodes = []
    try:
        # getting all registered nodes from consul
        r = requests.get(
            consul_url +
            '/v1/catalog/service/alert-on-metrics',
            timeout=60)
        # Explicit raise instead of `assert` so the check survives `python -O`.
        if r.status_code != 200:
            raise AssertionError("Failed to get back a 200 from consul catalog")

        node_list = [elem.get('Node') for elem in json.loads(r.text)]

        # Retrieving healthy nodes
        for node in node_list:
            r2 = requests.get(
                consul_url +
                '/v1/health/node/' +
                node,
                timeout=60)
            # Validate the health response itself (r2), not the catalog one.
            if r2.status_code != 200:
                raise AssertionError("Failed to get back a 200 from consul health")
            for check in json.loads(r2.text):
                if (check.get('CheckID') == 'check_healthcheck_alert-on-metrics_alert-on-metrics' and
                        check.get('Status') == 'passing'):
                    healthy_nodes.append(node)

        try:
            # Sorted so that every host derives the same ordering.
            healthy_nodes.sort()
            host_index = healthy_nodes.index(hostname)
        except ValueError:
            logger.error("Host is not healthy")
    except (TimeoutError, requests.exceptions.Timeout):
        # requests signals timeouts with its own exception type, which is
        # not a subclass of the builtin TimeoutError.
        logger.error("Timed out connecting to Consul")
    return host_index, len(healthy_nodes)


def distribute_configs(
        filename,
        host_index,
        module,
        logger):
    """Decide whether this host is responsible for the config at *filename*.

    The MD5 digest of the file (as an integer) modulo *module* selects one
    slot; the file belongs to this host when that slot equals *host_index*.

    Args:
        filename: path of the alert config file to hash.
        host_index: this host's slot, or -1 when the host is unhealthy.
        module: number of slots (healthy nodes); 0 means none are available.
        logger: object exposing ``error(msg)``.

    Returns:
        bool: True when the file maps to this host; False otherwise,
        including when there are no healthy nodes or the host is unhealthy.
    """
    problem = None
    if module == 0:
        problem = "No healthy nodes for the service"
    elif host_index == -1:
        problem = "Host is unhealthy"

    if problem is not None:
        logger.error(problem)
        return False

    slot = int(md5(filename), 16) % module
    return slot == host_index


def _require_alert_field(condition, message):
    """Raise ValueError(message) unless *condition* is truthy."""
    if not condition:
        raise ValueError(message)


def is_valid(alert_config, logger):
    """Check that an alert config has all required fields.

    Every failed check, and any exception raised while inspecting the
    config (missing key, wrong type, ...), is logged as a warning and makes
    the function return False. The checks are explicit ``raise`` statements
    rather than ``assert`` so they still run under ``python -O``.

    Args:
        alert_config: parsed alert configuration (mapping-like).
        logger: object exposing ``warning(msg)``.

    Returns:
        bool: True when the config passed every check, False otherwise.
    """
    try:
        _require_alert_field(
            alert_config['alerts'], "No Alerts configured, this is a dead config")
        _require_alert_field(
            alert_config['query'], "No Query, this is a dead config")
        # NOTE: a minimum-interval check (interval >= 30) is intentionally
        # not enforced here.
        _require_alert_field(
            alert_config['id'], "Alert ID is empty, this is a dead config")
        if DEPENDENCIES_KEY in alert_config:
            _require_alert_field(
                isinstance(alert_config[DEPENDENCIES_KEY], list),
                "Dependencies is specified but isn't a list")

        query = alert_config['query']
        if alert_config.get('query_type') == 'prometheus':
            _require_alert_field(isinstance(query, str), "Invalid Prometheus query")
        else:
            _require_alert_field(
                isinstance(query, dict),
                "Kairosdb Query string cannot be validated as proper JSON")
            first_metric = query['metrics'][0]
            # defined_tags is not consumed further, but building it doubles
            # as a structural check: a query without metrics[0]['tags'] (or
            # with a malformed group_by) raises here and is rejected below.
            defined_tags = set(first_metric['tags'].keys()).union(
                {'', 'dc', 'fqdn'})
            # IF THERE IS AGGREGATION WE HAVE TO ADD THESE TAGS
            if 'group_by' in first_metric:
                defined_tags.update(set(first_metric['group_by'][0]['tags']))

        # OUR MINIMUM THRESHOLD NEED
        threshold_keys = (
            'critical_lower_threshold',
            'critical_upper_threshold',
            'warning_lower_threshold',
            'warning_upper_threshold',
        )
        _require_alert_field(
            any(key in alert_config for key in threshold_keys),
            "Config must have at least one threshold set.")

        # Warning thresholds must sit between the critical ones.
        if 'warning_lower_threshold' in alert_config and 'critical_lower_threshold' in alert_config:
            _require_alert_field(
                alert_config['critical_lower_threshold'] < alert_config['warning_lower_threshold'],
                "Lower Critical must be less than Lower Warning")
        if 'warning_upper_threshold' in alert_config and 'critical_upper_threshold' in alert_config:
            _require_alert_field(
                alert_config['critical_upper_threshold'] > alert_config['warning_upper_threshold'],
                "Upper Critical must be greater than Upper Warning")

        if 'lookup' in alert_config['alerts']:
            lookup = alert_config['alerts']['lookup']
            _require_alert_field(
                'default' in lookup,
                'No default alert configured for the lookup configuration')
            _require_alert_field(
                'lookup_file' in lookup or 'lookups' in lookup,
                'No lookup configured either in the alert configuration or in a separated file')
            _require_alert_field(
                'tags' in lookup,
                'No tags configured for the lookup configuration')
            _require_alert_field(
                all(isinstance(tag, str) for tag in lookup['tags']),
                'Tags must be valid string')

        # NOTE: no check is made on 'occurrences_threshold'.
    except Exception as e:
        logger.warning("Invalid config file: {}".format(str(e)))
        return False
    return True


def _require_routing_field(condition, message):
    """Raise AssertionError(message) unless *condition* is truthy.

    An explicit raise (rather than an ``assert`` statement) keeps the check
    active under ``python -O`` while preserving the exception type that
    is_valid_alert_routing_lookup() handles.
    """
    if not condition:
        raise AssertionError(message)


def is_valid_alert_routing_lookup(alert_routing_lookup, alert, logger):
    """Check if routing lookup is properly configured.

    Each routing entry must contain an 'alert' and a 'tags' item, every tag
    named by the entry must be listed in ``alert['alerts']['lookup']['tags']``
    and every tag name must be a string.

    Args:
        alert_routing_lookup: iterable of routing entries; must be non-empty.
        alert: the alert config the lookup belongs to.
        logger: object exposing ``warning(msg)``.

    Returns:
        bool: True when every entry passes, False (after logging a warning)
        on the first failed check. Errors other than a failed check, such
        as a missing key in *alert*, are not handled here and propagate.
    """
    try:
        _require_routing_field(
            alert_routing_lookup,
            "No lookup values configured, the configuration is empty.")
        for alert_routing in alert_routing_lookup:
            _require_routing_field(
                'alert' in alert_routing,
                "No alert defined for this configuration.")
            _require_routing_field(
                'tags' in alert_routing,
                "No tags value defined for this configuration.")
            for tag in alert_routing['tags']:
                _require_routing_field(
                    tag in alert['alerts']['lookup']['tags'],
                    "The tag {} is not part of the configuration".format(tag))
            _require_routing_field(
                all(isinstance(tag, str) for tag in alert_routing['tags']),
                "Tags must be valid string")
    except AssertionError as e:
        logger.warning("Invalid alert routing config file: {}".format(str(e)))
        return False
    return True


def _load_yaml_file(path):
    """Parse the YAML document stored at *path*, closing the file afterwards."""
    with open(path, 'rb') as handle:
        return yaml.safe_load(handle.read())


def _build_alert_routing_lookup(alert, lookup_config_path, logger):
    """Build the tag-values -> alert routing table for a lookup alert.

    The routing entries come from the file named by 'lookup_file' (resolved
    under *lookup_config_path*) when that key is present, otherwise from the
    inline 'lookups' list.

    Returns:
        dict mapping a tuple of tag values (ordered like the alert's lookup
        'tags') to the entry's 'alert' value, or None when the lookup file
        is missing or the entries fail is_valid_alert_routing_lookup().
    """
    lookup_config = alert['alerts']['lookup']
    if 'lookup_file' in lookup_config:
        lookup_path = "{}/{}".format(
            lookup_config_path, lookup_config['lookup_file'])
        if not os.path.isfile(lookup_path):
            logger.warning("Lookup file {} not found".format(lookup_path))
            return None
        alert_routing_lookup = _load_yaml_file(lookup_path)
    else:
        alert_routing_lookup = lookup_config['lookups']

    if not is_valid_alert_routing_lookup(alert_routing_lookup, alert, logger):
        return None

    alerts_per_tags = {}
    for alert_configuration in alert_routing_lookup:
        # Tags the entry does not set contribute None to the key.
        key = tuple(
            alert_configuration['tags'].get(tag)
            for tag in lookup_config['tags'])
        alerts_per_tags[key] = alert_configuration['alert']
    return alerts_per_tags


# noinspection PyBroadException
def glob_the_configs(
        config_path,
        lookup_config_path,
        consul_url,
        hostname,
        logger):
    """Load the alert configs this host is responsible for.

    Recursively globs ``*.yaml`` files under *config_path*, keeps the ones
    distribute_configs() assigns to this host, validates them, resolves
    lookup routing tables and attaches an AlertWithDependencies record to
    each alert under the 'resolvedDependencies' key. The number of invalid
    configs is reported through ``serviceapp.service.send_stat``.

    Args:
        config_path (string): relative path to the configs
        lookup_config_path (string): directory holding lookup files
            referenced by an alert's 'lookup_file' entry
        consul_url (string): url to consul service
        hostname (string): name of this host as registered in consul
        logger: object exposing debug/info/warning/error
    Returns:
        List of configs
    """
    invalid_configs = 0
    alert_list = []
    host_index, module = get_healthy_nodes_and_index(
        consul_url, hostname, logger)
    alertToAlertWithDependencies = {}
    for config_file in glob.glob(config_path + "/**/*.yaml", recursive=True):
        logger.debug("Found {} config".format(config_file))
        # LOAD CONFIG
        if not distribute_configs(
                config_file,
                host_index,
                module,
                logger):
            continue
        try:
            alert = _load_yaml_file(config_file)
            if not is_valid(alert, logger):
                invalid_configs += 1
                continue
            if 'lookup' in alert['alerts']:
                alerts_per_tags = _build_alert_routing_lookup(
                    alert, lookup_config_path, logger)
                if alerts_per_tags is None:
                    invalid_configs += 1
                    continue
                alert['alert_routing_lookup'] = alerts_per_tags
            alertWithDependencies = AlertWithDependencies(
                alert['id'], alert.get(DEPENDENCIES_KEY))
            alertToAlertWithDependencies[alert['id']] = alertWithDependencies
            alert['resolvedDependencies'] = alertWithDependencies
            alert_list.append(alert)
        except Exception:
            # Best effort: one broken file must not stop the others from
            # loading. Exception (not BaseException) so KeyboardInterrupt
            # and SystemExit still propagate. Such files are logged but not
            # counted in invalid_configs.
            logger.error("Error parsing {} config: {}".format(config_file, traceback.format_exc()))
    # validate the dependencies and flesh out the dependency graphs
    logger.debug("Iterating over dependencies")
    for alertId, alertWithDependencies in alertToAlertWithDependencies.items():
        validateDependencies(alertId, alertWithDependencies, alertToAlertWithDependencies, logger)

    logger.info("Invalid configs: {}".format(invalid_configs))
    # Imported here rather than at module top, as in the original code.
    from serviceapp import service
    service.send_stat(
        'invalid_configs',
        invalid_configs,
        dict(),
        statprefix='aom')
    logger.info("Loaded {} configs".format(len(alert_list)))
    return alert_list

def validateDependencies(alertId, alertWithDependencies, allAlerts, logger):
    """Expand an alert's dependency list with its transitive dependencies.

    Walks the dependency ids of *alertWithDependencies* depth-first. Ids
    that are not keys of *allAlerts* are logged and left in the list; for
    every known id the dependency's own (recursively expanded) list is
    merged into this alert's list. Only ids not already present are
    appended, so shared (diamond-shaped) dependencies are not repeated and
    the list cannot multiply in size with each layer of the graph.

    The processed flag on each record stops a record from being expanded
    twice, which also terminates the recursion on cyclic dependencies.

    Args:
        alertId: id of the alert being expanded (used for log messages).
        alertWithDependencies: record exposing getDependencies(),
            addAllDependencies(), hasBeenProcessed() and visit().
        allAlerts: mapping of alert id to its dependency record.
        logger: object exposing ``info(msg)`` and ``debug(msg)``.

    Returns:
        The record's live dependency list, or None when it has no
        dependencies.
    """
    if not alertWithDependencies.getDependencies():
        return None

    if not alertWithDependencies.hasBeenProcessed():
        # Flag before recursing so a cycle back to this alert returns early.
        alertWithDependencies.visit()
        seen = set(alertWithDependencies.getDependencies())
        # Iterate over a snapshot: the live list grows inside the loop.
        for dependentId in list(alertWithDependencies.getDependencies()):
            if dependentId not in allAlerts:
                logger.info("Invalid dependency of {}: {}".format(alertId, dependentId))
                continue
            resolved = validateDependencies(
                dependentId, allAlerts[dependentId], allAlerts, logger)
            if not resolved:
                continue
            # Collect first, append afterwards: `resolved` may be this very
            # record's list when an alert depends on itself.
            fresh = []
            for transitiveId in resolved:
                if transitiveId not in seen:
                    seen.add(transitiveId)
                    fresh.append(transitiveId)
            alertWithDependencies.addAllDependencies(fresh)

    logger.debug("returning alert {} with dependencies {}".format(alertId, alertWithDependencies.getDependencies()))
    return alertWithDependencies.getDependencies()