This commit is contained in:
bel
2021-09-12 22:16:11 -06:00
commit ceeb6f0385
129 changed files with 9221 additions and 0 deletions

View File

View File

@@ -0,0 +1,163 @@
# Contains the arg parser options.
"""Contains the arg parser options."""
import argparse
import sys
def get_builder_args():
    """
    Gets the arguments passed in to the aom_builder main call.

    Returns:
        dict: Mapping of argument names to their parsed values
        (via args_to_dict).
    """
    parser = argparse.ArgumentParser(
        description="Generates a valid yaml file "
                    "for alerting on metrics. If you are "
                    "familiar with the yaml structure for an "
                    "alert you don't have to use this builder,"
                    " it's just convenient")
    # Fixed typo: "Kariosdb" -> "Kairosdb" (matches spelling used elsewhere).
    parser.add_argument('-q', '--query',
                        help="The Kairosdb query string to use")
    parser.add_argument(
        '-i', '--interval', type=int, default=60,
        help="The interval that the check will run at. "
             "This value is in seconds")
    # Fixed help text: the original dropped the word "alert" and the
    # sentence break before "Use in conjunction".
    parser.add_argument(
        '-t', '--threshold', '--upperthreshold',
        help="The upper threshold is the value that when reached will "
             "cause an alert depending on the threshold logic. "
             "Use in conjunction with lower threshold to define a "
             "normal band.")
    parser.add_argument(
        '-b',
        '--lowerthreshold',
        help="The lower threshold is the value that when reached will cause an "
             "alert depending on the threshold logic. "
             "Use in conjunction with upper threshold to define a normal band.")
    parser.add_argument(
        '-m',
        '--measure',
        choices=['gt', 'lt', 'eq'],
        help="The measure to use to compare the "
             "threshold to the values of the alerts")
    parser.add_argument(
        '-a',
        '--alert_config',
        help='A valid Yaml representation of your alerting block')
    parser.add_argument(
        '-l',
        '--log_level',
        type=int,
        default=0,
        help="The log level for the aom_builder run. "
             "[0=Error, 1=Info, 2=Debug]")
    parser.add_argument(
        '-p',
        '--port',
        type=int,
        default=8080,
        help="The port to run the webapp on")
    return args_to_dict(parser)
def get_tester_service_args():
    """
    Gets arguments passed into aom_tester.py.

    Returns:
        dict: Mapping of argument names to their parsed values
        (via args_to_dict).
    """
    parser = argparse.ArgumentParser(
        description="Parameters to start the alerting on metrics dummy tester "
                    "service")
    # Fixed help text: missing space before "[0=Error..." rendered as
    # "app[0=Error" in --help output.
    parser.add_argument(
        '-l',
        '--log_level',
        type=int,
        default=0,
        help="The log level for the aom_service app "
             "[0=Error, 1=Info, 2=Debug]")
    parser.add_argument(
        '-a',
        '--alert_configs',
        default=None,
        help="If provided will override the folder location read from the "
             "config with the value passed in. Is helpful for testing and "
             "troubleshooting alerts")
    parser.add_argument(
        '--hostname',
        help="If provided, will override the actual hostname check with this "
             "value")
    parser.add_argument(
        '-p',
        '--port',
        type=int,
        default=8080,
        help="The port to run the webapp on")
    return args_to_dict(parser)
def get_service_args():
    """
    Gets arguments passed into aom_service.py.

    Returns:
        dict: Mapping of argument names to their parsed values
        (via args_to_dict).
    """
    parser = argparse.ArgumentParser(
        description="Parameters to start the alerting on metrics service")
    # Fixed help text: missing space before "[0=Error..." rendered as
    # "app[0=Error" in --help output.
    parser.add_argument(
        '-l',
        '--log_level',
        type=int,
        default=0,
        help="The log level for the aom_service app "
             "[0=Error, 1=Info, 2=Debug]")
    parser.add_argument(
        '-a',
        '--alert_configs',
        default=None,
        help="If provided will override the folder location read from the "
             "config with the value passed in. Is helpful for testing and "
             "troubleshooting alerts")
    parser.add_argument(
        '--alert_routing_lookup',
        default=None,
        help="If provided will override the folder used to fetch the alerts "
             "lookup configuration.")
    parser.add_argument(
        '-o',
        '--override',
        action='store_true',
        help="Overrides the check leader election value")
    parser.add_argument(
        '--hostname',
        help="If provided, will override the actual hostname check with this "
             "value")
    parser.add_argument(
        '-p',
        '--port',
        type=int,
        default=8080,
        help="The port to run the webapp on")
    return args_to_dict(parser)
def args_to_dict(parsed_args):
    """
    Converts the argument parser object to a dict.

    Args:
        parsed_args: Configured argparse.ArgumentParser instance.

    Returns:
        dict: Mapping of argument names to their parsed values.

    Exits:
        Prints help and exits with status 1 on argparse.ArgumentError.
        NOTE(review): parse_args() normally reports bad input via SystemExit
        rather than ArgumentError, so this handler may rarely fire — confirm
        whether SystemExit should be handled too.
    """
    try:
        namespace = parsed_args.parse_args()
        # vars() exposes the Namespace's attribute dict directly; copy it so
        # callers can mutate the result without touching the Namespace.
        return dict(vars(namespace))
    except argparse.ArgumentError:
        parsed_args.print_help()
        sys.exit(1)

View File

@@ -0,0 +1,277 @@
# config.py
"""Functions for loading alert configuration files"""
import glob
import os
import json
import hashlib
import yaml
import requests
import traceback
# import logging
# logger = logging.getLogger(__name__)
# Key under which an alert config lists the ids of other alerts it depends on.
DEPENDENCIES_KEY = 'dependencies'
class AlertWithDependencies:
    """Pairs an alert id with its (transitively resolved) dependency ids.

    Used by validateDependencies() to walk the dependency graph; the
    beenProcessed flag prevents revisiting a node.
    """

    def __init__(self, alertId, dependencies):
        """
        Args:
            alertId: Identifier of the alert.
            dependencies: Optional iterable of alert ids this alert depends
                on; None means no dependencies.
        """
        self.alertId = alertId
        self.beenProcessed = False
        self.dependencies = []
        self.addAllDependencies(dependencies)

    def addAllDependencies(self, moreDependencies):
        """Append the given dependency ids; a None argument is a no-op."""
        if moreDependencies is None:
            return
        self.dependencies.extend(moreDependencies)

    def getDependencies(self):
        """Return the (mutable) list of dependency ids."""
        return self.dependencies

    def getAlertId(self):
        """Return the alert identifier."""
        return self.alertId

    def hasBeenProcessed(self):
        """Return True once visit() has been called."""
        return self.beenProcessed

    def visit(self):
        """Mark this alert as processed during dependency resolution."""
        self.beenProcessed = True
def md5(fname):
    """Return the hex md5 digest of the file at *fname*.

    Reads in 4 KiB chunks so arbitrarily large files never load into
    memory at once.
    """
    digest = hashlib.md5()
    with open(fname, "rb") as handle:
        chunk = handle.read(4096)
        while chunk:
            digest.update(chunk)
            chunk = handle.read(4096)
    return digest.hexdigest()
def get_healthy_nodes_and_index(consul_url, hostname, logger):
    """Find AOM healthy nodes on consul.

    Args:
        consul_url: Base URL of the consul HTTP API.
        hostname: This host's node name; looked up in the healthy-node list.
        logger: Logger used for error reporting.

    Returns:
        tuple: (host_index, healthy_node_count). host_index is -1 when this
        host is not among the healthy nodes or consul could not be reached.
    """
    # Initialize before the try-block so the return statement is safe even
    # when the very first consul request times out (previously these names
    # could be unbound at return time).
    host_index = -1
    healthy_nodes = []
    try:
        # All registered nodes for the alert-on-metrics service.
        r = requests.get(
            consul_url + '/v1/catalog/service/alert-on-metrics',
            timeout=60)
        assert r.status_code == 200, "Failed to get back a 200 from consul catalog"
        node_list = [elem.get('Node') for elem in json.loads(r.text)]
        # Keep only the nodes whose service healthcheck is passing.
        for node in node_list:
            r2 = requests.get(
                consul_url + '/v1/health/node/' + node,
                timeout=60)
            # BUGFIX: check the health response (r2), not the catalog
            # response (r) that was already validated above.
            assert r2.status_code == 200, "Failed to get back a 200 from consul health"
            healthcheck_list = json.loads(r2.text)
            for check in healthcheck_list:
                if (check.get('CheckID') == 'check_healthcheck_alert-on-metrics_alert-on-metrics' and
                        check.get('Status') == 'passing'):
                    healthy_nodes.append(node)
        try:
            healthy_nodes.sort()
            host_index = healthy_nodes.index(hostname)
        except ValueError:
            logger.error("Host is not healthy")
    # BUGFIX: requests raises requests.exceptions.Timeout, which is NOT a
    # TimeoutError subclass, so the original handler never caught it.
    # Other request failures (e.g. connection errors) still propagate, as
    # they did before.
    except (TimeoutError, requests.exceptions.Timeout):
        logger.error("Timed out connecting to Consul")
    return host_index, len(healthy_nodes)
def distribute_configs(filename, host_index, module, logger):
    """Uses md5 of alert config to split the files among healthy servers.

    Args:
        filename: Path to the alert config file.
        host_index: This host's position in the sorted healthy-node list
            (-1 when the host is unhealthy).
        module: Number of healthy nodes (modulus for the hash split).
        logger: Logger used for error reporting.

    Returns:
        bool: True when this host owns the given config file.
    """
    if module == 0:
        logger.error("No healthy nodes for the service")
        return False
    if host_index == -1:
        logger.error("Host is unhealthy")
        return False
    # Stable assignment: the file's md5 digest (as an integer) modulo the
    # healthy-node count selects exactly one owning host per file.
    return int(md5(filename), 16) % module == host_index
def is_valid(alert_config, logger):
    """Checks if alert has all required fields.

    Args:
        alert_config: Parsed YAML alert configuration (dict).
        logger: Logger used to report why a config was rejected.

    Returns:
        bool: True when the config passes all structural checks.
    """
    # Validation is assert-based: any failed assert — or an unexpected
    # KeyError/TypeError from a malformed config — lands in the broad
    # except below and marks the config invalid.
    # NOTE(review): asserts are stripped under `python -O`, which would
    # silently disable all of this validation.
    try:
        assert alert_config['alerts'], "No Alerts configured, this is a dead config"
        assert alert_config['query'], "No Query, this is a dead config"
        #assert alert_config['interval'] >= 30, "Intervals less than 30 are invalid"
        assert alert_config['id'], "Alert ID is empty, this is a dead config"
        if DEPENDENCIES_KEY in alert_config:
            assert isinstance(alert_config[DEPENDENCIES_KEY], list), "Dependencies is specified but isn't a list"
        if alert_config.get('query_type') == 'prometheus':
            # Prometheus queries are plain PromQL strings.
            assert isinstance(
                alert_config['query'], str), "Invalid Prometheus query"
        else:
            # Default (Kairosdb) queries must already be parsed into a dict.
            assert isinstance(
                alert_config['query'], dict), "Kairosdb Query string cannot be validated as proper JSON"
            # Tags declared on the query itself, plus tags always present.
            defined_tags = set(alert_config['query']['metrics'][0]['tags'].keys()).union(
                {'', 'dc', 'fqdn'})
            # IF THERE IS AGGREGATION WE HAVE TO ADD THESE TAGS
            if 'group_by' in alert_config['query']['metrics'][0]:
                defined_tags.update(
                    set(alert_config['query']['metrics'][0]['group_by'][0]['tags']))
            # for undefined_tag in set(alert_config['tags']).difference(defined_tags):
            #     print("WARNING! {} tag is not defined on the query. Please make sure it does exist to "\
            #     "prevent empty results".format(undefined_tag))
        # OUR MINIMUM THRESHOLD NEED
        assert 'critical_lower_threshold' in alert_config or 'critical_upper_threshold' in alert_config or \
            'warning_lower_threshold' in alert_config or 'warning_upper_threshold' in alert_config, \
            "Config must have at least one threshold set."
        # JUST MAKE SURE YOU ARE NOT DOING SOMETHING STUPID WITH WARNING COMING
        # AFTER CRITICAL
        if 'warning_lower_threshold' in alert_config and 'critical_lower_threshold' in alert_config:
            assert alert_config['critical_lower_threshold'] < alert_config['warning_lower_threshold'], \
                "Lower Critical must be less than Lower Warning"
        if 'warning_upper_threshold' in alert_config and 'critical_upper_threshold' in alert_config:
            assert alert_config['critical_upper_threshold'] > alert_config['warning_upper_threshold'], \
                "Upper Critical must be greater than Upper Warning"
        # Lookup-based routing: needs a default alert, a source of lookups
        # (inline or file), and string tag names.
        if 'lookup' in alert_config['alerts']:
            assert 'default' in alert_config['alerts']['lookup'], 'No default alert configured for the lookup configuration'
            assert 'lookup_file' in alert_config['alerts']['lookup'] or 'lookups' in alert_config['alerts'][
                'lookup'], 'No lookup configured either in the alert configuration or in a separated file'
            assert 'tags' in alert_config['alerts']['lookup'], 'No tags configured for the lookup configuration'
            assert all(
                isinstance(
                    tag, str) for tag in alert_config['alerts']['lookup']['tags']), 'Tags must be valid string'
        # if 'occurrences_threshold' in alert_config:
        #     assert alert_config['occurrences_threshold'] >= 1, \
        #     "Having an occurrences value less than 2 is assumed and pointless to specify"
    except Exception as e:
        logger.warning("Invalid config file: {}".format(str(e)))
        return False
    return True
def is_valid_alert_routing_lookup(alert_routing_lookup, alert, logger):
    """Check if routing lookup is properly configured.

    Args:
        alert_routing_lookup: List of routing entries, each expected to carry
            an 'alert' value and a 'tags' mapping.
        alert: Alert config whose alerts.lookup.tags lists the allowed tag
            names.
        logger: Logger used to report validation failures.

    Returns:
        bool: True when every entry is well-formed, False otherwise.
    """
    try:
        assert alert_routing_lookup, "No lookup values configured, the configuration is empty."
        allowed = alert['alerts']['lookup']['tags']
        for entry in alert_routing_lookup:
            assert 'alert' in entry, "No alert defined for this configuration."
            assert 'tags' in entry, "No tags value defined for this configuration."
            # Every tag used by the entry must be declared on the alert.
            for tag in entry['tags']:
                assert tag in allowed, "The tag {} is not part of the configuration".format(
                    tag)
            assert all(isinstance(tag, str)
                       for tag in entry['tags']), "Tags must be valid string"
    except AssertionError as e:
        logger.warning("Invalid alert routing config file: {}".format(str(e)))
        return False
    return True
# noinspection PyBroadException
def glob_the_configs(
        config_path,
        lookup_config_path,
        consul_url,
        hostname,
        logger):
    """Load, validate and distribute the alert configs owned by this host.

    Args:
        config_path (string): relative path to the configs
        lookup_config_path (string): folder containing external lookup files
        consul_url (string): url to consul service
        hostname (string): this host's node name, for ownership selection
        logger: logger instance
    Returns:
        List of configs owned by this host that passed validation.
    """
    invalid_configs = 0
    alert_list = []
    # Where this host sits among the healthy nodes — used to shard the
    # config files across the cluster.
    host_index, module = get_healthy_nodes_and_index(
        consul_url, hostname, logger)
    alertToAlertWithDependencies = {}
    for config_file in glob.glob(config_path + "/**/*.yaml", recursive=True):
        logger.debug("Found {} config".format(config_file))
        # LOAD CONFIG
        # Only process files whose md5-based shard maps to this host.
        if distribute_configs(
                config_file,
                host_index,
                module,
                logger):
            try:
                alert = yaml.safe_load(open(config_file, 'rb').read())
                if is_valid(alert, logger):
                    if 'lookup' in alert['alerts']:
                        # Routing lookups come either from an external file
                        # or inline under alerts.lookup.lookups.
                        alert_routing_lookup = []
                        is_valid_lookup = True
                        if 'lookup_file' in alert['alerts']['lookup']:
                            lookup_path = "{}/{}".format(
                                lookup_config_path, alert['alerts']['lookup']['lookup_file'])
                            if os.path.isfile(lookup_path):
                                alert_routing_lookup = yaml.safe_load(
                                    open(lookup_path, 'rb').read())
                            else:
                                # Referenced lookup file is missing.
                                is_valid_lookup = False
                        else:
                            alert_routing_lookup = alert['alerts']['lookup']['lookups']
                        is_valid_lookup = is_valid_lookup and is_valid_alert_routing_lookup(
                            alert_routing_lookup, alert, logger)
                        if is_valid_lookup:
                            # Index the routing entries by their tag-value
                            # tuple (ordered as alerts.lookup.tags) for O(1)
                            # routing at alert time.
                            alerts_per_tags = {}
                            for alert_configuration in alert_routing_lookup:
                                key = []
                                for tag in alert['alerts']['lookup']['tags']:
                                    key.append(
                                        alert_configuration['tags'].get(tag))
                                alerts_per_tags[tuple(
                                    key)] = alert_configuration['alert']
                            alert['alert_routing_lookup'] = alerts_per_tags
                        else:
                            invalid_configs += 1
                            continue
                    alertWithDependencies = AlertWithDependencies(alert['id'], alert[DEPENDENCIES_KEY] if DEPENDENCIES_KEY in alert else None)
                    alertToAlertWithDependencies[alert['id']] = alertWithDependencies
                    alert['resolvedDependencies'] = alertWithDependencies
                    alert_list.append(alert)
                else:
                    invalid_configs += 1
            except BaseException:
                # Deliberately broad: a single bad file must not abort the
                # whole config load.
                logger.error("Error parsing {} config: {}".format(config_file, traceback.format_exc()))
    # validate the dependencies and flesh out the dependency graphs
    logger.debug("Iterating over dependencies")
    for alertId, alertWithDependencies in alertToAlertWithDependencies.items():
        validateDependencies(alertId, alertWithDependencies, alertToAlertWithDependencies, logger)
    logger.info("Invalid configs: {}".format(invalid_configs))
    # Imported here (not at module top) — presumably to avoid a circular
    # import with serviceapp; confirm before moving.
    from serviceapp import service
    service.send_stat(
        'invalid_configs',
        invalid_configs,
        dict(),
        statprefix='aom')
    logger.info("Loaded {} configs".format(len(alert_list)))
    return alert_list
def validateDependencies(alertId, alertWithDependencies, allAlerts, logger):
    """Recursively resolve the transitive dependencies of one alert.

    Mutates alertWithDependencies in place: each valid direct dependency's
    own (recursively resolved) dependencies are appended to this alert's
    list. The beenProcessed flag ensures each node is expanded at most once;
    already-processed nodes just return their current list.

    Args:
        alertId: Id of the alert being processed (for log messages).
        alertWithDependencies: AlertWithDependencies wrapper for alertId.
        allAlerts: Dict mapping alert id -> AlertWithDependencies.
        logger: Logger for reporting unknown dependency ids.

    Returns:
        The resolved dependency list, or None when the alert has no
        dependencies.

    NOTE(review): when two dependency chains share an ancestor, the shared
    ancestor's ids can be appended more than once (duplicates in the list);
    cycles terminate via beenProcessed but may yield partial lists — confirm
    both are acceptable to callers.
    """
    if len(alertWithDependencies.getDependencies()) > 0:
        if not alertWithDependencies.hasBeenProcessed():
            # Mark before recursing so cycles cannot loop forever.
            alertWithDependencies.visit()
            # Snapshot the list: the loop below appends to the same list via
            # addAllDependencies, so we must not iterate the live object.
            dependencies = list(alertWithDependencies.getDependencies())
            for dependentId in dependencies:
                if dependentId not in allAlerts:
                    logger.info("Invalid dependency of {}: {}".format(alertId, dependentId))
                else:
                    alertWithDependencies.addAllDependencies(validateDependencies(dependentId, allAlerts[dependentId], allAlerts, logger))
        logger.debug("returning alert {} with dependencies {}".format(alertId, alertWithDependencies.getDependencies()))
        return alertWithDependencies.getDependencies()
    else:
        return None

View File

@@ -0,0 +1,122 @@
# logger.py
""" Logging configuration """
import logging
import logging.handlers
import os
# Quiet chatty third-party libraries: only ERROR and above from these
# loggers will reach our handlers.
logging.getLogger('requests').setLevel(logging.ERROR)
logging.getLogger('urllib3').setLevel(logging.ERROR)
logging.getLogger('werkzeug').setLevel(logging.ERROR)
class SingleLevelFilter(logging.Filter):
    """Logging filter that matches records of exactly one level.

    With reject=False only records AT passlevel pass; with reject=True
    every record EXCEPT those at passlevel passes.
    """

    def __init__(self, passlevel, reject):
        """
        Args:
            passlevel (int): The numeric log level to match exactly.
            reject (bool): If True, reject records at passlevel; if False,
                accept only records at passlevel.
        """
        # BUGFIX: the base Filter's __init__ was never called, leaving its
        # name-matching state uninitialized.
        super().__init__()
        self.passlevel = passlevel
        self.reject = reject

    def filter(self, record):
        """Return True if *record* should be logged.

        Args:
            record (logging.LogRecord): The record under consideration.

        Returns:
            bool: Depends on self.reject — see __init__.
        """
        if self.reject:
            return record.levelno != self.passlevel
        return record.levelno == self.passlevel
class AlertLogging(logging.Logger):
    """
    Logger for the alert-on-metrics service.

    start() attaches a console handler at INFO; a file handler (WARNING)
    and a DEBUG-only console handler can be attached and detached at
    runtime via the start_*/stop_* pairs.
    """

    def __init__(self, name):
        """
        Inits the formatters and logger.

        Args:
            name: Logger name.
        """
        self.name = name
        # Handlers are created lazily by the start_* methods; initialize to
        # None so calling a stop_* method before its start_* counterpart is
        # a harmless no-op instead of an AttributeError.
        self.log_handler = None
        self.debug_handler = None
        self.log_path = None
        self.debug_formatter = logging.Formatter(
            "%(asctime)s - [%(levelname)s] - [%(module)s:%(lineno)d] - "
            "%(message)s", "%m-%d %H:%M:%S")
        self.standard_formatter = logging.Formatter(
            "%(asctime)s - [%(levelname)s] - %(message)s", "%m-%d %H:%M:%S")
        # Logger level is DEBUG; individual handlers narrow the output.
        # (Removed a stray no-op logging.getLogger() call here.)
        logging.Logger.__init__(self, name, logging.DEBUG)
        # Subsequently created loggers also use this class.
        logging.setLoggerClass(AlertLogging)

    def start(self):
        """Attach a console handler at INFO level.

        Returns:
            AlertLogging: self, allowing AlertLogging(name).start() chaining.
        """
        info_handler = logging.StreamHandler()
        info_handler.setLevel(logging.INFO)
        info_handler.setFormatter(self.standard_formatter)
        self.addHandler(info_handler)
        return self

    def start_log_file(self, file_path, mode='a'):
        """
        Creates a separate log file handler (WARNING level and above).

        Args:
            file_path: path to the log file; parent directories are created
                if missing.
            mode: the type of mode to open the file handler with
        """
        self.log_path = file_path
        work_folder = os.path.dirname(file_path)
        if work_folder and not os.path.exists(work_folder):
            os.makedirs(work_folder)
        self.log_handler = logging.FileHandler(file_path, mode)
        self.log_handler.setLevel(logging.WARNING)
        self.log_handler.setFormatter(self.debug_formatter)
        self.addHandler(self.log_handler)

    def stop_log_file(self):
        """Close the log file and detach its handler (no-op if not open)."""
        if self.log_handler is not None:
            self.log_handler.close()
            self.removeHandler(self.log_handler)
            self.log_handler = None

    def start_debug(self):
        """Attach a console handler that emits ONLY DEBUG-level records."""
        self.debug_handler = logging.StreamHandler()
        self.debug_handler.setLevel(logging.DEBUG)
        self.debug_handler.addFilter(SingleLevelFilter(logging.DEBUG, False))
        self.debug_handler.setFormatter(self.debug_formatter)
        self.addHandler(self.debug_handler)

    def stop_debug(self):
        """Detach the DEBUG console handler (no-op if not attached)."""
        if self.debug_handler is not None:
            self.removeHandler(self.debug_handler)
            self.debug_handler = None

View File

@@ -0,0 +1,83 @@
from datetime import datetime, timedelta
from urllib.parse import urljoin
import requests
class PromAPI:
    """Thin client for the Prometheus HTTP API (query, query_range, series)."""

    def __init__(self, endpoint='http://127.0.0.1:9090/'):
        """
        :param endpoint: base address of the Prometheus server
        """
        self.endpoint = endpoint

    @staticmethod
    def _to_timestamp(input_):
        """
        Convert *input_* to an ISO-8601 timestamp string for Prometheus
        (callers append the trailing 'Z').

        Accepts a datetime, the literal 'now', or a number / numeric string:
        positive = absolute UNIX time, 0 = now, negative = seconds before now.
        :param input_: datetime, 'now', number, or numeric string
        :return: ISO-8601 timestamp string, or None for unsupported input
        """
        if isinstance(input_, datetime):
            # BUGFIX: previously returned a float here, which crashed when
            # callers concatenated 'Z'; now formatted like the other
            # branches. Assumes a naive UTC datetime — confirm callers.
            return input_.isoformat('T')
        if input_ == 'now':
            return datetime.utcnow().isoformat('T')
        if isinstance(input_, str):
            input_ = float(input_)
        if isinstance(input_, (int, float)):
            if input_ > 0:
                # BUGFIX: absolute UNIX timestamps are rendered as ISO
                # strings too, instead of a bare number that broke the
                # `+ 'Z'` concatenation in callers.
                return datetime.utcfromtimestamp(input_).isoformat('T')
            if input_ == 0:  # return now
                return datetime.utcnow().isoformat('T')
            # Negative: relative offset from now.
            return (datetime.utcnow() + timedelta(seconds=input_)).isoformat('T')

    def query(self, query='prometheus_build_info'):
        """Run an instant query and return the decoded JSON response."""
        return self._get(
            uri='/api/v1/query',
            params=dict(
                query=query
            )
        )

    def query_range(self, query='prometheus_build_info', start=-60, end='now', duration=60):
        """Run a range query over [start, end] with the given step.

        :param query: PromQL expression
        :param start: anything _to_timestamp accepts; falsy skips the param
        :param end: anything _to_timestamp accepts; None skips the param
        :param duration: step size in seconds; falsy skips the param
        """
        params = {
            'query': query
        }
        if end is not None:
            params['end'] = self._to_timestamp(end) + 'Z'
        if start:
            params['start'] = self._to_timestamp(start) + 'Z'
        if duration:
            params['step'] = duration
        print(params)  # NOTE(review): development debug output left in place
        return self._get(
            uri='/api/v1/query_range',
            params=params
        )

    def series(self, match='prometheus_build_info', start=-86400, end='now'):
        """Fetch metadata for the series matching *match* in [start, end]."""
        params = {
            'match[]': match
        }
        if end is not None:
            params['end'] = self._to_timestamp(end) + 'Z'
        if start:
            params['start'] = self._to_timestamp(start) + 'Z'
        print(params)  # NOTE(review): development debug output left in place
        return self._get(
            uri='/api/v1/series',
            params=params
        )

    def _get(self, uri, params, method='GET'):
        """Issue the HTTP request and return the parsed JSON body."""
        url = urljoin(self.endpoint, uri)
        assert method == 'GET'  # only GET is implemented
        result = requests.get(
            url=url,
            params=params
        )
        return result.json()

View File

@@ -0,0 +1,47 @@
import unittest
import config
class TestAlertWithDependencies(unittest.TestCase):
    """Exercises config.AlertWithDependencies transitive dependency resolution."""

    def test_base(self):
        # Graph: A -> C -> D, B -> C, D has no dependencies.
        self.alertToAlertWithDependencies = {}
        self.alert_list = []
        for alert_id, depends in (("A", ["C"]), ("B", ["C"]),
                                  ("C", ["D"]), ("D", None)):
            self.make_alert(alert_id, depends)
        self.validate()
        # A and B each resolve to {C, D}; C resolves to {D}; D to nothing.
        for alert_id, expected in (("A", 2), ("B", 2), ("C", 1), ("D", 0)):
            self.checkDepLen(alert_id, expected)

    def make_alert(self, id, depends):
        """Register one alert dict plus its AlertWithDependencies wrapper."""
        alert = {
            'id': id,
            'dependencies': depends
        }
        wrapper = config.AlertWithDependencies(
            alert['id'],
            alert[config.DEPENDENCIES_KEY] if config.DEPENDENCIES_KEY in alert else None)
        self.alertToAlertWithDependencies[alert['id']] = wrapper
        alert['resolvedDependencies'] = wrapper
        self.alert_list.append(alert)

    def validate(self):
        """Resolve transitive dependencies for every registered alert."""
        for alert_id, wrapper in self.alertToAlertWithDependencies.items():
            config.validateDependencies(
                alert_id, wrapper, self.alertToAlertWithDependencies,
                MockLogger())

    def checkDepLen(self, id, n):
        """Assert the resolved dependency count for alert *id*."""
        wrapper = self.alertToAlertWithDependencies[id]
        self.assertEqual(len(wrapper.getDependencies()), n)
class MockLogger:
    """Silent stand-in for a logger: accepts and discards any log call."""

    def _ignore(self, *args, **kwargs):
        return None

    # Every level used by the code under test maps to the same no-op.
    info = _ignore
    debug = _ignore
    error = _ignore
if __name__ == "__main__" :
unittest.main()