# config.py
"""Functions for loading alert configuration files"""
import glob
import os
import json
import hashlib

import yaml
import requests

from serviceapp import service
# import logging
# logger = logging.getLogger(__name__)


def _require(condition, message):
    """Raise ``AssertionError(message)`` when ``condition`` is falsy.

    Used instead of a bare ``assert`` so that validation still runs when the
    interpreter is started with ``-O`` (which strips ``assert`` statements).
    The exception type is kept as ``AssertionError`` so existing handlers
    keep working.
    """
    if not condition:
        raise AssertionError(message)


def _load_yaml(path):
    """Read ``path`` and parse it with ``yaml.safe_load``, closing the file."""
    with open(path, 'rb') as handle:
        return yaml.safe_load(handle.read())


def md5(fname):
    """Calculates md5 hash of a filename

    The file is read in 4096-byte chunks so large files are never held in
    memory at once.

    Args:
        fname (string): path of the file to hash
    Returns:
        Hex digest string of the file contents
    """
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


def get_healthy_nodes_and_index(consul_url, hostname, logger):
    """Find AOM healthy nodes on Consul

    Args:
        consul_url (string): base url of the consul service
        hostname (string): name of this host as registered in consul
        logger: logger used to report problems
    Returns:
        Tuple ``(host_index, healthy_count)``. ``host_index`` is the position
        of ``hostname`` in the sorted list of healthy nodes, or -1 when the
        host is not in that list or consul timed out.
    Raises:
        AssertionError: if consul answers with a non-200 status code
    """
    # Defaults are set up front so the return statement is always valid,
    # even when the request below times out.
    host_index = -1
    healthy_nodes = []
    try:
        # getting all registered nodes from consul
        r = requests.get(
            consul_url + '/v1/catalog/service/alert-on-metrics',
            timeout=60)
        _require(r.status_code == 200,
                 "Failed to get back a 200 from consul catalog")
        node_list = [elem.get('Node') for elem in json.loads(r.text)]
        # Retrieving healthy nodes
        for node in node_list:
            r2 = requests.get(
                consul_url + '/v1/health/node/' + node,
                timeout=60)
            # Check the health response itself (r2), not the catalog one.
            _require(r2.status_code == 200,
                     "Failed to get back a 200 from consul health")
            for check in json.loads(r2.text):
                if (check.get('CheckID') == 'check_healthcheck_alert-on-metrics_alert-on-metrics' and
                        check.get('Status') == 'passing'):
                    healthy_nodes.append(node)
        try:
            healthy_nodes.sort()
            host_index = healthy_nodes.index(hostname)
        except ValueError:
            logger.error("Host is not healthy")
    except (TimeoutError, requests.exceptions.Timeout):
        # requests raises its own Timeout type, which is not a subclass of
        # the builtin TimeoutError, so both are handled here.
        logger.error("Timed out connecting to Consul")
    return host_index, len(healthy_nodes)


def distribute_configs(filename, host_index, module, logger):
    """Uses md5 of alert config to split the files among healthy servers

    Args:
        filename (string): path of the alert config file
        host_index (int): index of this host among healthy nodes, -1 if none
        module (int): number of healthy nodes (the modulus)
        logger: logger used to report problems
    Returns:
        True when this host is responsible for ``filename``, False otherwise
    """
    if module == 0:
        logger.error("No healthy nodes for the service")
        return False
    if host_index == -1:
        logger.error("Host is unhealthy")
        return False
    return int(md5(filename), 16) % module == host_index


def is_valid(alert_config, logger):
    """Checks if alert has all required fields

    Args:
        alert_config (dict): parsed alert configuration
        logger: logger used to report why a config is invalid
    Returns:
        True when the config passes every check, False otherwise. Any
        exception raised while checking (missing key, wrong type, ...) is
        logged as a warning and treated as an invalid config.
    """
    try:
        _require(alert_config['alerts'],
                 "No Alerts configured, this is a dead config")
        _require(alert_config['query'],
                 "No Query, this is a dead config")
        _require(alert_config['interval'] >= 30,
                 "Intervals less than 30 are invalid")
        _require(alert_config['id'],
                 "Alert ID is empty, this is a dead config")
        if alert_config.get('query_type') == 'prometheus':
            _require(isinstance(alert_config['query'], str),
                     "Invalid Prometheus query")
        else:
            _require(isinstance(alert_config['query'], dict),
                     "Kairosdb Query string cannot be validated as proper JSON")
            # defined_tags is not consumed below, but building it raises
            # (and so rejects the config) when the query has no
            # metrics[0]['tags'] mapping, so the lookups are kept.
            first_metric = alert_config['query']['metrics'][0]
            defined_tags = set(first_metric['tags'].keys()).union(
                {'', 'dc', 'fqdn'})
            # IF THERE IS AGGREGATION WE HAVE TO ADD THESE TAGS
            if 'group_by' in first_metric:
                defined_tags.update(set(first_metric['group_by'][0]['tags']))
            # for undefined_tag in set(alert_config['tags']).difference(defined_tags):
            #     print("WARNING! {} tag is not defined on the query. Please make sure it does exist to "\
            #           "prevent empty results".format(undefined_tag))
        # OUR MINIMUM THRESHOLD NEED
        _require(
            'critical_lower_threshold' in alert_config or
            'critical_upper_threshold' in alert_config or
            'warning_lower_threshold' in alert_config or
            'warning_upper_threshold' in alert_config,
            "Config must have at least one threshold set.")
        # JUST MAKE SURE YOU ARE NOT DOING SOMETHING STUPID WITH WARNING COMING
        # AFTER CRITICAL
        if 'warning_lower_threshold' in alert_config and 'critical_lower_threshold' in alert_config:
            _require(
                alert_config['critical_lower_threshold'] < alert_config['warning_lower_threshold'],
                "Lower Critical must be less than Lower Warning")
        if 'warning_upper_threshold' in alert_config and 'critical_upper_threshold' in alert_config:
            _require(
                alert_config['critical_upper_threshold'] > alert_config['warning_upper_threshold'],
                "Upper Critical must be greater than Upper Warning")
        if 'lookup' in alert_config['alerts']:
            lookup = alert_config['alerts']['lookup']
            _require('default' in lookup,
                     'No default alert configured for the lookup configuration')
            _require('lookup_file' in lookup or 'lookups' in lookup,
                     'No lookup configured either in the alert configuration or in a separated file')
            _require('tags' in lookup,
                     'No tags configured for the lookup configuration')
            _require(all(isinstance(tag, str) for tag in lookup['tags']),
                     'Tags must be valid string')
        # if 'occurrences_threshold' in alert_config:
        #     assert alert_config['occurrences_threshold'] >= 1, \
        #         "Having an occurrences value less than 2 is assumed and pointless to specify"
    except Exception as e:
        logger.warning("Invalid config file: {}".format(str(e)))
        return False
    return True


def is_valid_alert_routing_lookup(alert_routing_lookup, alert, logger):
    """Check if routing lookup is properly configured

    Args:
        alert_routing_lookup (list): routing entries, each expected to carry
            an 'alert' and a 'tags' key
        alert (dict): alert configuration the lookup belongs to
        logger: logger used to report why a lookup is invalid
    Returns:
        True when every entry passes the checks, False when a check fails.
        Errors other than a failed check (e.g. a missing key in ``alert``)
        are not handled here and propagate to the caller.
    """
    try:
        _require(alert_routing_lookup,
                 "No lookup values configured, the configuration is empty.")
        for alert_routing in alert_routing_lookup:
            _require('alert' in alert_routing,
                     "No alert defined for this configuration.")
            _require('tags' in alert_routing,
                     "No tags value defined for this configuration.")
            for tag in alert_routing['tags']:
                _require(tag in alert['alerts']['lookup']['tags'],
                         "The tag {} is not part of the configuration".format(tag))
            _require(all(isinstance(tag, str) for tag in alert_routing['tags']),
                     "Tags must be valid string")
    except AssertionError as e:
        logger.warning("Invalid alert routing config file: {}".format(str(e)))
        return False
    return True


def _build_routing_lookup(alert, lookup_config_path, logger):
    """Build the ``{tag-values tuple: alert}`` routing map for ``alert``.

    Returns None when the lookup file is missing or the routing entries do
    not pass ``is_valid_alert_routing_lookup``.
    """
    lookup = alert['alerts']['lookup']
    if 'lookup_file' in lookup:
        lookup_path = "{}/{}".format(lookup_config_path, lookup['lookup_file'])
        if not os.path.isfile(lookup_path):
            logger.warning("Lookup file {} not found".format(lookup_path))
            return None
        alert_routing_lookup = _load_yaml(lookup_path)
    else:
        alert_routing_lookup = lookup['lookups']
    if not is_valid_alert_routing_lookup(alert_routing_lookup, alert, logger):
        return None
    # Key each routing entry by its tag values, in the order the alert
    # declares its lookup tags; tags an entry does not set map to None.
    return {
        tuple(entry['tags'].get(tag) for tag in lookup['tags']): entry['alert']
        for entry in alert_routing_lookup
    }


# noinspection PyBroadException
def glob_the_configs(
        config_path,
        lookup_config_path,
        consul_url,
        hostname,
        logger):
    """Load every valid alert config this host is responsible for

    Args:
        config_path (string): relative path to the configs
        lookup_config_path (string): directory holding the lookup files
            referenced by 'lookup_file'
        consul_url (string): url to consul service
        hostname (string): name of this host as registered in consul
        logger: logger used for progress and error reporting
    Returns:
        List of configs
    """
    invalid_configs = 0
    alert_list = []
    host_index, module = get_healthy_nodes_and_index(
        consul_url, hostname, logger)
    for config_file in glob.glob(config_path + "/**/*.yaml", recursive=True):
        logger.debug("Found {} config".format(config_file))
        # LOAD CONFIG
        if not distribute_configs(config_file, host_index, module, logger):
            continue
        try:
            alert = _load_yaml(config_file)
            if not is_valid(alert, logger):
                invalid_configs += 1
                continue
            if 'lookup' in alert['alerts']:
                routing = _build_routing_lookup(
                    alert, lookup_config_path, logger)
                if routing is None:
                    invalid_configs += 1
                    continue
                alert['alert_routing_lookup'] = routing
            alert_list.append(alert)
        except Exception as e:
            # One broken file must not stop the others from loading, but
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            logger.error("Error parsing {} config: {}".format(config_file, e))
    logger.info("Invalid configs: {}".format(invalid_configs))
    service.send_stat(
        'invalid_configs',
        invalid_configs,
        dict(),
        statprefix='aom')
    logger.info("Loaded {} configs".format(len(alert_list)))
    return alert_list