278 lines
12 KiB
Python
Executable File
278 lines
12 KiB
Python
Executable File
# config.py
|
|
"""Functions for loading alert configuration files"""
|
|
import glob
|
|
import os
|
|
import json
|
|
import hashlib
|
|
import yaml
|
|
import requests
|
|
import traceback
|
|
|
|
# import logging
|
|
# logger = logging.getLogger(__name__)
|
|
|
|
# Key of an alert config under which the ids of the alerts it depends on are
# listed; is_valid() requires the value to be a list when the key is present.
DEPENDENCIES_KEY = 'dependencies'
|
|
|
|
class AlertWithDependencies:
    """An alert id together with the ids of the alerts it depends on.

    The ``beenProcessed`` flag starts out False and is switched on by
    ``visit()``; ``validateDependencies`` uses it to avoid expanding the same
    alert twice.
    """

    def __init__(self, alertId, dependencies):
        """Store the id and copy ``dependencies`` (``None`` means none)."""
        self.alertId = alertId
        # A private list, so the caller's list is never mutated.
        self.dependencies = []
        self.beenProcessed = False
        self.addAllDependencies(dependencies)

    def addAllDependencies(self, moreDependencies):
        """Append every id in ``moreDependencies``; ``None`` is ignored."""
        if moreDependencies is None:
            return
        self.dependencies.extend(moreDependencies)

    def getDependencies(self):
        """Return the (live) list of dependency ids."""
        return self.dependencies

    def getAlertId(self):
        """Return the id this object was created with."""
        return self.alertId

    def hasBeenProcessed(self):
        """Return True once ``visit()`` has been called."""
        return self.beenProcessed

    def visit(self):
        """Mark this alert as processed."""
        self.beenProcessed = True
|
|
|
|
def md5(fname):
    """Return the hexadecimal MD5 digest of the file located at ``fname``."""
    digest = hashlib.md5()
    with open(fname, "rb") as stream:
        # Read in 4 KiB blocks so large files are never held in memory whole.
        while True:
            block = stream.read(4096)
            if block == b"":
                break
            digest.update(block)
    return digest.hexdigest()
|
|
|
|
|
|
def get_healthy_nodes_and_index(consul_url, hostname, logger):
    """Find the healthy alert-on-metrics nodes registered in Consul.

    The service catalog is queried for every node registered under
    ``alert-on-metrics``; each node's health checks are then fetched and the
    node is kept when its ``check_healthcheck_alert-on-metrics_alert-on-metrics``
    check is ``passing``.

    Args:
        consul_url (string): base url of the Consul HTTP API
        hostname (string): node name of the current host
        logger: logger used to report problems

    Returns:
        tuple ``(host_index, healthy_count)``: the position of ``hostname`` in
        the sorted list of healthy nodes (``-1`` when it is not in that list
        or a request timed out) and the number of healthy nodes collected.

    Raises:
        AssertionError: when Consul answers with a status other than 200.
    """
    # Bound before the try block so the final return is always valid, even
    # when the very first request times out.
    host_index = -1
    healthy_nodes = []
    try:
        # getting all registered nodes from consul
        r = requests.get(
            consul_url + '/v1/catalog/service/alert-on-metrics',
            timeout=60)
        # Explicit raise (not `assert`) so the check survives `python -O`.
        if r.status_code != 200:
            raise AssertionError("Failed to get back a 200 from consul catalog")

        node_list = [elem.get('Node') for elem in json.loads(r.text)]

        # Retrieving healthy nodes
        for node in node_list:
            r2 = requests.get(
                consul_url + '/v1/health/node/' + node,
                timeout=60)
            # Check the health response itself (r2), not the catalog response.
            if r2.status_code != 200:
                raise AssertionError("Failed to get back a 200 from consul health")
            for check in json.loads(r2.text):
                if (check.get('CheckID') == 'check_healthcheck_alert-on-metrics_alert-on-metrics' and
                        check.get('Status') == 'passing'):
                    healthy_nodes.append(node)

        try:
            # Sorted so every host derives the same index for a given node.
            healthy_nodes.sort()
            host_index = healthy_nodes.index(hostname)
        except ValueError:
            logger.error("Host is not healthy")
    except (TimeoutError, requests.exceptions.Timeout):
        # requests signals timeouts with its own exception type, which is not
        # a subclass of the builtin TimeoutError.
        logger.error("Timed out connecting to Consul")
    return host_index, len(healthy_nodes)
|
|
|
|
|
|
def distribute_configs(
        filename,
        host_index,
        module,
        logger):
    """Decide whether this host is responsible for the config ``filename``.

    The MD5 digest of the file, read as an integer, is reduced modulo
    ``module`` (the number of healthy nodes); the file belongs to the host
    whose ``host_index`` equals that remainder.

    Returns:
        True when the file is assigned to this host. False otherwise, and
        also (after logging an error) when ``module`` is 0 or ``host_index``
        is -1.
    """
    if module == 0:
        problem = "No healthy nodes for the service"
    elif host_index == -1:
        problem = "Host is unhealthy"
    else:
        problem = None

    if problem is not None:
        logger.error(problem)
        return False

    digest_as_int = int(md5(filename), 16)
    return digest_as_int % module == host_index
|
|
|
|
|
|
def _require(condition, message):
    """Raise ``AssertionError(message)`` when ``condition`` is falsy.

    Used instead of the ``assert`` statement so the validation is not
    stripped when Python runs with ``-O``.
    """
    if not condition:
        raise AssertionError(message)


def is_valid(alert_config, logger):
    """Check that an alert config has all required fields.

    Args:
        alert_config: the parsed alert configuration (a mapping)
        logger: logger that receives a warning describing the first problem

    Returns:
        True when every check passes. False when a check fails or when any
        exception (for instance a missing key) is raised while checking; the
        reason is logged as a warning.
    """
    try:
        _require(alert_config['alerts'], "No Alerts configured, this is a dead config")
        _require(alert_config['query'], "No Query, this is a dead config")
        # assert alert_config['interval'] >= 30, "Intervals less than 30 are invalid"
        _require(alert_config['id'], "Alert ID is empty, this is a dead config")
        if DEPENDENCIES_KEY in alert_config:
            _require(isinstance(alert_config[DEPENDENCIES_KEY], list),
                     "Dependencies is specified but isn't a list")
        if alert_config.get('query_type') == 'prometheus':
            _require(isinstance(alert_config['query'], str), "Invalid Prometheus query")
        else:
            _require(isinstance(alert_config['query'], dict),
                     "Kairosdb Query string cannot be validated as proper JSON")
            # The tag set is not used further (the warning below is disabled),
            # but building it rejects queries without metrics[0]['tags'].
            first_metric = alert_config['query']['metrics'][0]
            defined_tags = set(first_metric['tags'].keys()).union(
                {'', 'dc', 'fqdn'})
            # IF THERE IS AGGREGATION WE HAVE TO ADD THESE TAGS
            if 'group_by' in first_metric:
                defined_tags.update(set(first_metric['group_by'][0]['tags']))
            # for undefined_tag in set(alert_config['tags']).difference(defined_tags):
            #     print("WARNING! {} tag is not defined on the query. Please make sure it does exist to "\
            #           "prevent empty results".format(undefined_tag))

        # OUR MINIMUM THRESHOLD NEED
        threshold_keys = (
            'critical_lower_threshold',
            'critical_upper_threshold',
            'warning_lower_threshold',
            'warning_upper_threshold',
        )
        _require(any(key in alert_config for key in threshold_keys),
                 "Config must have at least one threshold set.")

        # JUST MAKE SURE YOU ARE NOT DOING SOMETHING STUPID WITH WARNING COMING
        # AFTER CRITICAL
        if 'warning_lower_threshold' in alert_config and 'critical_lower_threshold' in alert_config:
            _require(alert_config['critical_lower_threshold'] < alert_config['warning_lower_threshold'],
                     "Lower Critical must be less than Lower Warning")
        if 'warning_upper_threshold' in alert_config and 'critical_upper_threshold' in alert_config:
            _require(alert_config['critical_upper_threshold'] > alert_config['warning_upper_threshold'],
                     "Upper Critical must be greater than Upper Warning")

        if 'lookup' in alert_config['alerts']:
            lookup = alert_config['alerts']['lookup']
            _require('default' in lookup,
                     'No default alert configured for the lookup configuration')
            _require('lookup_file' in lookup or 'lookups' in lookup,
                     'No lookup configured either in the alert configuration or in a separated file')
            _require('tags' in lookup, 'No tags configured for the lookup configuration')
            _require(all(isinstance(tag, str) for tag in lookup['tags']),
                     'Tags must be valid string')

        # if 'occurrences_threshold' in alert_config:
        #     assert alert_config['occurrences_threshold'] >= 1, \
        #         "Having an occurrences value less than 2 is assumed and pointless to specify"
    except Exception as e:
        logger.warning("Invalid config file: {}".format(str(e)))
        return False
    return True
|
|
|
|
|
|
def is_valid_alert_routing_lookup(alert_routing_lookup, alert, logger):
    """Check that a routing lookup is properly configured.

    Every entry of ``alert_routing_lookup`` must contain an ``alert`` and a
    ``tags`` member, and every tag it names must be a string listed in
    ``alert['alerts']['lookup']['tags']``.

    Args:
        alert_routing_lookup: iterable of routing entries (must be non-empty)
        alert: the alert config the lookup belongs to
        logger: logger that receives a warning describing the first problem

    Returns:
        True when all entries pass, False (after logging a warning) otherwise.
        Only failed checks are handled here; any other exception raised by
        malformed input propagates to the caller.
    """
    # Checks raise AssertionError explicitly rather than through `assert`, so
    # they are still enforced when Python runs with -O.
    try:
        if not alert_routing_lookup:
            raise AssertionError("No lookup values configured, the configuration is empty.")
        for alert_routing in alert_routing_lookup:
            if 'alert' not in alert_routing:
                raise AssertionError("No alert defined for this configuration.")
            if 'tags' not in alert_routing:
                raise AssertionError("No tags value defined for this configuration.")
            for tag in alert_routing['tags']:
                if tag not in alert['alerts']['lookup']['tags']:
                    raise AssertionError(
                        "The tag {} is not part of the configuration".format(tag))
            if not all(isinstance(tag, str) for tag in alert_routing['tags']):
                raise AssertionError("Tags must be valid string")
    except AssertionError as e:
        logger.warning("Invalid alert routing config file: {}".format(str(e)))
        return False
    return True
|
|
|
|
|
|
# noinspection PyBroadException
def glob_the_configs(
        config_path,
        lookup_config_path,
        consul_url,
        hostname,
        logger):
    """Load the alert configs this host is responsible for.

    Every ``*.yaml`` file below ``config_path`` is considered; a file is
    loaded only when ``distribute_configs`` assigns it to this host. Valid
    alerts get an ``alert_routing_lookup`` entry (when they define a lookup)
    and a ``resolvedDependencies`` entry, and the number of invalid configs
    is reported through ``service.send_stat``.

    Args:
        config_path (string): relative path to the configs
        lookup_config_path (string): directory containing the lookup files
            referenced by ``alerts.lookup.lookup_file``
        consul_url (string): url to consul service
        hostname (string): node name of the current host
        logger: logger used for progress and error messages

    Returns:
        List of configs
    """
    invalid_configs = 0
    alert_list = []
    host_index, module = get_healthy_nodes_and_index(
        consul_url, hostname, logger)
    alertToAlertWithDependencies = {}
    for config_file in glob.glob(config_path + "/**/*.yaml", recursive=True):
        logger.debug("Found {} config".format(config_file))
        # LOAD CONFIG
        if not distribute_configs(config_file, host_index, module, logger):
            continue
        try:
            # `with` closes the handle even when parsing fails.
            with open(config_file, 'rb') as config_handle:
                alert = yaml.safe_load(config_handle.read())
            if not is_valid(alert, logger):
                invalid_configs += 1
                continue

            if 'lookup' in alert['alerts']:
                lookup = alert['alerts']['lookup']
                alert_routing_lookup = []
                is_valid_lookup = True
                if 'lookup_file' in lookup:
                    lookup_path = "{}/{}".format(
                        lookup_config_path, lookup['lookup_file'])
                    if os.path.isfile(lookup_path):
                        with open(lookup_path, 'rb') as lookup_handle:
                            alert_routing_lookup = yaml.safe_load(
                                lookup_handle.read())
                    else:
                        logger.warning(
                            "Lookup file {} not found".format(lookup_path))
                        is_valid_lookup = False
                else:
                    alert_routing_lookup = lookup['lookups']

                is_valid_lookup = is_valid_lookup and is_valid_alert_routing_lookup(
                    alert_routing_lookup, alert, logger)

                if not is_valid_lookup:
                    invalid_configs += 1
                    continue

                # Index the routed alerts by the tuple of their tag values,
                # taken in the order the lookup declares its tags.
                alerts_per_tags = {}
                for alert_configuration in alert_routing_lookup:
                    key = tuple(
                        alert_configuration['tags'].get(tag)
                        for tag in lookup['tags'])
                    alerts_per_tags[key] = alert_configuration['alert']
                alert['alert_routing_lookup'] = alerts_per_tags

            alertWithDependencies = AlertWithDependencies(
                alert['id'], alert.get(DEPENDENCIES_KEY))
            alertToAlertWithDependencies[alert['id']] = alertWithDependencies
            alert['resolvedDependencies'] = alertWithDependencies
            alert_list.append(alert)
        except Exception:
            # One broken file must not stop the others from loading, but
            # KeyboardInterrupt / SystemExit are allowed to propagate.
            logger.error("Error parsing {} config: {}".format(config_file, traceback.format_exc()))

    # validate the dependencies and flesh out the dependency graphs
    logger.debug("Iterating over dependencies")
    for alertId, alertWithDependencies in alertToAlertWithDependencies.items():
        validateDependencies(alertId, alertWithDependencies, alertToAlertWithDependencies, logger)

    logger.info("Invalid configs: {}".format(invalid_configs))
    # Imported here rather than at module level, as in the original code.
    from serviceapp import service
    service.send_stat(
        'invalid_configs',
        invalid_configs,
        dict(),
        statprefix='aom')
    logger.info("Loaded {} configs".format(len(alert_list)))
    return alert_list
|
|
|
|
def validateDependencies(alertId, alertWithDependencies, allAlerts, logger):
    """Expand an alert's dependency list with its dependencies' dependencies.

    Args:
        alertId: id of the alert being expanded (used in log messages)
        alertWithDependencies: dependency holder for that alert
        allAlerts: mapping of alert id to dependency holder
        logger: logger for debug/info output

    Returns:
        None when the alert has no dependencies, otherwise its dependency
        list. An alert already marked as processed is not expanded again;
        its current list is returned as is. Ids missing from ``allAlerts``
        are logged and left in the list.
    """
    if len(alertWithDependencies.getDependencies()) == 0:
        return None

    if not alertWithDependencies.hasBeenProcessed():
        # Mark first so that recursive calls reaching this alert stop here.
        alertWithDependencies.visit()
        # Iterate over a snapshot: the live list grows inside the loop.
        direct_ids = tuple(alertWithDependencies.getDependencies())
        for dependentId in direct_ids:
            if dependentId in allAlerts:
                inherited = validateDependencies(
                    dependentId, allAlerts[dependentId], allAlerts, logger)
                alertWithDependencies.addAllDependencies(inherited)
            else:
                logger.info("Invalid dependency of {}: {}".format(alertId, dependentId))

    logger.debug("returning alert {} with dependencies {}".format(alertId, alertWithDependencies.getDependencies()))
    return alertWithDependencies.getDependencies()
|