QVolution2019.2/AoM_Service/library/config.py

227 lines
9.5 KiB
Python
Executable File

# config.py
"""Functions for loading alert configuration files"""
import glob
import os
import json
import hashlib
import yaml
import requests
from serviceapp import service
# import logging
# logger = logging.getLogger(__name__)
def md5(fname):
    """Return the hexadecimal MD5 digest of the file located at ``fname``.

    The file is opened in binary mode and consumed in 4096-byte blocks so
    the whole content never has to be held in memory at once.
    """
    digest = hashlib.md5()
    with open(fname, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:
                # An empty read means end of file.
                break
            digest.update(block)
    return digest.hexdigest()
def get_healthy_nodes_and_index(consul_url, hostname, logger):
    """Find the healthy AOM nodes registered in Consul.

    The Consul catalog is queried for every node registered under the
    ``alert-on-metrics`` service. Each node's health endpoint is then
    queried, and a node is kept when its
    ``check_healthcheck_alert-on-metrics_alert-on-metrics`` check reports
    ``passing``.

    Args:
        consul_url (str): base URL of the Consul HTTP API.
        hostname (str): name of this host, as registered in Consul.
        logger: logger used to report errors.

    Returns:
        tuple: ``(host_index, healthy_count)``. ``host_index`` is the
        position of ``hostname`` in the sorted list of healthy nodes, or
        ``-1`` when the host is not healthy or a request timed out.
        ``healthy_count`` is the number of healthy nodes collected.

    Raises:
        AssertionError: when Consul answers with a non-200 status code.
    """
    # Bound before the try block so the final return works even when a
    # request times out before any node has been inspected.
    host_index = -1
    healthy_nodes = []
    try:
        # Getting all registered nodes from consul
        r = requests.get(
            consul_url + '/v1/catalog/service/alert-on-metrics',
            timeout=60)
        # Raised explicitly (instead of ``assert``) so the check survives
        # ``python -O`` while callers still see the same exception type.
        if r.status_code != 200:
            raise AssertionError(
                "Failed to get back a 200 from consul catalog")
        node_list = [elem.get('Node') for elem in json.loads(r.text)]

        # Retrieving healthy nodes
        for node in node_list:
            r2 = requests.get(
                consul_url + '/v1/health/node/' + node,
                timeout=60)
            # Check the health response itself, not the catalog response.
            if r2.status_code != 200:
                raise AssertionError(
                    "Failed to get back a 200 from consul health")
            for check in json.loads(r2.text):
                if (check.get('CheckID') == 'check_healthcheck_alert-on-metrics_alert-on-metrics' and
                        check.get('Status') == 'passing'):
                    healthy_nodes.append(node)

        try:
            healthy_nodes.sort()
            host_index = healthy_nodes.index(hostname)
        except ValueError:
            logger.error("Host is not healthy")
    except (TimeoutError, requests.exceptions.Timeout):
        # requests signals timeouts with its own exception hierarchy; the
        # builtin TimeoutError alone never matches them.
        logger.error("Timed out connecting to Consul")
    return host_index, len(healthy_nodes)
def distribute_configs(filename, host_index, module, logger):
    """Decide whether this host is responsible for the given alert config.

    The md5 of the config file, read as a base-16 integer, is reduced
    modulo ``module`` (the number of healthy nodes); the file belongs to
    the host whose index equals that remainder.

    Returns:
        bool: True when the file is assigned to ``host_index``. False
        otherwise, including when there are no healthy nodes
        (``module == 0``) or the host is unhealthy (``host_index == -1``).
    """
    problem = None
    if module == 0:
        problem = "No healthy nodes for the service"
    elif host_index == -1:
        problem = "Host is unhealthy"
    if problem is not None:
        logger.error(problem)
        return False
    bucket = int(md5(filename), 16) % module
    return bucket == host_index
def is_valid(alert_config, logger):
    """Check that an alert config has all required fields.

    Rules enforced:
      * ``alerts``, ``query`` and ``id`` are present and non-empty;
      * ``interval`` is at least 30;
      * ``query`` is a string when ``query_type`` is ``prometheus``,
        otherwise a dict whose first metric exposes ``tags`` (and, when
        ``group_by`` is present, ``group_by[0]['tags']``);
      * at least one of the four threshold keys is set;
      * critical thresholds sit outside the matching warning thresholds;
      * a ``lookup`` section under ``alerts`` has ``default``, ``tags``
        (all strings) and either ``lookup_file`` or ``lookups``.

    Args:
        alert_config: parsed alert configuration (a dict is expected).
        logger: logger that receives a warning describing the failure.

    Returns:
        bool: True when the config is valid. Any failed rule, or any
        exception raised while inspecting the config (missing key, wrong
        type, ...), is logged as a warning and yields False.
    """
    def require(condition, message):
        # Explicit raise rather than ``assert``: assert statements are
        # removed under ``python -O``, which would disable validation.
        if not condition:
            raise ValueError(message)

    try:
        require(alert_config['alerts'],
                "No Alerts configured, this is a dead config")
        require(alert_config['query'], "No Query, this is a dead config")
        require(alert_config['interval'] >= 30,
                "Intervals less than 30 are invalid")
        require(alert_config['id'],
                "Alert ID is empty, this is a dead config")

        if alert_config.get('query_type') == 'prometheus':
            require(isinstance(alert_config['query'], str),
                    "Invalid Prometheus query")
        else:
            require(isinstance(alert_config['query'], dict),
                    "Kairosdb Query string cannot be validated as proper JSON")
            # ``defined_tags`` is not consumed further, but building it
            # rejects queries that lack the expected
            # metrics[0]['tags'] / group_by[0]['tags'] structure.
            metric = alert_config['query']['metrics'][0]
            defined_tags = set(metric['tags'].keys()).union(
                {'', 'dc', 'fqdn'})
            # If there is aggregation we have to add these tags
            if 'group_by' in metric:
                defined_tags.update(set(metric['group_by'][0]['tags']))

        # Our minimum threshold need
        threshold_keys = (
            'critical_lower_threshold',
            'critical_upper_threshold',
            'warning_lower_threshold',
            'warning_upper_threshold',
        )
        require(any(key in alert_config for key in threshold_keys),
                "Config must have at least one threshold set.")

        # Make sure warning thresholds do not come after critical ones
        if 'warning_lower_threshold' in alert_config and 'critical_lower_threshold' in alert_config:
            require(
                alert_config['critical_lower_threshold'] < alert_config['warning_lower_threshold'],
                "Lower Critical must be less than Lower Warning")
        if 'warning_upper_threshold' in alert_config and 'critical_upper_threshold' in alert_config:
            require(
                alert_config['critical_upper_threshold'] > alert_config['warning_upper_threshold'],
                "Upper Critical must be greater than Upper Warning")

        if 'lookup' in alert_config['alerts']:
            lookup = alert_config['alerts']['lookup']
            require('default' in lookup,
                    'No default alert configured for the lookup configuration')
            require('lookup_file' in lookup or 'lookups' in lookup,
                    'No lookup configured either in the alert configuration or in a separated file')
            require('tags' in lookup,
                    'No tags configured for the lookup configuration')
            require(all(isinstance(tag, str) for tag in lookup['tags']),
                    'Tags must be valid string')
    except Exception as e:
        logger.warning("Invalid config file: {}".format(str(e)))
        return False
    return True
def is_valid_alert_routing_lookup(alert_routing_lookup, alert, logger):
    """Check that a routing lookup is properly configured.

    Every entry of ``alert_routing_lookup`` must be a mapping with an
    ``alert`` key and a ``tags`` mapping. Each tag name must be a string
    and must be listed in ``alert['alerts']['lookup']['tags']``.

    Args:
        alert_routing_lookup: iterable of routing entries, taken either
            from the alert config itself or from a separate lookup file.
        alert (dict): the alert config the lookup belongs to.
        logger: logger that receives a warning describing the failure.

    Returns:
        bool: True when every entry is valid. The first failing rule is
        logged as a warning and yields False.
    """
    def reject(reason):
        # Explicit checks instead of ``assert`` so validation still runs
        # under ``python -O``.
        logger.warning("Invalid alert routing config file: {}".format(reason))
        return False

    if not alert_routing_lookup:
        return reject("No lookup values configured, the configuration is empty.")
    for alert_routing in alert_routing_lookup:
        if not isinstance(alert_routing, dict):
            return reject("Each lookup entry must be a mapping.")
        if 'alert' not in alert_routing:
            return reject("No alert defined for this configuration.")
        if 'tags' not in alert_routing:
            return reject("No tags value defined for this configuration.")
        routing_tags = alert_routing['tags']
        # The config loader calls ``.get(tag)`` on this value, so anything
        # other than a mapping would fail later on.
        if not isinstance(routing_tags, dict):
            return reject("Tags must be a mapping of tag name to value.")
        for tag in routing_tags:
            if tag not in alert['alerts']['lookup']['tags']:
                return reject(
                    "The tag {} is not part of the configuration".format(tag))
        if not all(isinstance(tag, str) for tag in routing_tags):
            return reject("Tags must be valid string")
    return True
# noinspection PyBroadException
def glob_the_configs(
        config_path,
        lookup_config_path,
        consul_url,
        hostname,
        logger):
    """Load every alert config assigned to this host.

    All ``*.yaml`` files under ``config_path`` (searched recursively) are
    considered. A file is loaded only when ``distribute_configs`` assigns
    it to this host. Configs that fail validation, or whose routing
    lookup is missing or invalid, are counted and reported through the
    ``invalid_configs`` stat. Files that raise while being read or parsed
    are logged as errors and skipped, but are not counted in that stat.

    Args:
        config_path (string): relative path to the configs
        lookup_config_path (string): directory holding the lookup files
            referenced by ``alerts.lookup.lookup_file``
        consul_url (string): url to consul service
        hostname (string): name of this host, used to find its index
            among the healthy nodes
        logger: logger used for progress and error reporting

    Returns:
        List of configs
    """
    invalid_configs = 0
    alert_list = []
    host_index, module = get_healthy_nodes_and_index(
        consul_url, hostname, logger)
    for config_file in glob.glob(config_path + "/**/*.yaml", recursive=True):
        logger.debug("Found {} config".format(config_file))
        if not distribute_configs(config_file, host_index, module, logger):
            continue
        # LOAD CONFIG
        try:
            alert = _load_yaml_file(config_file)
            if not is_valid(alert, logger):
                invalid_configs += 1
                continue
            if 'lookup' in alert['alerts']:
                alerts_per_tags = _build_alert_routing_lookup(
                    alert, lookup_config_path, logger)
                if alerts_per_tags is None:
                    invalid_configs += 1
                    continue
                alert['alert_routing_lookup'] = alerts_per_tags
            alert_list.append(alert)
        except Exception as e:
            # Exception rather than BaseException, so KeyboardInterrupt
            # and SystemExit are no longer swallowed while loading.
            logger.error("Error parsing {} config: {}".format(config_file, e))
    logger.info("Invalid configs: {}".format(invalid_configs))
    service.send_stat(
        'invalid_configs',
        invalid_configs,
        dict(),
        statprefix='aom')
    logger.info("Loaded {} configs".format(len(alert_list)))
    return alert_list


def _load_yaml_file(path):
    """Parse the YAML file at ``path``, closing the file handle afterwards."""
    with open(path, 'rb') as stream:
        return yaml.safe_load(stream.read())


def _build_alert_routing_lookup(alert, lookup_config_path, logger):
    """Build the tag-values -> alert mapping for a config with a lookup.

    The routing entries come from the file named by ``lookup_file``
    (resolved against ``lookup_config_path``) when that key is present,
    otherwise from the inline ``lookups`` list.

    Returns:
        dict keyed by a tuple of the entry's values for each configured
        lookup tag (``None`` for tags the entry does not set), or
        ``None`` when the lookup file is missing or the entries are
        invalid.
    """
    lookup = alert['alerts']['lookup']
    if 'lookup_file' in lookup:
        lookup_path = "{}/{}".format(lookup_config_path, lookup['lookup_file'])
        if not os.path.isfile(lookup_path):
            logger.warning("Lookup file {} not found".format(lookup_path))
            return None
        alert_routing_lookup = _load_yaml_file(lookup_path)
    else:
        alert_routing_lookup = lookup['lookups']

    if not is_valid_alert_routing_lookup(alert_routing_lookup, alert, logger):
        return None

    lookup_tags = lookup['tags']
    # Later entries override earlier ones that share the same tag values.
    return {
        tuple(entry['tags'].get(tag) for tag in lookup_tags): entry['alert']
        for entry in alert_routing_lookup
    }