This commit is contained in:
bel
2021-09-12 22:16:11 -06:00
commit ceeb6f0385
129 changed files with 9221 additions and 0 deletions

View File

View File

@@ -0,0 +1,163 @@
# Contains the arg parser options.
"""Contains the arg parser options."""
import argparse
import sys
def get_builder_args():
    """
    Gets the arguments passed in to the aom_builder main call.

    Returns:
        dict: Mapping of argument names to their parsed values
        (via args_to_dict).
    """
    parser = argparse.ArgumentParser(
        description="Generates a valid yaml file "
                    "for alerting on metrics. If you are "
                    "familiar with the yaml structure for an "
                    "alert you don't have to use this builder,"
                    " it's just convenient")
    # Fixed typo: "Kariosdb" -> "Kairosdb" (matches spelling used elsewhere).
    parser.add_argument('-q', '--query',
                        help="The Kairosdb query string to use")
    parser.add_argument(
        '-i', '--interval', type=int, default=60,
        help="The interval that the check will run at. "
             "This value is in seconds")
    # Fixed help text: the original dropped the word "alert" and the
    # sentence break before "Use in conjunction".
    parser.add_argument(
        '-t', '--threshold', '--upperthreshold',
        help="The upper threshold is the value that when reached will "
             "cause an alert depending on the threshold logic. "
             "Use in conjunction with lower threshold to define a "
             "normal band.")
    parser.add_argument(
        '-b',
        '--lowerthreshold',
        help="The lower threshold is the value that when reached will cause an "
             "alert depending on the threshold logic. "
             "Use in conjunction with upper threshold to define a normal band.")
    parser.add_argument(
        '-m',
        '--measure',
        choices=['gt', 'lt', 'eq'],
        help="The measure to use to compare the "
             "threshold to the values of the alerts")
    parser.add_argument(
        '-a',
        '--alert_config',
        help='A valid Yaml representation of your alerting block')
    parser.add_argument(
        '-l',
        '--log_level',
        type=int,
        default=0,
        help="The log level for the aom_builder run. "
             "[0=Error, 1=Info, 2=Debug]")
    parser.add_argument(
        '-p',
        '--port',
        type=int,
        default=8080,
        help="The port to run the webapp on")
    return args_to_dict(parser)
def get_tester_service_args():
    """
    Gets arguments passed into aom_tester.py.

    Returns:
        dict: Mapping of argument names to their parsed values
        (via args_to_dict).
    """
    parser = argparse.ArgumentParser(
        description="Parameters to start the alerting on metrics dummy tester "
                    "service")
    # Fixed help text: missing space before "[0=Error..." rendered as
    # "app[0=Error" in --help output.
    parser.add_argument(
        '-l',
        '--log_level',
        type=int,
        default=0,
        help="The log level for the aom_service app "
             "[0=Error, 1=Info, 2=Debug]")
    parser.add_argument(
        '-a',
        '--alert_configs',
        default=None,
        help="If provided will override the folder location read from the "
             "config with the value passed in. Is helpful for testing and "
             "troubleshooting alerts")
    parser.add_argument(
        '--hostname',
        help="If provided, will override the actual hostname check with this "
             "value")
    parser.add_argument(
        '-p',
        '--port',
        type=int,
        default=8080,
        help="The port to run the webapp on")
    return args_to_dict(parser)
def get_service_args():
    """
    Gets arguments passed into aom_service.py.

    Returns:
        dict: Mapping of argument names to their parsed values
        (via args_to_dict).
    """
    parser = argparse.ArgumentParser(
        description="Parameters to start the alerting on metrics service")
    # Fixed help text: missing space before "[0=Error..." rendered as
    # "app[0=Error" in --help output.
    parser.add_argument(
        '-l',
        '--log_level',
        type=int,
        default=0,
        help="The log level for the aom_service app "
             "[0=Error, 1=Info, 2=Debug]")
    parser.add_argument(
        '-a',
        '--alert_configs',
        default=None,
        help="If provided will override the folder location read from the "
             "config with the value passed in. Is helpful for testing and "
             "troubleshooting alerts")
    parser.add_argument(
        '--alert_routing_lookup',
        default=None,
        help="If provided will override the folder used to fetch the alerts "
             "lookup configuration.")
    parser.add_argument(
        '-o',
        '--override',
        action='store_true',
        help="Overrides the check leader election value")
    parser.add_argument(
        '--hostname',
        help="If provided, will override the actual hostname check with this "
             "value")
    parser.add_argument(
        '-p',
        '--port',
        type=int,
        default=8080,
        help="The port to run the webapp on")
    return args_to_dict(parser)
def args_to_dict(parsed_args):
    """
    Converts the argument parser object to a dict.

    Args:
        parsed_args: Configured argparse.ArgumentParser instance.

    Returns:
        dict: Mapping of argument names to their parsed values.

    Exits:
        Prints help and exits with status 1 on argparse.ArgumentError.
        NOTE(review): parse_args() normally reports bad input via SystemExit
        rather than ArgumentError, so this handler may rarely fire — confirm
        whether SystemExit should be handled too.
    """
    try:
        namespace = parsed_args.parse_args()
        # vars() exposes the Namespace's attribute dict directly; copy it so
        # callers can mutate the result without touching the Namespace.
        return dict(vars(namespace))
    except argparse.ArgumentError:
        parsed_args.print_help()
        sys.exit(1)

View File

@@ -0,0 +1,277 @@
# config.py
"""Functions for loading alert configuration files"""
import glob
import os
import json
import hashlib
import yaml
import requests
import traceback
# import logging
# logger = logging.getLogger(__name__)
# Key under which an alert config lists the ids of other alerts it depends on.
DEPENDENCIES_KEY = 'dependencies'
class AlertWithDependencies:
    """Pairs an alert id with its (transitively resolved) dependency ids.

    Used by validateDependencies() to walk the dependency graph; the
    beenProcessed flag prevents revisiting a node.
    """

    def __init__(self, alertId, dependencies):
        """
        Args:
            alertId: Identifier of the alert.
            dependencies: Optional iterable of alert ids this alert depends
                on; None means no dependencies.
        """
        self.alertId = alertId
        self.beenProcessed = False
        self.dependencies = []
        self.addAllDependencies(dependencies)

    def addAllDependencies(self, moreDependencies):
        """Append the given dependency ids; a None argument is a no-op."""
        if moreDependencies is None:
            return
        self.dependencies.extend(moreDependencies)

    def getDependencies(self):
        """Return the (mutable) list of dependency ids."""
        return self.dependencies

    def getAlertId(self):
        """Return the alert identifier."""
        return self.alertId

    def hasBeenProcessed(self):
        """Return True once visit() has been called."""
        return self.beenProcessed

    def visit(self):
        """Mark this alert as processed during dependency resolution."""
        self.beenProcessed = True
def md5(fname):
    """Return the hex md5 digest of the file at *fname*.

    Reads in 4 KiB chunks so arbitrarily large files never load into
    memory at once.
    """
    digest = hashlib.md5()
    with open(fname, "rb") as handle:
        chunk = handle.read(4096)
        while chunk:
            digest.update(chunk)
            chunk = handle.read(4096)
    return digest.hexdigest()
def get_healthy_nodes_and_index(consul_url, hostname, logger):
    """Find AOM healthy nodes on consul.

    Args:
        consul_url: Base URL of the consul HTTP API.
        hostname: This host's node name; looked up in the healthy-node list.
        logger: Logger used for error reporting.

    Returns:
        tuple: (host_index, healthy_node_count). host_index is -1 when this
        host is not among the healthy nodes or consul could not be reached.
    """
    # Initialize before the try-block so the return statement is safe even
    # when the very first consul request times out (previously these names
    # could be unbound at return time).
    host_index = -1
    healthy_nodes = []
    try:
        # All registered nodes for the alert-on-metrics service.
        r = requests.get(
            consul_url + '/v1/catalog/service/alert-on-metrics',
            timeout=60)
        assert r.status_code == 200, "Failed to get back a 200 from consul catalog"
        node_list = [elem.get('Node') for elem in json.loads(r.text)]
        # Keep only the nodes whose service healthcheck is passing.
        for node in node_list:
            r2 = requests.get(
                consul_url + '/v1/health/node/' + node,
                timeout=60)
            # BUGFIX: check the health response (r2), not the catalog
            # response (r) that was already validated above.
            assert r2.status_code == 200, "Failed to get back a 200 from consul health"
            healthcheck_list = json.loads(r2.text)
            for check in healthcheck_list:
                if (check.get('CheckID') == 'check_healthcheck_alert-on-metrics_alert-on-metrics' and
                        check.get('Status') == 'passing'):
                    healthy_nodes.append(node)
        try:
            healthy_nodes.sort()
            host_index = healthy_nodes.index(hostname)
        except ValueError:
            logger.error("Host is not healthy")
    # BUGFIX: requests raises requests.exceptions.Timeout, which is NOT a
    # TimeoutError subclass, so the original handler never caught it.
    # Other request failures (e.g. connection errors) still propagate, as
    # they did before.
    except (TimeoutError, requests.exceptions.Timeout):
        logger.error("Timed out connecting to Consul")
    return host_index, len(healthy_nodes)
def distribute_configs(filename, host_index, module, logger):
    """Uses md5 of alert config to split the files among healthy servers.

    Args:
        filename: Path to the alert config file.
        host_index: This host's position in the sorted healthy-node list
            (-1 when the host is unhealthy).
        module: Number of healthy nodes (modulus for the hash split).
        logger: Logger used for error reporting.

    Returns:
        bool: True when this host owns the given config file.
    """
    if module == 0:
        logger.error("No healthy nodes for the service")
        return False
    if host_index == -1:
        logger.error("Host is unhealthy")
        return False
    # Stable assignment: the file's md5 digest (as an integer) modulo the
    # healthy-node count selects exactly one owning host per file.
    return int(md5(filename), 16) % module == host_index
def is_valid(alert_config, logger):
    """Checks if alert has all required fields.

    Args:
        alert_config: Parsed YAML alert configuration (dict).
        logger: Logger used to report why a config was rejected.

    Returns:
        bool: True when the config passes all structural checks.
    """
    # Validation is assert-based: any failed assert — or an unexpected
    # KeyError/TypeError from a malformed config — lands in the broad
    # except below and marks the config invalid.
    # NOTE(review): asserts are stripped under `python -O`, which would
    # silently disable all of this validation.
    try:
        assert alert_config['alerts'], "No Alerts configured, this is a dead config"
        assert alert_config['query'], "No Query, this is a dead config"
        #assert alert_config['interval'] >= 30, "Intervals less than 30 are invalid"
        assert alert_config['id'], "Alert ID is empty, this is a dead config"
        if DEPENDENCIES_KEY in alert_config:
            assert isinstance(alert_config[DEPENDENCIES_KEY], list), "Dependencies is specified but isn't a list"
        if alert_config.get('query_type') == 'prometheus':
            # Prometheus queries are plain PromQL strings.
            assert isinstance(
                alert_config['query'], str), "Invalid Prometheus query"
        else:
            # Default (Kairosdb) queries must already be parsed into a dict.
            assert isinstance(
                alert_config['query'], dict), "Kairosdb Query string cannot be validated as proper JSON"
            # Tags declared on the query itself, plus tags always present.
            defined_tags = set(alert_config['query']['metrics'][0]['tags'].keys()).union(
                {'', 'dc', 'fqdn'})
            # IF THERE IS AGGREGATION WE HAVE TO ADD THESE TAGS
            if 'group_by' in alert_config['query']['metrics'][0]:
                defined_tags.update(
                    set(alert_config['query']['metrics'][0]['group_by'][0]['tags']))
            # for undefined_tag in set(alert_config['tags']).difference(defined_tags):
            #     print("WARNING! {} tag is not defined on the query. Please make sure it does exist to "\
            #     "prevent empty results".format(undefined_tag))
        # OUR MINIMUM THRESHOLD NEED
        assert 'critical_lower_threshold' in alert_config or 'critical_upper_threshold' in alert_config or \
            'warning_lower_threshold' in alert_config or 'warning_upper_threshold' in alert_config, \
            "Config must have at least one threshold set."
        # JUST MAKE SURE YOU ARE NOT DOING SOMETHING STUPID WITH WARNING COMING
        # AFTER CRITICAL
        if 'warning_lower_threshold' in alert_config and 'critical_lower_threshold' in alert_config:
            assert alert_config['critical_lower_threshold'] < alert_config['warning_lower_threshold'], \
                "Lower Critical must be less than Lower Warning"
        if 'warning_upper_threshold' in alert_config and 'critical_upper_threshold' in alert_config:
            assert alert_config['critical_upper_threshold'] > alert_config['warning_upper_threshold'], \
                "Upper Critical must be greater than Upper Warning"
        # Lookup-based routing: needs a default alert, a source of lookups
        # (inline or file), and string tag names.
        if 'lookup' in alert_config['alerts']:
            assert 'default' in alert_config['alerts']['lookup'], 'No default alert configured for the lookup configuration'
            assert 'lookup_file' in alert_config['alerts']['lookup'] or 'lookups' in alert_config['alerts'][
                'lookup'], 'No lookup configured either in the alert configuration or in a separated file'
            assert 'tags' in alert_config['alerts']['lookup'], 'No tags configured for the lookup configuration'
            assert all(
                isinstance(
                    tag, str) for tag in alert_config['alerts']['lookup']['tags']), 'Tags must be valid string'
        # if 'occurrences_threshold' in alert_config:
        #     assert alert_config['occurrences_threshold'] >= 1, \
        #     "Having an occurrences value less than 2 is assumed and pointless to specify"
    except Exception as e:
        logger.warning("Invalid config file: {}".format(str(e)))
        return False
    return True
def is_valid_alert_routing_lookup(alert_routing_lookup, alert, logger):
    """Check if routing lookup is properly configured.

    Args:
        alert_routing_lookup: List of routing entries, each expected to carry
            an 'alert' value and a 'tags' mapping.
        alert: Alert config whose alerts.lookup.tags lists the allowed tag
            names.
        logger: Logger used to report validation failures.

    Returns:
        bool: True when every entry is well-formed, False otherwise.
    """
    try:
        assert alert_routing_lookup, "No lookup values configured, the configuration is empty."
        allowed = alert['alerts']['lookup']['tags']
        for entry in alert_routing_lookup:
            assert 'alert' in entry, "No alert defined for this configuration."
            assert 'tags' in entry, "No tags value defined for this configuration."
            # Every tag used by the entry must be declared on the alert.
            for tag in entry['tags']:
                assert tag in allowed, "The tag {} is not part of the configuration".format(
                    tag)
            assert all(isinstance(tag, str)
                       for tag in entry['tags']), "Tags must be valid string"
    except AssertionError as e:
        logger.warning("Invalid alert routing config file: {}".format(str(e)))
        return False
    return True
# noinspection PyBroadException
def glob_the_configs(
        config_path,
        lookup_config_path,
        consul_url,
        hostname,
        logger):
    """Load, validate and distribute the alert configs owned by this host.

    Args:
        config_path (string): relative path to the configs
        lookup_config_path (string): folder containing external lookup files
        consul_url (string): url to consul service
        hostname (string): this host's node name, for ownership selection
        logger: logger instance
    Returns:
        List of configs owned by this host that passed validation.
    """
    invalid_configs = 0
    alert_list = []
    # Where this host sits among the healthy nodes — used to shard the
    # config files across the cluster.
    host_index, module = get_healthy_nodes_and_index(
        consul_url, hostname, logger)
    alertToAlertWithDependencies = {}
    for config_file in glob.glob(config_path + "/**/*.yaml", recursive=True):
        logger.debug("Found {} config".format(config_file))
        # LOAD CONFIG
        # Only process files whose md5-based shard maps to this host.
        if distribute_configs(
                config_file,
                host_index,
                module,
                logger):
            try:
                alert = yaml.safe_load(open(config_file, 'rb').read())
                if is_valid(alert, logger):
                    if 'lookup' in alert['alerts']:
                        # Routing lookups come either from an external file
                        # or inline under alerts.lookup.lookups.
                        alert_routing_lookup = []
                        is_valid_lookup = True
                        if 'lookup_file' in alert['alerts']['lookup']:
                            lookup_path = "{}/{}".format(
                                lookup_config_path, alert['alerts']['lookup']['lookup_file'])
                            if os.path.isfile(lookup_path):
                                alert_routing_lookup = yaml.safe_load(
                                    open(lookup_path, 'rb').read())
                            else:
                                # Referenced lookup file is missing.
                                is_valid_lookup = False
                        else:
                            alert_routing_lookup = alert['alerts']['lookup']['lookups']
                        is_valid_lookup = is_valid_lookup and is_valid_alert_routing_lookup(
                            alert_routing_lookup, alert, logger)
                        if is_valid_lookup:
                            # Index the routing entries by their tag-value
                            # tuple (ordered as alerts.lookup.tags) for O(1)
                            # routing at alert time.
                            alerts_per_tags = {}
                            for alert_configuration in alert_routing_lookup:
                                key = []
                                for tag in alert['alerts']['lookup']['tags']:
                                    key.append(
                                        alert_configuration['tags'].get(tag))
                                alerts_per_tags[tuple(
                                    key)] = alert_configuration['alert']
                            alert['alert_routing_lookup'] = alerts_per_tags
                        else:
                            invalid_configs += 1
                            continue
                    alertWithDependencies = AlertWithDependencies(alert['id'], alert[DEPENDENCIES_KEY] if DEPENDENCIES_KEY in alert else None)
                    alertToAlertWithDependencies[alert['id']] = alertWithDependencies
                    alert['resolvedDependencies'] = alertWithDependencies
                    alert_list.append(alert)
                else:
                    invalid_configs += 1
            except BaseException:
                # Deliberately broad: a single bad file must not abort the
                # whole config load.
                logger.error("Error parsing {} config: {}".format(config_file, traceback.format_exc()))
    # validate the dependencies and flesh out the dependency graphs
    logger.debug("Iterating over dependencies")
    for alertId, alertWithDependencies in alertToAlertWithDependencies.items():
        validateDependencies(alertId, alertWithDependencies, alertToAlertWithDependencies, logger)
    logger.info("Invalid configs: {}".format(invalid_configs))
    # Imported here (not at module top) — presumably to avoid a circular
    # import with serviceapp; confirm before moving.
    from serviceapp import service
    service.send_stat(
        'invalid_configs',
        invalid_configs,
        dict(),
        statprefix='aom')
    logger.info("Loaded {} configs".format(len(alert_list)))
    return alert_list
def validateDependencies(alertId, alertWithDependencies, allAlerts, logger):
    """Recursively resolve the transitive dependencies of one alert.

    Mutates alertWithDependencies in place: each valid direct dependency's
    own (recursively resolved) dependencies are appended to this alert's
    list. The beenProcessed flag ensures each node is expanded at most once;
    already-processed nodes just return their current list.

    Args:
        alertId: Id of the alert being processed (for log messages).
        alertWithDependencies: AlertWithDependencies wrapper for alertId.
        allAlerts: Dict mapping alert id -> AlertWithDependencies.
        logger: Logger for reporting unknown dependency ids.

    Returns:
        The resolved dependency list, or None when the alert has no
        dependencies.

    NOTE(review): when two dependency chains share an ancestor, the shared
    ancestor's ids can be appended more than once (duplicates in the list);
    cycles terminate via beenProcessed but may yield partial lists — confirm
    both are acceptable to callers.
    """
    if len(alertWithDependencies.getDependencies()) > 0:
        if not alertWithDependencies.hasBeenProcessed():
            # Mark before recursing so cycles cannot loop forever.
            alertWithDependencies.visit()
            # Snapshot the list: the loop below appends to the same list via
            # addAllDependencies, so we must not iterate the live object.
            dependencies = list(alertWithDependencies.getDependencies())
            for dependentId in dependencies:
                if dependentId not in allAlerts:
                    logger.info("Invalid dependency of {}: {}".format(alertId, dependentId))
                else:
                    alertWithDependencies.addAllDependencies(validateDependencies(dependentId, allAlerts[dependentId], allAlerts, logger))
        logger.debug("returning alert {} with dependencies {}".format(alertId, alertWithDependencies.getDependencies()))
        return alertWithDependencies.getDependencies()
    else:
        return None

View File

@@ -0,0 +1,122 @@
# logger.py
""" Logging configuration """
import logging
import logging.handlers
import os
# Quiet chatty third-party libraries: only ERROR and above from these
# loggers will reach our handlers.
logging.getLogger('requests').setLevel(logging.ERROR)
logging.getLogger('urllib3').setLevel(logging.ERROR)
logging.getLogger('werkzeug').setLevel(logging.ERROR)
class SingleLevelFilter(logging.Filter):
    """Logging filter that matches records of exactly one level.

    With reject=False only records AT passlevel pass; with reject=True
    every record EXCEPT those at passlevel passes.
    """

    def __init__(self, passlevel, reject):
        """
        Args:
            passlevel (int): The numeric log level to match exactly.
            reject (bool): If True, reject records at passlevel; if False,
                accept only records at passlevel.
        """
        # BUGFIX: the base Filter's __init__ was never called, leaving its
        # name-matching state uninitialized.
        super().__init__()
        self.passlevel = passlevel
        self.reject = reject

    def filter(self, record):
        """Return True if *record* should be logged.

        Args:
            record (logging.LogRecord): The record under consideration.

        Returns:
            bool: Depends on self.reject — see __init__.
        """
        if self.reject:
            return record.levelno != self.passlevel
        return record.levelno == self.passlevel
class AlertLogging(logging.Logger):
    """
    Logger for the alert-on-metrics service.

    start() attaches a console handler at INFO; a file handler (WARNING)
    and a DEBUG-only console handler can be attached and detached at
    runtime via the start_*/stop_* pairs.
    """

    def __init__(self, name):
        """
        Inits the formatters and logger.

        Args:
            name: Logger name.
        """
        self.name = name
        # Handlers are created lazily by the start_* methods; initialize to
        # None so calling a stop_* method before its start_* counterpart is
        # a harmless no-op instead of an AttributeError.
        self.log_handler = None
        self.debug_handler = None
        self.log_path = None
        self.debug_formatter = logging.Formatter(
            "%(asctime)s - [%(levelname)s] - [%(module)s:%(lineno)d] - "
            "%(message)s", "%m-%d %H:%M:%S")
        self.standard_formatter = logging.Formatter(
            "%(asctime)s - [%(levelname)s] - %(message)s", "%m-%d %H:%M:%S")
        # Logger level is DEBUG; individual handlers narrow the output.
        # (Removed a stray no-op logging.getLogger() call here.)
        logging.Logger.__init__(self, name, logging.DEBUG)
        # Subsequently created loggers also use this class.
        logging.setLoggerClass(AlertLogging)

    def start(self):
        """Attach a console handler at INFO level.

        Returns:
            AlertLogging: self, allowing AlertLogging(name).start() chaining.
        """
        info_handler = logging.StreamHandler()
        info_handler.setLevel(logging.INFO)
        info_handler.setFormatter(self.standard_formatter)
        self.addHandler(info_handler)
        return self

    def start_log_file(self, file_path, mode='a'):
        """
        Creates a separate log file handler (WARNING level and above).

        Args:
            file_path: path to the log file; parent directories are created
                if missing.
            mode: the type of mode to open the file handler with
        """
        self.log_path = file_path
        work_folder = os.path.dirname(file_path)
        if work_folder and not os.path.exists(work_folder):
            os.makedirs(work_folder)
        self.log_handler = logging.FileHandler(file_path, mode)
        self.log_handler.setLevel(logging.WARNING)
        self.log_handler.setFormatter(self.debug_formatter)
        self.addHandler(self.log_handler)

    def stop_log_file(self):
        """Close the log file and detach its handler (no-op if not open)."""
        if self.log_handler is not None:
            self.log_handler.close()
            self.removeHandler(self.log_handler)
            self.log_handler = None

    def start_debug(self):
        """Attach a console handler that emits ONLY DEBUG-level records."""
        self.debug_handler = logging.StreamHandler()
        self.debug_handler.setLevel(logging.DEBUG)
        self.debug_handler.addFilter(SingleLevelFilter(logging.DEBUG, False))
        self.debug_handler.setFormatter(self.debug_formatter)
        self.addHandler(self.debug_handler)

    def stop_debug(self):
        """Detach the DEBUG console handler (no-op if not attached)."""
        if self.debug_handler is not None:
            self.removeHandler(self.debug_handler)
            self.debug_handler = None

View File

@@ -0,0 +1,83 @@
from datetime import datetime, timedelta
from urllib.parse import urljoin
import requests
class PromAPI:
    """Thin client for the Prometheus HTTP API (query, query_range, series)."""

    def __init__(self, endpoint='http://127.0.0.1:9090/'):
        """
        :param endpoint: base address of the Prometheus server
        """
        self.endpoint = endpoint

    @staticmethod
    def _to_timestamp(input_):
        """
        Convert *input_* to an ISO-8601 timestamp string for Prometheus
        (callers append the trailing 'Z').

        Accepts a datetime, the literal 'now', or a number / numeric string:
        positive = absolute UNIX time, 0 = now, negative = seconds before now.
        :param input_: datetime, 'now', number, or numeric string
        :return: ISO-8601 timestamp string, or None for unsupported input
        """
        if isinstance(input_, datetime):
            # BUGFIX: previously returned a float here, which crashed when
            # callers concatenated 'Z'; now formatted like the other
            # branches. Assumes a naive UTC datetime — confirm callers.
            return input_.isoformat('T')
        if input_ == 'now':
            return datetime.utcnow().isoformat('T')
        if isinstance(input_, str):
            input_ = float(input_)
        if isinstance(input_, (int, float)):
            if input_ > 0:
                # BUGFIX: absolute UNIX timestamps are rendered as ISO
                # strings too, instead of a bare number that broke the
                # `+ 'Z'` concatenation in callers.
                return datetime.utcfromtimestamp(input_).isoformat('T')
            if input_ == 0:  # return now
                return datetime.utcnow().isoformat('T')
            # Negative: relative offset from now.
            return (datetime.utcnow() + timedelta(seconds=input_)).isoformat('T')

    def query(self, query='prometheus_build_info'):
        """Run an instant query and return the decoded JSON response."""
        return self._get(
            uri='/api/v1/query',
            params=dict(
                query=query
            )
        )

    def query_range(self, query='prometheus_build_info', start=-60, end='now', duration=60):
        """Run a range query over [start, end] with the given step.

        :param query: PromQL expression
        :param start: anything _to_timestamp accepts; falsy skips the param
        :param end: anything _to_timestamp accepts; None skips the param
        :param duration: step size in seconds; falsy skips the param
        """
        params = {
            'query': query
        }
        if end is not None:
            params['end'] = self._to_timestamp(end) + 'Z'
        if start:
            params['start'] = self._to_timestamp(start) + 'Z'
        if duration:
            params['step'] = duration
        print(params)  # NOTE(review): development debug output left in place
        return self._get(
            uri='/api/v1/query_range',
            params=params
        )

    def series(self, match='prometheus_build_info', start=-86400, end='now'):
        """Fetch metadata for the series matching *match* in [start, end]."""
        params = {
            'match[]': match
        }
        if end is not None:
            params['end'] = self._to_timestamp(end) + 'Z'
        if start:
            params['start'] = self._to_timestamp(start) + 'Z'
        print(params)  # NOTE(review): development debug output left in place
        return self._get(
            uri='/api/v1/series',
            params=params
        )

    def _get(self, uri, params, method='GET'):
        """Issue the HTTP request and return the parsed JSON body."""
        url = urljoin(self.endpoint, uri)
        assert method == 'GET'  # only GET is implemented
        result = requests.get(
            url=url,
            params=params
        )
        return result.json()

View File

@@ -0,0 +1,47 @@
import unittest
import config
class TestAlertWithDependencies(unittest.TestCase):
    """Exercises config.AlertWithDependencies transitive dependency resolution."""

    def test_base(self):
        # Graph: A -> C -> D, B -> C, D has no dependencies.
        self.alertToAlertWithDependencies = {}
        self.alert_list = []
        for alert_id, depends in (("A", ["C"]), ("B", ["C"]),
                                  ("C", ["D"]), ("D", None)):
            self.make_alert(alert_id, depends)
        self.validate()
        # A and B each resolve to {C, D}; C resolves to {D}; D to nothing.
        for alert_id, expected in (("A", 2), ("B", 2), ("C", 1), ("D", 0)):
            self.checkDepLen(alert_id, expected)

    def make_alert(self, id, depends):
        """Register one alert dict plus its AlertWithDependencies wrapper."""
        alert = {
            'id': id,
            'dependencies': depends
        }
        wrapper = config.AlertWithDependencies(
            alert['id'],
            alert[config.DEPENDENCIES_KEY] if config.DEPENDENCIES_KEY in alert else None)
        self.alertToAlertWithDependencies[alert['id']] = wrapper
        alert['resolvedDependencies'] = wrapper
        self.alert_list.append(alert)

    def validate(self):
        """Resolve transitive dependencies for every registered alert."""
        for alert_id, wrapper in self.alertToAlertWithDependencies.items():
            config.validateDependencies(
                alert_id, wrapper, self.alertToAlertWithDependencies,
                MockLogger())

    def checkDepLen(self, id, n):
        """Assert the resolved dependency count for alert *id*."""
        wrapper = self.alertToAlertWithDependencies[id]
        self.assertEqual(len(wrapper.getDependencies()), n)
class MockLogger:
    """Silent stand-in for a logger: accepts and discards any log call."""

    def _ignore(self, *args, **kwargs):
        return None

    # Every level used by the code under test maps to the same no-op.
    info = _ignore
    debug = _ignore
    error = _ignore
if __name__ == "__main__" :
unittest.main()