This commit is contained in:
bel
2021-09-12 22:16:11 -06:00
commit ceeb6f0385
129 changed files with 9221 additions and 0 deletions

View File

View File

@@ -0,0 +1,66 @@
class Alert_Config():
    """Wraps a single alert's parsed YAML configuration together with the
    mutable runtime state (per-tag occurrence counters and last alert level)
    that the alerting loop tracks between evaluations.
    """

    def __init__(self, yaml_config):
        """
        Args:
            yaml_config (dict): parsed alert YAML; must contain an 'id' key.
        """
        # Guarantee the alert_tags mapping exists so downstream code never
        # has to guard for its absence.
        if 'alert_tags' not in yaml_config:
            yaml_config['alert_tags'] = {}
        self.id = str(yaml_config['id'])
        self.yaml_config = yaml_config
        # Per-tag occurrence counters (see get_for_tags/set_for_tags).
        # BUG FIX: originally stored as self.tags, which shadowed the
        # tags() accessor below and made it uncallable on instances.
        self.tag_state = {}
        # Last alert level seen per tag key (see get_level/set_level).
        self.state = {}

    def type(self):
        """Return the query backend type; defaults to 'kairos'."""
        if 'type' in self.yaml_config:
            return self.yaml_config['type']
        return 'kairos'

    def tags(self):
        """Return the configured tag name list (empty list when unset)."""
        if 'tags' in self.yaml_config:
            return self.yaml_config['tags']
        return []

    def occurrences(self):
        """Return the occurrences threshold; defaults to 1."""
        if 'occurrences_threshold' in self.yaml_config:
            return self.yaml_config['occurrences_threshold']
        return 1

    def url(self):
        """Return the alert's dashboard URL, falling back to the Grafana
        base URL from the AOM_GRAFANA_URL environment variable plus the id."""
        if 'url' in self.yaml_config:
            return self.yaml_config['url']
        from os import environ
        return environ['AOM_GRAFANA_URL'] + self.id

    def get_level(self, key):
        """Return the last recorded level for *key* (None when unseen)."""
        if key not in self.state:
            self.state[key] = None
        return self.state[key]

    def set_level(self, key, value):
        """Record *value* as the current level for *key*."""
        self.state[key] = value

    def get_for_tags(self, key):
        """Return the occurrence counter for *key*, initialising it to 0."""
        if key not in self.tag_state:
            self.tag_state[key] = 0
        return self.tag_state[key]

    def set_for_tags(self, key, value):
        """Set the occurrence counter for *key* to *value*."""
        if key not in self.tag_state:
            self.tag_state[key] = 0
        self.tag_state[key] = value

    def init_for_tags(self, key):
        """Ensure counters exist for *key* and *key*_count, and reset the
        no-result counter for *key* to 0."""
        # BUG FIX: the original tested/initialised `key` instead of the loop
        # variable `k`, so the *_count counter was never created here.
        for k in [key, key + "_count"]:
            if k not in self.tag_state:
                self.set_for_tags(k, 0)
        self.set_for_tags(key + "_noresult", 0)

    def get_threshold(self, isUpper, isWarning):
        """Return (threshold-or-None, configured?) for the requested band.

        BUG FIX: the original definition was missing the `self` parameter,
        which made every instance call raise TypeError.
        """
        key = {
            (True, True): 'warning_upper_threshold',
            (True, False): 'critical_upper_threshold',
            (False, True): 'warning_lower_threshold',
            (False, False): 'critical_lower_threshold',
        }[(bool(isUpper), bool(isWarning))]
        return self.try_get_yaml_config(key)

    def try_get_yaml_config(self, key):
        """Return a (value-or-None, key-present) tuple for *key*."""
        return self.yaml_config[key] if key in self.yaml_config else None, key in self.yaml_config

View File

@@ -0,0 +1,36 @@
from alert_config import Alert_Config
class Alert_Config_List():
    """Dictionary-like collection of Alert_Config objects keyed by alert id,
    with a compare() helper used to diff two generations of loaded configs.
    """

    def __init__(self, alert_configs=None):
        """Optionally seed the collection from a config, a list of configs,
        or another Alert_Config_List."""
        self.hash = {}
        if alert_configs:
            self.add(alert_configs)

    def __getitem__(self, k):
        return self.hash[k]

    def __len__(self):
        return len(self.hash)

    def add(self, alert_config):
        """Add a single Alert_Config, each item of a list, or merge another
        Alert_Config_List.  Raises for any other type."""
        if isinstance(alert_config, Alert_Config):
            self.hash[alert_config.id] = alert_config
        elif isinstance(alert_config, list):
            for a in alert_config:
                self.add(a)
        elif isinstance(alert_config, Alert_Config_List):
            for k in alert_config.hash:
                self.add(alert_config.hash[k])
        else:
            raise Exception("unexpected type added to Alert_Config_List")

    def compare(self, other):
        """Diff this collection against *other* (None/falsy means empty).

        Returns:
            (added, removed, modified): sets of alert ids present only in
            *other*, only in self, or in both but with different YAML.
        """
        if not other:
            other = Alert_Config_List()
        self_keys = self.hash.keys()
        other_keys = other.hash.keys()
        added = other_keys - self_keys
        removed = self_keys - other_keys
        intersection = self_keys & other_keys
        # BUG FIX: Alert_Config defines no __eq__, so the original
        # `self[i] != other[i]` compared object identity and flagged every
        # shared id as modified on each reload (needlessly restarting all
        # jobs).  Compare the underlying YAML documents instead.
        modified = [i for i in intersection
                    if self[i].yaml_config != other[i].yaml_config]
        return set(added), set(removed), set(modified)

163
AoM_Service/library/args.py Executable file
View File

@@ -0,0 +1,163 @@
# Contains the arg parser options.
"""Contains the arg parser options."""
import argparse
import sys
def get_builder_args():
    """
    Gets the arguments passed in to the aom_builder main call
    :return: dict of parsed arguments
    """
    parser = argparse.ArgumentParser(
        description="Generates a valid yaml file "
                    "for alerting on metrics. If you are "
                    "familiar with the yaml structure for an "
                    "alert you don't have to use this builder,"
                    " it's just convenient")
    # FIX: "Kariosdb" -> "Kairosdb".
    parser.add_argument('-q', '--query',
                        help="The Kairosdb query string to use")
    # FIX: restored the words dropped from the original help text
    # ("the check will This value is in seconds").
    parser.add_argument('-i', '--interval', type=int, default=60,
                        help="The interval that the check will run at. "
                             "This value is in seconds")
    # FIX: "cause an depending on" -> "cause an alert depending on".
    parser.add_argument('-t', '--threshold', '--upperthreshold',
                        help="The upper threshold is the value that when "
                             "reached will cause an alert depending on the "
                             "threshold logic. Use in conjunction with lower "
                             "threshold to define a normal band.")
    parser.add_argument('-b', '--lowerthreshold',
                        help="The lower threshold is the value that when "
                             "reached will cause an alert depending on the "
                             "threshold logic. Use in conjunction with upper "
                             "threshold to define a normal band.")
    parser.add_argument('-m', '--measure', choices=['gt', 'lt', 'eq'],
                        help="The measure to use to compare the "
                             "threshold to the values of the alerts")
    parser.add_argument('-a', '--alert_config',
                        help='A valid Yaml representation of your alerting '
                             'block')
    parser.add_argument('-l', '--log_level', type=int, default=0,
                        help="The log level for the aom_builder run. "
                             "[0=Error, 1=Info, 2=Debug]")
    parser.add_argument('-p', '--port', type=int, default=8080,
                        help="The port to run the webapp on")
    return args_to_dict(parser)
def get_tester_service_args():
    """
    Gets arguments passed into aom_tester.py
    Returns: dict of parsed arguments
    """
    parser = argparse.ArgumentParser(
        description="Parameters to start the alerting on metrics dummy tester "
                    "service")
    parser.add_argument('-l', '--log_level', type=int, default=0,
                        help="The log level for the aom_service app"
                             "[0=Error, 1=Info, 2=Debug]")
    parser.add_argument('-a', '--alert_configs', default=None,
                        help="If provided will override the folder location read from the "
                             "config with the value passed in. Is helpful for testing and "
                             "troubleshooting alerts")
    parser.add_argument('--hostname',
                        help="If provided, will override the actual hostname check with this "
                             "value")
    parser.add_argument('-p', '--port', type=int, default=8080,
                        help="The port to run the webapp on")
    return args_to_dict(parser)
def get_service_args():
    """
    Gets arguments passed into aom_service.py
    Returns: dict of parsed arguments
    """
    parser = argparse.ArgumentParser(
        description="Parameters to start the alerting on metrics service")
    parser.add_argument('-l', '--log_level', type=int, default=0,
                        help="The log level for the aom_service app"
                             "[0=Error, 1=Info, 2=Debug]")
    parser.add_argument('-a', '--alert_configs', default=None,
                        help="If provided will override the folder location read from the "
                             "config with the value passed in. Is helpful for testing and "
                             "troubleshooting alerts")
    parser.add_argument('--alert_routing_lookup', default=None,
                        help="If provided will override the folder used to fetch the alerts "
                             "lookup configuration.")
    parser.add_argument('-o', '--override', action='store_true',
                        help="Overrides the check leader election value")
    parser.add_argument('--hostname',
                        help="If provided, will override the actual hostname check with this "
                             "value")
    parser.add_argument('-p', '--port', type=int, default=8080,
                        help="The port to run the webapp on")
    return args_to_dict(parser)
def args_to_dict(parsed_args):
    """
    Converts the argument parser object to a dict
    Args:
        parsed_args: a configured argparse.ArgumentParser
    Returns:
        Dictionary of parsed argument name -> value
    """
    try:
        # vars() already exposes the namespace's attribute dict; copy it so
        # callers can mutate the result without touching the namespace.
        return dict(vars(parsed_args.parse_args()))
    except argparse.ArgumentError:
        # NOTE(review): argparse normally reports bad command lines by
        # raising SystemExit, not ArgumentError, so this branch is rarely
        # reached — confirm the intended error handling.
        parsed_args.print_help()
        sys.exit(1)

226
AoM_Service/library/config.py Executable file
View File

@@ -0,0 +1,226 @@
# config.py
"""Functions for loading alert configuration files"""
import glob
import os
import json
import hashlib
import yaml
import requests
from serviceapp import service
# import logging
# logger = logging.getLogger(__name__)
def md5(fname):
    """Return the hex md5 digest of the file at *fname*, read in 4 KiB chunks."""
    digest = hashlib.md5()
    with open(fname, "rb") as handle:
        while True:
            chunk = handle.read(4096)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()
def get_healthy_nodes_and_index(consul_url, hostname, logger):
    """Find AOM healthy nodes on consul.

    Queries the Consul catalog for all alert-on-metrics nodes, keeps those
    whose AOM health check is passing, and locates *hostname* in the sorted
    healthy list.

    Args:
        consul_url (string): base URL of the Consul agent.
        hostname (string): this host's name.
        logger: logger for error reporting.
    Returns:
        (host_index, healthy_count): index of this host among the sorted
        healthy nodes (-1 when unhealthy or on error) and how many healthy
        nodes exist.
    """
    # BUG FIX: initialise before the try block — previously a timeout on the
    # first request left both names undefined and the return raised NameError.
    host_index = -1
    healthy_nodes = []
    try:
        # getting all registered nodes from consul
        r = requests.get(
            consul_url +
            '/v1/catalog/service/alert-on-metrics',
            timeout=60)
        assert r.status_code == 200, "Failed to get back a 200 from consul catalog"
        value = json.loads(r.text)
        node_list = [elem.get('Node') for elem in value]
        # Retrieving healthy nodes
        for node in node_list:
            r2 = requests.get(
                consul_url +
                '/v1/health/node/' +
                node,
                timeout=60)
            # BUG FIX: the original asserted on r (the catalog response)
            # instead of r2 here.
            assert r2.status_code == 200, "Failed to get back a 200 from consul health"
            healthcheck_list = json.loads(r2.text)
            for check in healthcheck_list:
                if (check.get('CheckID') == 'check_healthcheck_alert-on-metrics_alert-on-metrics' and
                        check.get('Status') == 'passing'):
                    healthy_nodes.append(node)
        try:
            healthy_nodes.sort()
            host_index = healthy_nodes.index(hostname)
        except ValueError:
            logger.error("Host is not healthy")
    except TimeoutError:
        # NOTE(review): requests raises requests.exceptions.Timeout rather
        # than the builtin TimeoutError — confirm which exception is expected
        # here.
        logger.error("Timed out connecting to Consul")
    return host_index, len(healthy_nodes)
def distribute_configs(filename, host_index, module, logger):
    """Uses md5 of alert config to split the files among healthy servers.

    Returns True only when this host (host_index of *module* healthy nodes)
    owns the shard that *filename*'s md5 hashes into.
    """
    # Guard clauses: no healthy nodes, or this host itself is unhealthy.
    if module == 0:
        logger.error("No healthy nodes for the service")
        return False
    if host_index == -1:
        logger.error("Host is unhealthy")
        return False
    return int(md5(filename), 16) % module == host_index
def is_valid(alert_config, logger):
    """Checks if alert has all required fields.

    Validates a parsed alert YAML dict: presence of alerts/query/id, a
    minimum interval of 30, query shape per backend, at least one threshold,
    sane warning-vs-critical ordering, and (when present) a complete lookup
    routing section.  Any failed assertion — or unexpected error such as a
    missing key, which the broad except also catches — is logged and the
    config is reported invalid.

    Args:
        alert_config (dict): parsed alert YAML document.
        logger: logger used to report why a config was rejected.
    Returns:
        bool: True when the config passes every check.
    """
    try:
        assert alert_config['alerts'], "No Alerts configured, this is a dead config"
        assert alert_config['query'], "No Query, this is a dead config"
        assert alert_config['interval'] >= 30, "Intervals less than 30 are invalid"
        assert alert_config['id'], "Alert ID is empty, this is a dead config"
        if alert_config.get('query_type') == 'prometheus':
            # Prometheus queries are plain PromQL strings.
            assert isinstance(
                alert_config['query'], str), "Invalid Prometheus query"
        else:
            # The default backend (KairosDB) takes a JSON-like dict query.
            assert isinstance(
                alert_config['query'], dict), "Kairosdb Query string cannot be validated as proper JSON"
            # Tags defined directly on the query, plus a few always-present
            # ones ('', dc, fqdn).
            defined_tags = set(alert_config['query']['metrics'][0]['tags'].keys()).union(
                {'', 'dc', 'fqdn'})
            # IF THERE IS AGGREGATION WE HAVE TO ADD THESE TAGS
            if 'group_by' in alert_config['query']['metrics'][0]:
                defined_tags.update(
                    set(alert_config['query']['metrics'][0]['group_by'][0]['tags']))
            # NOTE(review): defined_tags is only consumed by the
            # commented-out warning loop below, so it is currently unused.
            # for undefined_tag in set(alert_config['tags']).difference(defined_tags):
            #     print("WARNING! {} tag is not defined on the query. Please make sure it does exist to "\
            #         "prevent empty results".format(undefined_tag))
        # OUR MINIMUM THRESHOLD NEED
        assert 'critical_lower_threshold' in alert_config or 'critical_upper_threshold' in alert_config or \
            'warning_lower_threshold' in alert_config or 'warning_upper_threshold' in alert_config, \
            "Config must have at least one threshold set."
        # JUST MAKE SURE YOU ARE NOT DOING SOMETHING STUPID WITH WARNING COMING
        # AFTER CRITICAL
        if 'warning_lower_threshold' in alert_config and 'critical_lower_threshold' in alert_config:
            assert alert_config['critical_lower_threshold'] < alert_config['warning_lower_threshold'], \
                "Lower Critical must be less than Lower Warning"
        if 'warning_upper_threshold' in alert_config and 'critical_upper_threshold' in alert_config:
            assert alert_config['critical_upper_threshold'] > alert_config['warning_upper_threshold'], \
                "Upper Critical must be greater than Upper Warning"
        # Optional per-tag alert routing: must have a default, a source for
        # the lookup table (inline or file), and string tag names.
        if 'lookup' in alert_config['alerts']:
            assert 'default' in alert_config['alerts']['lookup'], 'No default alert configured for the lookup configuration'
            assert 'lookup_file' in alert_config['alerts']['lookup'] or 'lookups' in alert_config['alerts'][
                'lookup'], 'No lookup configured either in the alert configuration or in a separated file'
            assert 'tags' in alert_config['alerts']['lookup'], 'No tags configured for the lookup configuration'
            assert all(
                isinstance(
                    tag, str) for tag in alert_config['alerts']['lookup']['tags']), 'Tags must be valid string'
        # if 'occurrences_threshold' in alert_config:
        #     assert alert_config['occurrences_threshold'] >= 1, \
        #         "Having an occurrences value less than 2 is assumed and pointless to specify"
    except Exception as e:
        logger.warning("Invalid config file: {}".format(str(e)))
        return False
    return True
def is_valid_alert_routing_lookup(alert_routing_lookup, alert, logger):
    """Check if routing lookup is properly configured.

    Every routing entry must carry an 'alert' and a 'tags' mapping whose
    string keys are all declared in the alert's alerts.lookup.tags list.

    Args:
        alert_routing_lookup (list[dict]): parsed routing entries.
        alert (dict): the alert config the routing belongs to.
        logger: logger used to report validation failures.
    Returns:
        bool: True when the routing configuration is valid.
    """
    try:
        assert alert_routing_lookup, "No lookup values configured, the configuration is empty."
        for alert_routing in alert_routing_lookup:
            assert 'alert' in alert_routing, "No alert defined for this configuration."
            assert 'tags' in alert_routing, "No tags value defined for this configuration."
            for tag in alert_routing['tags']:
                assert tag in alert['alerts']['lookup']['tags'], "The tag {} is not part of the configuration".format(
                    tag)
            assert all(isinstance(tag, str)
                       for tag in alert_routing['tags']), "Tags must be valid string"
    except (AssertionError, KeyError, TypeError) as e:
        # KeyError/TypeError cover malformed entries (e.g. a non-dict entry
        # or a missing nested key), which is_valid() handles the same way
        # via its broad except.
        logger.warning("Invalid alert routing config file: {}".format(str(e)))
        return False
    return True
# noinspection PyBroadException
def glob_the_configs(
        config_path,
        lookup_config_path,
        consul_url,
        hostname,
        logger):
    """
    Load, validate and shard the alert config files owned by this host.

    Args:
        config_path (string): relative path to the configs
        lookup_config_path (string): folder holding lookup routing files
        consul_url (string): url to consul service
        hostname (string): this host's name, used for sharding
        logger: logger for progress and error reporting
    Returns:
        List of configs (each a parsed YAML dict; configs with a lookup
        section gain an 'alert_routing_lookup' tag-tuple -> alert mapping)
    """
    invalid_configs = 0
    alert_list = []
    # Determine this host's position among the healthy AOM nodes so the
    # config files can be split across them.
    host_index, module = get_healthy_nodes_and_index(
        consul_url, hostname, logger)
    for config_file in glob.glob(config_path + "/**/*.yaml", recursive=True):
        logger.debug("Found {} config".format(config_file))
        # LOAD CONFIG
        # Only process files whose md5 maps onto this host's shard.
        if distribute_configs(
                config_file,
                host_index,
                module,
                logger):
            try:
                # NOTE(review): files are opened without being closed here
                # (and again for the lookup file below) — consider `with`.
                alert = yaml.safe_load(open(config_file, 'rb').read())
                if is_valid(alert, logger):
                    if 'lookup' in alert['alerts']:
                        alert_routing_lookup = []
                        is_valid_lookup = True
                        # The routing table may live in a separate file or
                        # inline under alerts.lookup.lookups.
                        if 'lookup_file' in alert['alerts']['lookup']:
                            lookup_path = "{}/{}".format(
                                lookup_config_path, alert['alerts']['lookup']['lookup_file'])
                            if os.path.isfile(lookup_path):
                                alert_routing_lookup = yaml.safe_load(
                                    open(lookup_path, 'rb').read())
                            else:
                                is_valid_lookup = False
                        else:
                            alert_routing_lookup = alert['alerts']['lookup']['lookups']
                        is_valid_lookup = is_valid_lookup and is_valid_alert_routing_lookup(
                            alert_routing_lookup, alert, logger)
                        if is_valid_lookup:
                            # Index each routing entry by the tuple of its
                            # tag values, in the configured tag order.
                            alerts_per_tags = {}
                            for alert_configuration in alert_routing_lookup:
                                key = []
                                for tag in alert['alerts']['lookup']['tags']:
                                    key.append(
                                        alert_configuration['tags'].get(tag))
                                alerts_per_tags[tuple(
                                    key)] = alert_configuration['alert']
                            alert['alert_routing_lookup'] = alerts_per_tags
                        else:
                            invalid_configs += 1
                            continue
                    alert_list.append(alert)
                else:
                    invalid_configs += 1
            except BaseException as e:
                # Broad catch so one unparseable file cannot stop loading
                # of the remaining configs.
                logger.error("Error parsing {} config: {}".format(config_file, e))
    logger.info("Invalid configs: {}".format(invalid_configs))
    service.send_stat(
        'invalid_configs',
        invalid_configs,
        dict(),
        statprefix='aom')
    logger.info("Loaded {} configs".format(len(alert_list)))
    return alert_list

10
AoM_Service/library/job.py Executable file
View File

@@ -0,0 +1,10 @@
import subprocess
class Job():
    """Pairs an alert id with its worker process and knows how to stop it."""

    def __init__(self, id, p):
        self.id = id
        self.p = p

    def kill(self):
        """Force-kill the worker with SIGKILL via /bin/kill, then reap it."""
        pid = self.p.pid
        subprocess.call(["/bin/kill", "-9", "{}".format(pid)])
        self.p.join()

29
AoM_Service/library/job_list.py Executable file
View File

@@ -0,0 +1,29 @@
from job import Job
class Job_List():
    """Mapping of job id -> Job with helpers for bulk add and kill."""

    def __init__(self):
        self.jobs = {}

    def __getitem__(self, k):
        return self.jobs[k]

    def __setitem__(self, k, v):
        self.jobs[k] = v

    def __len__(self):
        return len(self.jobs)

    def add(self, job):
        """Add a single Job, or merge every job from another Job_List;
        reject anything else."""
        if isinstance(job, Job):
            self[job.id] = job
        elif isinstance(job, Job_List):
            for key in job.jobs:
                self.add(job[key])
        else:
            raise Exception("unexpected type added to Job_List")

    def kill(self, id):
        """Kill and forget the job with this id; no-op when unknown."""
        if id not in self.jobs:
            return
        self[id].kill()
        del self.jobs[id]

122
AoM_Service/library/logger.py Executable file
View File

@@ -0,0 +1,122 @@
# logger.py
""" Logging configuration """
import logging
import logging.handlers
import os
# Quiet noisy third-party loggers: only ERROR and above from the HTTP stack
# and the werkzeug dev server reach our handlers.
logging.getLogger('requests').setLevel(logging.ERROR)
logging.getLogger('urllib3').setLevel(logging.ERROR)
logging.getLogger('werkzeug').setLevel(logging.ERROR)
class SingleLevelFilter(logging.Filter):
    """Log filter that passes records of exactly one level — or, when
    *reject* is set, everything except that level."""

    def __init__(self, passlevel, reject):
        """
        Args:
            passlevel (int): the numeric log level to match.
            reject (bool): when True, invert the match (drop only records
                at *passlevel* and pass all others).
        """
        self.passlevel = passlevel
        self.reject = reject

    def filter(self, record):
        """Return True when *record* should be emitted.

        With reject=False only records whose levelno equals passlevel pass;
        with reject=True only records at other levels pass.  This lets a
        DEBUG-only handler coexist with a general handler without duplicates.
        """
        matches = record.levelno == self.passlevel
        return not matches if self.reject else matches
class AlertLogging(logging.Logger):
    """
    Class Object to handle the logging of the alert on metrics service.
    Starts with an INFO stream handler (start()) and can flip on (and later
    remove) an additional log-file handler and a DEBUG stream handler as
    needed.
    """
    def __init__(self, name):
        """
        Initialise the formatters and the underlying Logger.

        Args:
            name: logger name passed through to logging.Logger.
        """
        self.name = name
        # Verbose format used for the file handler and the DEBUG handler.
        self.debug_formatter = logging.Formatter(
            "%(asctime)s - [%(levelname)s] - [%(module)s:%(lineno)d] - "
            "%(message)s", "%m-%d %H:%M:%S")
        # Compact format used for the normal INFO stream handler.
        self.standard_formatter = logging.Formatter(
            "%(asctime)s - [%(levelname)s] - %(message)s", "%m-%d %H:%M:%S")
        # NOTE(review): this bare getLogger() call discards its result and
        # has no visible effect — confirm whether root-logger setup was
        # intended here.
        logging.getLogger()
        logging.Logger.__init__(self, name, logging.DEBUG)
        # NOTE(review): registering the logger class inside __init__ only
        # takes effect after the first instance exists; this is usually done
        # at module level — confirm intent.
        logging.setLoggerClass(AlertLogging)

    def start(self):
        """
        Attach an INFO-level stream handler using the standard format.
        Returns:
            self, so construction and start() can be chained.
        """
        info_handler = logging.StreamHandler()
        info_handler.setLevel(logging.INFO)
        info_handler.setFormatter(self.standard_formatter)
        self.addHandler(info_handler)
        return self

    def start_log_file(self, file_path, mode='a'):
        """
        Creates a separate log file handler
        Args:
            file_path: path to the log file
            mode: the type of mode to open the file handler with
        Returns:
            None
        """
        self.log_path = file_path
        work_folder = os.path.dirname(file_path)
        # Create the log directory on first use.
        if work_folder and not os.path.exists(work_folder):
            os.makedirs(work_folder)
        self.log_handler = logging.FileHandler(file_path, mode)
        # NOTE(review): the file handler is set to WARNING yet uses the
        # debug formatter — confirm the intended file log level.
        self.log_handler.setLevel(logging.WARNING)
        self.log_handler.setFormatter(self.debug_formatter)
        self.addHandler(self.log_handler)

    def stop_log_file(self):
        """
        Closes the log file and sets the handler to None.
        Raises AttributeError when start_log_file() was never called.
        Returns:
            None
        """
        self.log_handler.close()
        self.removeHandler(self.log_handler)
        self.log_handler = None

    def start_debug(self):
        """
        Attach a DEBUG-only stream handler.  The SingleLevelFilter keeps
        higher levels out so they are not duplicated with the INFO handler.
        Returns:
            None
        """
        self.debug_handler = logging.StreamHandler()
        self.debug_handler.setLevel(logging.DEBUG)
        self.debug_handler.addFilter(SingleLevelFilter(logging.DEBUG, False))
        self.debug_handler.setFormatter(self.debug_formatter)
        self.addHandler(self.debug_handler)

    def stop_debug(self):
        """
        Detach the DEBUG handler added by start_debug().
        Returns:
            None
        """
        self.removeHandler(self.debug_handler)
        self.debug_handler = None

14
AoM_Service/library/process.py Executable file
View File

@@ -0,0 +1,14 @@
import multiprocessing
class Process(multiprocessing.Process):
    """Base worker process for a single alert config.

    Subclasses supply the check function via get_target(); the constructor
    wires it up as the multiprocessing target with the standard argument
    tuple and names the daemon process after the alert id.
    """

    def __init__(self, alert_config, config, logger, production_mode):
        multiprocessing.Process.__init__(
            self,
            target=self.get_target(),
            args=(alert_config, config, logger, production_mode),
            name=alert_config.id,
            daemon=True,
        )

    def get_target(self):
        """Return the callable to run in the child process.

        Must be overridden by subclasses.  Raises NotImplementedError (a
        subclass of Exception, so existing broad handlers still catch it)
        rather than a bare Exception, making the abstract intent explicit.
        """
        raise NotImplementedError("abstract method not implemented")

View File

@@ -0,0 +1,14 @@
import process_prometheus
import process_kairos
class Process_Factory():
    """Builds the right worker Process subclass for an alert config's
    query backend."""

    def __init__(self, config, logger, production):
        self.config = config
        self.logger = logger
        self.production = production

    def build(self, alert_config):
        """Return a Process_Prometheus for prometheus-typed alerts,
        otherwise a Process_Kairos (the default backend)."""
        if alert_config.type() == "prometheus":
            cls = process_prometheus.Process_Prometheus
        else:
            cls = process_kairos.Process_Kairos
        return cls(alert_config, self.config, self.logger, self.production)

View File

@@ -0,0 +1,6 @@
import process
from serviceapp import service
class Process_Kairos(process.Process) :
    """Worker process that runs the KairosDB alert check loop."""
    def get_target(self) :
        # Entry point executed in the child process (see process.Process).
        return service.check_kairosdb_alert

View File

@@ -0,0 +1,6 @@
import process
from serviceapp import service
class Process_Prometheus(process.Process) :
    """Worker process that runs the Prometheus alert check loop."""
    def get_target(self) :
        # Entry point executed in the child process (see process.Process).
        return service.check_prometheus_alert

80
AoM_Service/library/service.py Executable file
View File

@@ -0,0 +1,80 @@
import os
from alert_config_list import Alert_Config_List
from alert_config import Alert_Config
from job_list import Job_List
from job import Job
from process_factory import Process_Factory
from time import sleep
from config import glob_the_configs
from serviceapp import service
class Service():
    """Main reload loop: periodically re-reads alert configs, diffs them
    against the running set, and starts/stops worker processes to match."""

    def __init__(self, logger, reload_interval, hostname, config):
        """
        Args:
            logger: application logger.
            reload_interval (int): seconds between config reload passes.
            hostname: override for this host's name; defaults to $HOSTNAME.
            config (dict): service configuration (alert folder paths, etc.).
        """
        self.alert_config_list = Alert_Config_List()
        self.job_list = Job_List()
        self.logger = logger
        self.info = self.logger.info
        self.error = self.logger.error
        self.reload_interval = reload_interval
        self.box_hostname = os.environ['HOSTNAME'] if hostname is None else hostname
        # Production unless a TEST environment variable is present.
        self.production = "TEST" not in os.environ
        self.config = config

    def start(self):
        """Run the reload loop until is_running() goes false, then kill all
        remaining jobs by diffing against an empty config list."""
        self.info("Waiting 15s for Consul service to pass")
        sleep(15)
        while self.is_running():
            new_alert_config_list = self.get_new_alert_config_list()
            self.purge_stale(new_alert_config_list)
            self.create_upserted(new_alert_config_list)
            self.alert_config_list = new_alert_config_list
            total_jobs = len(self.job_list)
            self.info("Total running jobs: {}".format(total_jobs))
            service.send_stat('total_jobs', total_jobs, dict(), statprefix='aom')
            sleep(self.reload_interval)
        self.info("Exiting alerts")
        self.purge_stale(Alert_Config_List())

    def is_running(self):
        """Hook for tests/shutdown; the live service always keeps running."""
        return True

    def get_new_alert_config_list(self):
        """Load and parse the alert config files into an Alert_Config_List.

        Returns an empty Alert_Config_List on failure.  BUG FIX: the
        original returned a plain [] here, which was then assigned to
        self.alert_config_list and crashed the next loop iteration when
        .compare() was called on it.
        """
        try:
            yaml_configs = self.parse_alert_config_files()
            alert_configs = [Alert_Config(i) for i in yaml_configs]
            return Alert_Config_List(alert_configs)
        except Exception as e:
            self.error("Failed to load config files: {}".format(e))
            return Alert_Config_List()

    def parse_alert_config_files(self):
        """Glob and validate the on-disk alert configs owned by this host."""
        path = self.config['alert_folder']
        routing = self.config['alert_routing_config']
        consul = 'http://consul.service.consul:8500'
        return glob_the_configs(path, routing, consul, self.box_hostname, self.logger)

    def purge_stale(self, new_alert_config_list):
        """Kill jobs whose configs were removed or modified."""
        _, removed_ids, modified_ids = self.alert_config_list.compare(new_alert_config_list)
        stale_ids = removed_ids.union(modified_ids)
        for stale_id in stale_ids:
            self.job_list.kill(stale_id)
        service.send_stat('removed_jobs', len(removed_ids), dict(), statprefix='aom')
        self.info("Removed alert_configs: {}".format(removed_ids))

    def create_upserted(self, new_alert_config_list):
        """Spawn jobs for configs that are new or were modified."""
        added_ids, _, modified_ids = self.alert_config_list.compare(new_alert_config_list)
        upserted_ids = added_ids.union(modified_ids)
        for id in upserted_ids:
            p = self.spawn_process(new_alert_config_list[id])
            j = Job(id, p)
            self.job_list.add(j)
        service.send_stat('new_jobs', len(added_ids), dict(), statprefix='aom')
        service.send_stat('modified_jobs', len(modified_ids), dict(), statprefix='aom')
        self.info("Added alert_configs: {}".format(added_ids))
        # BUG FIX: the original logged added_ids on this line as well.
        self.info("Modified alert_configs: {}".format(modified_ids))

    def spawn_process(self, alert_config):
        """Build and start the worker process for one alert config."""
        process_factory = Process_Factory(self.config, self.logger, self.production)
        process = process_factory.build(alert_config)
        process.start()
        return process

View File

View File

@@ -0,0 +1,189 @@
from thresholds import Thresholds
class Alert() :
    """Evaluates one query result for an alert config: computes threshold
    breaches, per-tag occurrence counts and the alert level, and renders the
    firing / recovery message bodies.

    NOTE(review): this class appears to be mid-refactor — several names it
    references are not defined anywhere visible (flagged inline), so some
    code paths cannot run as written.
    """
    def __init__(self, alert_config, logger, tags, result, min_value, max_value) :
        # True once the occurrence count for the current tag reaches the
        # configured occurrences() threshold.
        self.occurrences_breached = False
        # True when the computed level differs from the previously stored one.
        self.new_level_breached = False
        self.info = logger.info
        self.debug = logger.debug
        self.warning = logger.warning
        self.error = logger.error
        self.alert_config = alert_config
        self.thresholds = Thresholds(alert_config)
        # Comma-joined tag values identifying this result (see set_tags()).
        self.tags = ""
        self.result = result
        self.set_tags(tags)
        # NOTE(review): Alert_Config exposes tags(), not get_tags() — this
        # call looks like it targets a method that does not exist; confirm.
        self.alert_config.init_for_tags(alert_config.get_tags())
        self.set_firing(min_value, max_value)
        # NOTE(review): `availability` is neither a parameter of __init__
        # nor defined in this scope — this raises NameError as written.  The
        # factory accepts an availability argument it never forwards here.
        if availability :
            self.info("Sending availability stat 1")
            self.send_metrics(self.name(), 0 if self.level() == "CRITICAL" else 1, self.result, 'service_level')
    def name(self) :
        """Human-readable alert identity: metric id plus tag values."""
        return "Metric: {} for {}".format(self.alert_config.id, self.get_tags())
    def body(self) :
        """Render the message body for the current state and emit the level
        metric.

        NOTE(review): neither `md5` nor `tag` is defined in this scope, so
        the return statement raises NameError — the TODO above it suggests
        this is unfinished.
        """
        body = ""
        if not self.get_firing() :
            body = self.get_not_firing_body()
        else :
            body = self.get_is_firing_body()
        self.debug("Alert {}->[{}]->{}, Occurrences={} of {}".format(
            self.name(),
            self.get_tags(),
            self.level(),
            self.get_occurrences(),
            self.alert_config.occurrences(),
        ))
        self.send_metrics(self.name(), self.level_code(), self.level())
        # TODO
        return body, md5(tag.encode('utf-8')).hexdigest()[:10]
    def level(self) :
        """Return 'RECOVERY', 'CRITICAL' or 'WARNING'.

        NOTE(review): implicitly returns None when firing but no threshold
        matches either level — confirm that is intended.
        """
        if not self.get_firing() :
            return "RECOVERY"
        if [t for t in self.thresholds.get_thresholds_matching(level=Thresholds.CRITICAL)] :
            return "CRITICAL"
        if [t for t in self.thresholds.get_thresholds_matching(level=Thresholds.WARNING)] :
            return "WARNING"
    def level_code(self) :
        """Numeric code for the current level.

        NOTE(review): every branch returns 0 — WARNING/CRITICAL presumably
        should map to distinct codes; this looks like a placeholder.
        """
        level = self.level()
        if level == "RECOVERY" :
            return 0
        elif level == "WARNING" :
            return 0
        elif level == "CRITICAL" :
            return 0
    def get_not_firing_body(self) :
        """Build the recovery message body.

        NOTE(review): the two helper calls below are missing `self.` and
        `force` is undefined — this method raises NameError as written.
        """
        body = ""
        body += get_not_firing_body_threshold()
        body += get_not_firing_body_occurrences()
        if not body :
            self.alert_config.set_for_tags(self.get_tags()+"_count", force)
            return ""
        return "GOOD: " + body
    def get_not_firing_body_threshold(self) :
        """Describe which configured bounds the value is now within,
        preferring warning thresholds and falling back to critical ones."""
        if self.result is None :
            return ""
        body = ""
        v, ok = self.alert_config.get_threshold(isUpper=True, isWarning=True)
        if not ok :
            v, ok = self.alert_config.get_threshold(isUpper=True, isWarning=False)
        if ok :
            body += self.form("<", v)
        v, ok = self.alert_config.get_threshold(isUpper=False, isWarning=True)
        if not ok :
            v, ok = self.alert_config.get_threshold(isUpper=False, isWarning=False)
        if ok :
            body += self.form(">", v)
        return body
    def get_not_firing_body_occurrences(self) :
        """Recovery bookkeeping: emit a recovery metric when we have a
        result, or a manual-validation notice when the query returned
        nothing, then reset the occurrence counter."""
        if not self.get_occurrences() :
            return ""
        body = ""
        if not self.result is None :
            self.send_metrics(self.name(), 1, self.level())
        else :
            body += "{} RECOVERY due to no results found from query. Recommend you manually validate recovery\n{}".format(self.name(), self.alert_config.url())
        # NOTE(review): set_occurrences() tests `if force:` and 0 is falsy,
        # so this reset is a no-op — confirm `force is not None` was meant.
        self.set_occurrences(force=0)
        return body
    def get_is_firing_body(self) :
        """Build the firing message body for breached thresholds.

        NOTE(review): `self.upper_firing` and `self.value` are never
        assigned anywhere in this class, and the lower-threshold branch also
        reads upper_firing — this method cannot run as written.  The final
        `return ""` when occurrences are breached also looks inverted
        (the debug text says "occurred X time(s) < threshold") — confirm.
        """
        body = ""
        if self.thresholds.get_breached(level=Thresholds.UPPER) :
            body += self.form(">", self.upper_firing)
        if self.thresholds.get_breached(level=Thresholds.LOWER) :
            body += self.form("<", self.upper_firing)
        if self.occurrences_breached :
            self.debug("Value {} of {} for tag {} has occurred {} time(s) < threshold of {}".format(
                self.value,
                self.name(),
                self.get_tags(),
                self.get_occurrences(),
                self.alert_config.occurrences(),
            ))
            return ""
        return body
    def form(self, operator, static) :
        """One message line: name, value vs threshold, dashboard URL.
        NOTE(review): relies on self.value, which is never set."""
        return "{}\n{:.2f} {}= {}\n{}".format(
            self.name(),
            self.value,
            operator,
            static,
            self.alert_config.url(),
        )
    def set_tags(self, tags) :
        """Set self.tags from an explicit value, else derive it from the
        result's tag values (sorted, comma-joined); default to 'instance'."""
        if tags :
            self.tags = tags
        elif self.result :
            import itertools
            result_tags = [ self.result['tags'][x] for x in self.alert_config.get_tags() ]
            # NOTE(review): chain(result_tags) does not flatten the per-tag
            # value lists — likely meant chain(*result_tags); confirm.
            chain = itertools.chain(result_tags)
            sorted_list = sorted(list(chain))
            self.tags = ", ".join(sorted_list)
        if not self.tags :
            self.tags = "instance"
    def get_tags(self) :
        """Return the comma-joined tag values set by set_tags()."""
        return self.tags
    def set_firing(self, min_value, max_value) :
        """Recompute threshold breaches for the given min/max values and
        update occurrence and level state, emitting metrics."""
        self.thresholds = Thresholds(self.alert_config)
        self.thresholds.set_breached(min_value, max_value)
        self.set_occurrences()
        self.set_new_level_breached()
        self.send_metrics()
        self.send_threshold_metrics()
    def get_firing(self) :
        """Firing means a threshold is breached AND the occurrence count
        has reached the configured minimum."""
        return self.thresholds.get_breached() and self.occurrences_breached
    def get_occurrences(self) :
        """Return the stored occurrence counter for the current tag key."""
        tags = self.get_tags()
        return self.alert_config.get_for_tags(tags)
    def set_occurrences(self, force=None) :
        """Increment the per-tag occurrence counter when breached and update
        occurrences_breached; a truthy *force* overrides the counters.

        NOTE(review): force=0 (used by the recovery path to reset) is falsy
        and therefore ignored — confirm whether `force is not None` was meant.
        """
        previous_occurrences = self.get_occurrences()
        if self.thresholds.get_breached() :
            new_occurrences = previous_occurrences+1
            self.alert_config.set_for_tags(self.get_tags(), new_occurrences)
            self.occurrences_breached = self.alert_config.occurrences() <= new_occurrences
        if force :
            self.alert_config.set_for_tags(self.get_tags(), force)
            self.alert_config.set_for_tags(self.get_tags()+"_count", force)
    def send_metrics(self, *args, **kwargs) :
        # Placeholder metrics hook — currently just logs that it is
        # unimplemented.
        print("send_metrics not impl")
    def set_new_level_breached(self) :
        """Compare the current level with the previously stored one for this
        tag key, record whether it changed, and persist the new level."""
        key = self.get_tags()
        level = self.level()
        previous_level = self.alert_config.get_level(key)
        self.new_level_breached = level != previous_level
        self.alert_config.set_level(key, level)
        self.info("testInfo: {} {}".format(
            "NEW" if self.new_level_breached else "EXISTING",
            self.level(),
        ))
    def get_new_level_breached(self) :
        """True when the level changed on the most recent evaluation."""
        return self.new_level_breached
    def send_threshold_metrics(self) :
        """Emit the current value and each configured threshold as metrics.

        NOTE(review): `level` iterates WARNING/CRITICAL but is compared to
        UPPER, and `end` iterates UPPER/LOWER but is compared to WARNING —
        the loop variables appear swapped in both the get_threshold call and
        the key construction.  self.value and self.send_stat are also
        undefined in this class.
        """
        # TODO
        self.send_metrics(self.alert_config.id, self.value)
        for level in [Thresholds.WARNING, Thresholds.CRITICAL] :
            for end in [Thresholds.UPPER, Thresholds.LOWER] :
                v, ok = self.alert_config.get_threshold(isUpper=level == Thresholds.UPPER, isWarning=end == Thresholds.WARNING)
                if ok :
                    key = "{}_{}_threshold".format(
                        "upper" if level == Thresholds.UPPER else "lower",
                        "warning" if level == Thresholds.WARNING else "critical",
                    )
                    self.send_stat(key, v, {'id':self.name()})

View File

@@ -0,0 +1,13 @@
from alert import Alert
class Alert_Factory():
    """Builds Alert instances for a single alert config, carrying the
    logger and its convenience log methods."""

    def __init__(self, alert_config, logger):
        self.alert_config = alert_config
        self.logger = logger
        self.info = logger.info
        self.warning = logger.warning
        self.debug = logger.debug
        self.error = logger.error

    def build(self, minvalue, maxvalue, result, tags, availability, alert_tags):
        """Construct an Alert for one query result.

        BUG FIX: Alert.__init__ takes (alert_config, logger, tags, result,
        min_value, max_value); the original call omitted the logger, which
        shifted every later argument one position left.

        NOTE(review): `availability` and `alert_tags` are accepted but not
        forwarded — Alert.__init__ has no parameters for them; confirm
        whether Alert's signature should grow to accept these.
        """
        return Alert(self.alert_config, self.logger, tags, result, minvalue, maxvalue)

View File

@@ -0,0 +1,83 @@
from datetime import datetime, timedelta
from urllib.parse import urljoin
import requests
class PromAPI:
    """Minimal client for the Prometheus HTTP API (query, query_range,
    series)."""

    def __init__(self, endpoint='http://127.0.0.1:9090/'):
        """
        :param endpoint: base address of the Prometheus server
        """
        self.endpoint = endpoint

    @staticmethod
    def _to_timestamp(input_):
        """
        Convert *input_* to an ISO-8601 timestamp string for Prometheus.

        Accepts a datetime, the string 'now', a numeric string, or a number:
        positive numbers are treated as UNIX timestamps, 0 means "now", and
        negative numbers are offsets in seconds before now.

        BUG FIX: the original returned a raw float for datetimes and
        positive numbers, which crashed every call site that appends 'Z'
        (str + float TypeError).  All branches now return an ISO string.
        """
        if type(input_) == datetime:
            return input_.isoformat('T')
        if input_ == 'now':
            return datetime.utcnow().isoformat('T')
        if type(input_) is str:
            input_ = float(input_)
        if type(input_) in [int, float]:
            if input_ > 0:
                return datetime.utcfromtimestamp(input_).isoformat('T')
            if input_ == 0:  # return now
                return datetime.utcnow().isoformat('T')
            if input_ < 0:
                return (datetime.utcnow() + timedelta(seconds=input_)).isoformat('T')

    def query(self, query='prometheus_build_info'):
        """Run an instant query via /api/v1/query."""
        return self._get(
            uri='/api/v1/query',
            params=dict(
                query=query
            )
        )

    def query_range(self, query='prometheus_build_info', start=-60, end='now', duration=60):
        """Run a range query via /api/v1/query_range (step = *duration* seconds)."""
        params = {
            'query': query
        }
        if end is not None:
            params['end'] = self._to_timestamp(end) + 'Z'
        if start:
            params['start'] = self._to_timestamp(start) + 'Z'
        if duration:
            params['step'] = duration
        print(params)
        return self._get(
            uri='/api/v1/query_range',
            params=params
        )

    def series(self, match='prometheus_build_info', start=-86400, end='now'):
        """List series matching *match* via /api/v1/series."""
        params = {
            'match[]': match
        }
        if end is not None:
            params['end'] = self._to_timestamp(end) + 'Z'
        if start:
            params['start'] = self._to_timestamp(start) + 'Z'
        print(params)
        return self._get(
            uri='/api/v1/series',
            params=params
        )

    def _get(self, uri, params, method='GET'):
        """GET *uri* (joined onto the endpoint) and return the decoded JSON."""
        url = urljoin(self.endpoint, uri)
        assert method == 'GET'
        result = requests.get(
            url=url,
            params=params
        )
        return result.json()

View File

@@ -0,0 +1,949 @@
""" Alert On Metrics functions"""
import copy
import itertools
import json
import os
import random
import smtplib
from email.mime.text import MIMEText
from socket import gaierror
from time import sleep
from hashlib import md5
import requests
from statsd import StatsClient
from serviceapp.prom_api import PromAPI
# Display names presumably indexed by a numeric severity code (0 recovery,
# 1-2 warning, 3-5 critical) — TODO confirm against the callers that index
# into this list.
alert_status = [
    'RECOVERY',
    'WARNING',
    'WARNING',
    'CRITICAL',
    'CRITICAL',
    'CRITICAL']
def build_alert_message(alert, minvalue, maxvalue, result, logger,
                        availability, tag=None, alert_tags=None):
    """
    Build the alert message
    Args:
        alert: the alert object that includes a tag definition
        minvalue: the min value to test against the threshold
        maxvalue: the max value to test against the threshold
        result: the response back from kairosdb
        logger (log object): does the logging
        availability: Send availability stat 1
        tag: If passed in will use this value for the tag instead of
            getting it from the result object
        alert_tags: the tags corresponding to the result, used if an
            alert has to be triggered and a custom routing per tag is configured
    Returns:
        Alert message string, or None when no alert needs to fire
    """
    # NOTE(review): the whole threshold/occurrence implementation was
    # commented out wholesale here, mid-refactor into the Alert /
    # Alert_Config / Threshold classes (see the accompanying unit tests).
    # The function therefore performed no work and always fell through
    # returning None; the dead commented-out code has been removed (it
    # remains available in version control) and the current no-op
    # behaviour is made explicit below. Callers filter out None entries,
    # so no alerts fire through this path until the refactor lands.
    return None
def check_kairosdb_alert(
        alert_config,
        service_config,
        logger,
        production_mode=True):
    """
    Poll KairosDB for one alert config in an endless loop and dispatch alerts.

    Args:
        alert_config (dict): Config of the alert to run
        service_config (dict): Holds things like urls, tokens and other things
        logger (log object): does the logging
        production_mode (bool): when False, alerts are only logged instead of
            being sent through the notification channels
    Returns:
        None (never returns; re-runs every alert_config['interval'] seconds)
    """
    availability = False
    # SLEEP A RANDOM TIME BETWEEN 0 AND INTERVAL SO THAT ALL ALERTS DON'T
    # START AT THE SAME TIME
    wait_time = random.randint(0, alert_config['interval'])
    logger.info(
        "ALERT_CONFIG: {}\tsleep: {}".format(
            alert_config['id'],
            wait_time))
    sleep(wait_time)
    # For metrics with availability set to true, we default the interval to 5
    # mins due Grafana limitations
    if 'availability' in alert_config and alert_config['availability']:
        availability = True
    # ====================
    # EACH CHECK JUST LOOPS
    # ====================
    ret = None
    while True:
        try:
            send_stat("check_run", 1, {'id': alert_config['id']})
            # BUILD URL FOR KAIROSDB METRICS AND QUERY FOR RESULTS
            # NOTE(review): os.path.join is called with one already
            # concatenated string, so it is a no-op here; plain concatenation
            # (or urljoin) would state the intent more clearly.
            query_url = os.path.join(
                service_config['kairosdb_url'] +
                "api/v1/datapoints/query")
            ret = requests.post(
                query_url,
                data=json.dumps(
                    alert_config['query']),
                timeout=service_config['timeout'])
            assert ret.status_code == 200
            # GOT DATA BACK, NOW TO COMPARE IT TO THE THRESHOLD
            results = ret.json()['queries'][0]['results']
            logger.debug(
                "Got back {} results for alert {}".format(
                    len(results), alert_config['id']))
            log_alert_results(results, alert_config, logger)
            alert_list = []
            # LOOP THROUGH ALL THE RESULTS
            for r in results:
                # Custom per-tag routing only applies when the alert config
                # declares a 'lookup' table under 'alerts'.
                alert_tags = (get_alert_tags(alert_config, r)
                              if has_custom_alert_routing(alert_config) else None)
                # OUR QUERY RETURNED SOME VALUES - FIND MIN AND MAX VALUES
                # THEREIN AND EXAMINE FOR FAILURE
                if r['values']:
                    minvalue = min([x[1] for x in r['values']])
                    maxvalue = max([x[1] for x in r['values']])
                    # SEND VALUES TO BUILD_ALERT_MESSAGE, WHICH RETURNS NONE OR
                    # AN OBJECT
                    alert_list.append(
                        build_alert_message(
                            alert_config,
                            minvalue,
                            maxvalue,
                            r,
                            logger,
                            availability,
                            alert_tags=alert_tags))
                # THIS MEANS OUR KAIROS QUERY RETURNED NOTHING. COULD BE NETWORK
                # ISSUES. WE WILL TOLERATE THIS FOR X OCCURRENCES. (X=10)
                # AFTER X OCCURRENCES OF KAIROS NOT RETURNING DATA WE WILL CLEAR
                # AOM'S BRAIN FOR THIS ALERT ID AND TAG COMBINATION TO AVOID A
                # LATER OCCURRENCE CAUSING A PREMATURE ALERT.
                # A NO-OP IF NO HISTORY.
                elif 'alert_tags' in alert_config:
                    for key in alert_config['alert_tags']:
                        # Skip the bookkeeping keys ('<tag>_count',
                        # '<tag>_noresult') and tags with no active alarm.
                        if ('count' not in key and 'noresult' not in key and
                                alert_config['alert_tags'][key] > 0):
                            key_noresult = key + "_noresult"
                            key_count = key + "_count"
                            if alert_config['alert_tags'][key_noresult] > 10:
                                logger.info("{} occurrences of no results back "
                                    "for {}, clear out counts for tag '{}'".format(
                                    alert_config['alert_tags'][key_noresult],
                                    alert_config['id'], key))
                                alert_list.append(
                                    build_alert_message(
                                        alert_config,
                                        0,
                                        0,
                                        None,
                                        logger,
                                        availability,
                                        key,
                                        alert_tags=alert_tags))
                                alert_config['alert_tags'][key] = 0
                                alert_config['alert_tags'][key_count] = 0
                                alert_config['alert_tags'][key_noresult] = 0
                            else:
                                alert_config['alert_tags'][key_noresult] += 1
                                logger.info("{} occurrences of no results back "
                                    "for {}, tag '{}'".format(
                                    alert_config['alert_tags'][key_noresult],
                                    alert_config['id'], key))
            # SEND ALL ALERTS FOUND TO THE ALERT HANDLERS THAT ARE NOT NONE
            for alert in [x for x in alert_list if x is not None]:
                if production_mode:
                    # Deep-copy so the handlers' mutations (e.g. popping
                    # channels) do not leak into our persistent config.
                    send_alerts(
                        alert,
                        copy.deepcopy(alert_config),
                        service_config['victorops_url'],
                        service_config['slack_url'],
                        service_config['slack_token'],
                        service_config['smtp_server'],
                        service_config['sensu_endpoint'],
                        service_config['uchiwa_url'],
                        logger)
                else:
                    logger.info(
                        "Sending alert for: {}".format(
                            alert_config.get('id')))
        # HANDLE THE UNEXPECTED
        except TimeoutError:
            logger.error("Query [{}] took to long to run".format(
                alert_config['id']))
        except AssertionError:
            logger.error(
                "KairsoDB query failed: {}\n"
                "HTTP status code:\t{}\n"
                "Error Message:\t{}\nQuery:\n"
                "{}".format(
                    ret.url,
                    ret.status_code,
                    ret.text,
                    alert_config['query']))
        except gaierror:
            logger.error(
                "Unable to connect to smtp server: {}".format(
                    service_config['smtp_server']))
        except Exception as e:
            logger.error(
                "Unhandled exception {} on alert: {}".format(
                    str(e), alert_config['id']))
        finally:
            # Always pace the loop, even after errors.
            sleep(alert_config['interval'])
def check_prometheus_alert(
        alert_config,
        service_config,
        logger,
        production_mode=True):
    """
    Poll Prometheus for one alert config in an endless loop and dispatch alerts.

    Args:
        alert_config (dict): Config of the alert to run
        service_config (dict): Holds things like urls, tokens and other things
        logger (log object): does the logging
        production_mode (bool): when False, alerts are only logged instead of
            being sent through the notification channels
    Returns:
        None (never returns; re-runs every alert_config['interval'] seconds)
    """
    # SLEEP A RANDOM TIME BETWEEN 0 AND INTERVAL SO THAT ALL ALERTS DON'T
    # START AT THE SAME TIME
    wait_time = random.randint(0, alert_config['interval'])
    logger.info(
        "ALERT_CONFIG: {}\tsleep: {}".format(
            alert_config['id'],
            wait_time))
    sleep(wait_time)
    # For metrics with availability set to true, we default the interval to 5
    # mins due to Grafana limitations
    availability = bool(alert_config.get('availability'))
    # ====================
    # EACH CHECK JUST LOOPS
    # ====================
    ret = None
    while True:
        try:
            send_stat("check_run", 1, {'id': alert_config['id']})
            prom_api = PromAPI(endpoint=alert_config['prometheus_url'])
            ret = prom_api.query_range(
                query=alert_config['query'],
                start=alert_config['start_time'],
                end=alert_config['end_time'],
                duration=alert_config['interval'])
            assert ret['status'] == 'success'
            # GOT DATA BACK, NOW TO COMPARE IT TO THE THRESHOLD
            results = ret['data']['result']
            logger.debug(
                "Got back {} results for alert {}".format(
                    len(results), alert_config['id']))
            log_alert_results(results, alert_config, logger)
            alert_list = []
            # LOOP THROUGH ALL THE RESULTS
            for r in results:
                # Custom per-tag routing only applies when the alert config
                # declares a 'lookup' table under 'alerts'.
                alert_tags = (get_alert_tags(alert_config, r) if
                              has_custom_alert_routing(alert_config) else None)
                # REARRANGE RESULT TO MORE CLOSELY MATCH KAIROSDB RESULT
                r['tags'] = {key: [value]
                             for (key, value) in r['metric'].items()}
                # OUR QUERY RETURNED SOME VALUES - FIND MIN AND MAX VALUES
                # THEREIN AND EXAMINE FOR FAILURE
                if r['values']:
                    # Prometheus returns sample values as strings, hence the
                    # float() conversion before comparison.
                    raw_values = [value for _, value in r['values']]
                    min_value = float(min(raw_values))
                    max_value = float(max(raw_values))
                    # SEND VALUES TO BUILD_ALERT_MESSAGE, WHICH RETURNS NONE OR
                    # AN OBJECT
                    # NOTE(review): min/max are taken over the raw strings and
                    # converted afterwards, so ordering is lexicographic, not
                    # numeric (e.g. "9" > "10") — confirm intended.
                    alert_list.append(
                        build_alert_message(
                            alert_config,
                            min_value,
                            max_value,
                            r,
                            logger,
                            availability,
                            alert_tags=alert_tags))
                # THIS MEANS OUR QUERY RETURNED NOTHING. COULD BE NETWORK ISSUES
                # WE WILL TOLERATE THIS FOR X OCCURRENCES. (X=10)
                # AFTER X OCCURRENCES OF NOT RETURNING DATA WE WILL CLEAR AOM'S
                # BRAIN FOR THIS ALERT ID AND TAG COMBINATION TO AVOID A LATER
                # OCCURRENCE CAUSING A PREMATURE ALERT. A NO-OP IF NO HISTORY.
                elif 'alert_tags' in alert_config:
                    for key in alert_config['alert_tags']:
                        # Skip the bookkeeping keys ('<tag>_count',
                        # '<tag>_noresult') and tags with no active alarm.
                        if ('count' not in key and 'noresult' not in key and
                                alert_config['alert_tags'][key] > 0):
                            key_noresult = key + "_noresult"
                            key_count = key + "_count"
                            if alert_config['alert_tags'][key_noresult] > 10:
                                logger.info("{} occurrences of no results back "
                                    "for {}, clear out counts for tag '{}'".format(
                                    alert_config['alert_tags'][key_noresult],
                                    alert_config['id'], key))
                                alert_list.append(
                                    build_alert_message(
                                        alert_config,
                                        0,
                                        0,
                                        None,
                                        logger,
                                        availability,
                                        key,
                                        alert_tags=alert_tags))
                                alert_config['alert_tags'][key] = 0
                                alert_config['alert_tags'][key_count] = 0
                                alert_config['alert_tags'][key_noresult] = 0
                            else:
                                alert_config['alert_tags'][key_noresult] += 1
                                logger.info("{} occurrences of no results back "
                                    "for {}, tag '{}'".format(
                                    alert_config['alert_tags'][key_noresult],
                                    alert_config['id'], key))
            # SEND ALL ALERTS FOUND TO THE ALERT HANDLERS THAT ARE NOT NONE
            for alert in [x for x in alert_list if x is not None]:
                if production_mode:
                    # Deep-copy so the handlers' mutations (e.g. popping
                    # channels) do not leak into our persistent config.
                    send_alerts(
                        alert,
                        copy.deepcopy(alert_config),
                        service_config['victorops_url'],
                        service_config['slack_url'],
                        service_config['slack_token'],
                        service_config['smtp_server'],
                        service_config['sensu_endpoint'],
                        service_config['uchiwa_url'],
                        logger)
                else:
                    logger.info(
                        "Sending alert {}".format(
                            alert_config.get('id')))
        # HANDLE THE UNEXPECTED
        except TimeoutError:
            logger.error(
                "Query [{}] took to long to run".format(
                    alert_config['id']))
        except AssertionError:
            logger.error(
                "Prometheus query failed:\n"
                "Status:\t{}\n"
                "Error Type:\t{}\n"
                "Error Message:\t{}\n"
                "Query:\n{}".format(
                    ret['status'],
                    ret['errorType'],
                    ret['error'],
                    alert_config['query']))
        except gaierror:
            logger.error(
                "Unable to connect to smtp server: {}".format(
                    service_config['smtp_server']))
        except Exception as e:
            logger.error(
                "Unhandled exception {} on alert: {}".format(
                    str(e), alert_config['id']))
        finally:
            # Always pace the loop, even after errors.
            sleep(alert_config['interval'])
# LOG ALERT RESULTS SO WE CAN DEBUG IF NEEDED
def log_alert_results(results, alert_config, logger):
    """
    Emit one debug log line per query result, prefixed with the alert id.

    Args:
        results: the results object returned from the call to kairosdb, of just
            the results
        alert_config: config object of the alert (only 'id' is read)
        logger (log object): does the logging
    Returns:
        None, logs to logger
    """
    alert_id = alert_config['id']
    for entry in results:
        logger.debug("{} - Result: {}".format(alert_id, entry))
def send_alerts(
        alert,
        alert_config,
        victorops_url,
        slack_url,
        slack_token,
        smtp_server,
        sensu_endpoint,
        uchiwa_url,
        logger):
    """
    Sends out the alerts to VO, Email, Sensu and/or Slack
    Args:
        alert: the alert tuple:
            alert[0] == subject, alert[1] == body, alert[2] == level (index
            into alert_status), alert[3] == alert_tags, alert[4] == md5sum
        alert_config: the alert configuration object (may be mutated here;
            callers pass a deep copy)
        victorops_url: url to victorops
        slack_url: url to slack api calls
        slack_token: the token for the alert
        smtp_server: The server to send mail messages too
        sensu_endpoint: URL that Sensu check results are POSTed to
        uchiwa_url: base URL of the Uchiwa dashboard linked from Sensu output
        logger (log object): does the logging
    Returns: None
    """
    # GOING TO USE THIS FOR TAGGING SOME METRICS ABOUT WHAT ALERT CHANNEL WAS
    # USED
    tag_dict = dict()
    tag_dict['alert'] = alert_config['id']
    is_custom_alert_routing = has_custom_alert_routing(alert_config)
    if is_custom_alert_routing:
        # Replace the channel config with the per-tag routing entry, falling
        # back to the lookup table's 'default' entry.
        alert_routing = alert_config.get('alert_routing_lookup', {})
        alert_config['alerts'] = alert_routing.get(
            alert[3], alert_config['alerts']['lookup']['default'])
    # once we move all alerts into sensu, we dont need to tho this
    if 'filters' in alert_config:
        logger.info(
            "alert_status : {}, alert_config: {}".format(
                alert[2], alert_config))
        # Levels 1 and 2 are warnings; subdue filters suppress those on the
        # corresponding channel so only criticals/recoveries get through.
        if 'slack_subdue' in alert_config['filters'] and alert[2] in (
                1, 2) and alert_config['filters']['slack_subdue']:
            # unless the alert is critical we dont send it
            logger.info("Removed slack, alert_config: {}".format(alert_config))
            alert_config['alerts'].pop('slack', None)
        if ('victorops_subdue' in alert_config['filters'] and
                alert[2] in (1, 2) and
                alert_config['filters']['victorops_subdue']):
            # unless the alert is critical we dont send it
            alert_config['alerts'].pop('vo', None)
            logger.info("Removed vo, alert_config: {}".format(alert_config))
    # ====================
    # VICTOROPS HANDLING
    # ====================
    if 'vo' in alert_config['alerts']:
        for notify in alert_config['alerts']['vo']:
            payload = dict(entity_id=alert[0],
                           message_type=alert_status[alert[2]],
                           state_message=alert[1])
            r = None
            # NOTE(review): "application-json" below looks like a typo for
            # "application/json" — confirm VictorOps tolerates it before
            # changing.
            try:
                r = requests.post(
                    victorops_url + notify,
                    data=json.dumps(payload),
                    headers={
                        "Content-type": "application-json"})
                assert r.status_code == 200
                # Record a VO alert sent event
                tag_dict['alert_channel_type'] = "VictorOps"
                tag_dict['who'] = "vo:{}".format(notify)
                send_stat("alert_channel", 1, tag_dict)
                # logger.info("TestInfo: {} alert for {}".format(alert_status(alert[2]), alert[0]))
            except AssertionError:
                logger.error(
                    "Post to VO failed for {}\n{}:\t{}".format(
                        alert_config['id'], r.status_code, r.text))
            except Exception as e:
                logger.error("Unhandled exception for alert_id:{} "
                             "when posting to VO: {}".format(
                                 alert_config['id'], str(e)))
    # ====================
    # EMAIL HANDLING
    # ====================
    # Only state *transitions* (recovery=0, new warning=1, new critical=3)
    # are emailed; repeats (2, 4+) are not.
    if 'email' in alert_config['alerts'] and (
            alert[2] == 0 or alert[2] == 1 or alert[2] == 3):
        msg = MIMEText(alert[1])
        msg['Subject'] = '{} Status: {}'.format(
            alert[0], alert_status[alert[2]])
        msg['From'] = 'aom@qualtrics.com'
        msg['To'] = ','.join(
            [x + "@qualtrics.com" for x in alert_config['alerts']['email']])
        try:
            s = smtplib.SMTP(smtp_server)
            s.send_message(msg)
            s.quit()
            # Record an Email alert sent event
            tag_dict['alert_channel_type'] = "Email"
            tag_dict['who'] = "email:{}".format(msg['To'])
            send_stat("alert_channel", 1, tag_dict)
            # logger.info("TestInfo: {} alert for {}".format(alert_status(alert[2]), alert[0]))
        except Exception as e:
            logger.error(
                "Unhandled exception when sending mail for {} to {}\n{}".format(
                    alert_config['id'], smtp_server, str(e)))
    # ====================
    # SENSU HANDLING
    # ====================
    if 'sensu' in alert_config['alerts']:
        # Dictionary with static values for Sensu
        sensu_dict = {
            'source': 'AOM',
            'refresh': 3600,
            'occurrences': 1,
            'name': alert_config['id']+'__'+alert[4]}
        # if alert[3]:
        #     logger.info(alert)
        #     sensu_dict['name'] = '_'.join(
        #         [alert_config['id']] + sorted(list(alert[3])))
        if 'refresh' in alert_config:
            sensu_dict['refresh'] = alert_config['refresh']
        sensu_dict['interval'] = alert_config['interval']
        sensu_dict['handlers'] = []
        sensu_dict['dashboard'] = alert_config['url']
        if 'dependencies' in alert_config['alerts']['sensu'].keys():
            sensu_dict['dependencies'] = (alert_config['alerts']
                                          ['sensu']['dependencies'])
        if 'victorops' in alert_config['alerts']['sensu'].keys():
            sensu_dict['handlers'].append("victorops")
            sensu_dict['routing_key'] = (alert_config['alerts']
                                         ['sensu']['victorops'])
        # # Leave this here until we have email support in Sensu
        # if 'email' in alert_config['alerts']['sensu'].keys():
        #     sensu_dict['handlers'].append("email")
        #     # verify this option
        #     sensu_dict['email'] = alert_config['alerts']['sensu']['email']
        if 'slack' in alert_config['alerts']['sensu'].keys():
            sensu_dict['handlers'].append("slack")
            sensu_dict['slack_channel'] = (
                alert_config['alerts']['sensu']['slack'])
            # Format alert message
            sensu_dict['dashboard'] = (
                "<{}|here> , Uchiwa: <{}?check={}|here> ".format(
                    alert_config['url'], uchiwa_url, alert_config['id']))
        if 'jira' in alert_config['alerts']['sensu'].keys():
            sensu_dict['handlers'].append("jira")
            sensu_dict.update(alert_config['alerts']['sensu']['jira'])
        if 'filters' in alert_config:
            sensu_dict['filters'] = alert_config['filters']
        # 0 = OK, 1 = WARNING, 2 = CRITICAL
        sensu_status = {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2}
        sensu_dict['status'] = sensu_status[alert[2]]
        sensu_dict['output'] = alert[1]
        r = None
        try:
            user = os.environ['API_USER']
            passwd = os.environ['API_PASS']
            r = requests.post(
                sensu_endpoint,
                json.dumps(sensu_dict),
                auth=(
                    user,
                    passwd))
            assert r.status_code == 202
        except AssertionError:
            logger.error(
                "Post to Sensu failed {}\n{}:\t{}".format(
                    alert_config['id'],
                    r.status_code,
                    r.text))
        except Exception as e:
            logger.error("Unhandled exception for alert_id:{} "
                         "when posting to Sensu: {}".format(
                             alert_config['id'], str(e)))
    # ====================
    # SLACK HANDLING - all Slack alerts will go through Sensu
    # ====================
    # As with email, only state transitions (0, 1, 3) are forwarded.
    if 'slack' in alert_config['alerts'] and (
            alert[2] == 0 or alert[2] == 1 or alert[2] == 3):
        refresh = alert_config.get('refresh', 3600)
        dashboard = alert_config.get('url', '')
        sensu_status = {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2}
        sensu_dict2 = {'handlers': ['slack'],
                       'interval': alert_config['interval'],
                       'source': 'AOM',
                       'refresh': refresh,
                       'occurrences': 1,
                       'name': alert_config['id']+'__'+alert[4],
                       'dashboard': dashboard,
                       'status': sensu_status[alert[2]],
                       'output': alert[1]}
        if is_custom_alert_routing:
            sensu_dict2['name'] = '_'.join(
                [alert_config['id']] + list(alert[3]))
            sensu_dict2['dashboard'] = (
                "<{}|here> , Uchiwa: <{}?check={}|here> ".format(
                    alert_config['url'], uchiwa_url, alert_config['id']))
        for channel in alert_config['alerts']['slack']:
            sensu_dict2['slack_channel'] = channel
            r = None
            try:
                user = os.environ['API_USER']
                passwd = os.environ['API_PASS']
                r = requests.post(
                    sensu_endpoint,
                    json.dumps(sensu_dict2),
                    auth=(
                        user,
                        passwd))
                assert r.status_code == 202
            except AssertionError:
                logger.error(
                    "Post to Sensu failed {}\n{}:\t{}".format(
                        alert_config['id'], r.status_code, r.text))
            except Exception as e:
                logger.error("Unhandled exception for alert_id:{} when posting"
                             "to Sensu: {}".format(alert_config['id'], str(e)))
            # payload = dict(token=slack_token, channel=channel,
            # text="{} Status: {}".format(alert[1], alert_status[alert[2]]))
            # r = None
            # try:
            #     r = requests.post(slack_url, data=payload)
            #     assert r.status_code == 200
            #     # Record an Slack alert sent event
            #     tag_dict['alert_channel_type'] = "Slack"
            #     tag_dict['who'] = "slack:{}".format(channel)
            #     send_stat("alert_channel", 1, tag_dict)
            #     # logger.info("TestInfo: {} alert for {}".format(alert_status(alert[2]), alert[0]))
            # except AssertionError:
            #     logger.error("Post to Slack failed for {}\n{}:\t{}".format(alert_config['id'], r.status_code, r.text))
            # except Exception as e:
            #     logger.error("Unhandled exception for alert_id:{} when posting to Slack: {}".format(alert_config['id'],
            #                                                                                         str(e)))
def send_metrics(alert, value, result, gaugename='stats'):
    """
    Sends the results from the alert check to statsd
    Args:
        alert: The Alert config object that holds the alert['tag'] value.
        gaugename: The name of the gauge metric we send.
        value: The value we want to send as a gauge.
        result: The result object from making the call. Use the data in this
            object to tag the metric.
    Returns: None
    """
    # GROUP ALL THE ALERTS TOGETHER SO THAT PEEPS CAN FILTER OUT BY TAG THEIR
    # SPECIFIC ALERTS
    tag_names = alert['tags']
    # Flatten the per-tag value lists from the result in configured order,
    # then pair them up positionally with the tag names.
    flat_values = list(itertools.chain.from_iterable(
        result['tags'][name] for name in tag_names))
    tag_dict = {name: flat_values[pos] for pos, name in enumerate(tag_names)}
    tag_dict['alert'] = alert['id']
    # SEND THE METRIC
    send_stat(gaugename, value, tag_dict)
def send_stat(gaugename, value, tag_dict, statprefix='aom'):
    """Sends stats value to statsd"""
    # A short-lived client per call, pointed at the local telegraf agent.
    statsd_client = StatsClient('telegraf', 8125, statprefix)
    # SUBMIT STATS
    statsd_client.gauge(gaugename, value, tags=tag_dict)
def has_custom_alert_routing(alert_config):
    """Checks if alert has custom routing (a 'lookup' table under 'alerts')"""
    alerts_section = alert_config['alerts']
    return 'lookup' in alerts_section
def get_alert_tags(alert_config, query_result):
    """Retrieves custom routing tag values from a query result.

    Supports prometheus-style results ('metric' dict of scalars) and
    kairosdb-style results ('tags' dict of non-empty lists); tags missing
    from the result are skipped. Returns a tuple of values.
    """
    is_prometheus = alert_config.get('query_type') == 'prometheus'
    collected = []
    for name in alert_config['alerts']['lookup']['tags']:
        if (is_prometheus and 'metric' in query_result
                and name in query_result['metric']):
            collected.append(query_result['metric'][name])
            continue
        if ('tags' in query_result and name in query_result['tags']
                and query_result['tags'][name]):
            collected.append(query_result['tags'][name][0])
    return tuple(collected)

View File

@@ -0,0 +1,123 @@
import unittest
class Mock_Alert_Config():
    """Stand-in for Alert_Config used by the Alert unit tests below."""

    def __init__(self):
        self.cache = {}
        self.level = {}
        self.id = "id"

    def set_level(self, key, value):
        self.level[key] = value

    def get_level(self, key):
        # Missing keys read as None without being inserted.
        return self.level.get(key)

    def init_for_tags(self, *args):
        pass

    def occurrences(self):
        return 1

    def get_threshold(self, upper, warning):
        # Warning thresholds are unset; criticals are upper=10 / lower=0.
        if warning:
            return None, False
        return (10, True) if upper else (0, True)

    def get_tags(self):
        return ["tagsC", "tagsD"]

    def set_for_tags(self, key, value):
        self.cache[key] = value

    def get_for_tags(self, key):
        # Default missing keys to 0, inserting them as the real config does.
        return self.cache.setdefault(key, 0)
class Mock_Result():
    """Result double: indexing with "tags" yields the object itself, any other key is echoed back."""

    def __getitem__(self, key):
        return self if key == "tags" else key
class Mock_Logger():
    """Logger double whose level methods are all silent no-ops."""

    def log(self, *args):
        pass

    # Every level method shares the same no-op implementation.
    error = warn = warning = info = debug = log
class Test_Alert(unittest.TestCase) :
    """Tests for alert.Alert, driven by the mock config/result/logger above.

    The mock config reports a critical upper threshold of 10 and a critical
    lower threshold of 0 (warnings unset); the min/max constructor arguments
    below are chosen around those values.
    """
    def test_set_tags(self) :
        import alert
        ac = Mock_Alert_Config()
        res = Mock_Result()
        # With no tags and no result the alert falls back to "instance".
        al = alert.Alert(ac, Mock_Logger(), None, None, -1, 11)
        self.assertEqual(al.get_tags(), "instance")
        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, -1, 11)
        self.assertEqual(al.get_tags(), "tagsA, tagsB")
        al.set_tags("a, b, c", res)
        self.assertEqual(al.get_tags(), "a, b, c")
        # Re-applying the same tags must leave them unchanged.
        al.set_tags("a, b, c", res)
        self.assertEqual(al.get_tags(), "a, b, c")
    def test_firing(self) :
        import alert
        ac = Mock_Alert_Config()
        res = Mock_Result()
        # min/max pairs straddle the mock thresholds (lower 0, upper 10):
        # a breach on either end should report firing.
        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, -1, 11)
        self.assertTrue(al.get_firing())
        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, 1, 11)
        self.assertTrue(al.get_firing())
        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, -1, 9)
        self.assertTrue(al.get_firing())
        # 1..9 sits strictly inside both thresholds: no firing.
        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, 1, 9)
        self.assertFalse(al.get_firing())
    def test_str(self) :
        import alert
        ac = Mock_Alert_Config()
        res = Mock_Result()
        # NOTE(review): rebinding the name `alert` to the instance shadows
        # the module for the rest of this method (harmless here).
        alert = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, 0, 10)
        self.assertEqual(alert.name(), "Metric: id for tagsA, tagsB")
        self.assertEqual(alert.body(), "")
    def test_occurrences(self) :
        import alert
        ac = Mock_Alert_Config()
        res = Mock_Result()
        # max == 10 does not exceed the upper threshold of 10, so the
        # occurrences flag stays False and the per-tag counter stays 0.
        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, 0, 10)
        self.assertEqual(False, al.occurrences_breached)
        al.set_occurrences()
        al.set_occurrences()
        al.set_occurrences()
        self.assertEqual(False, al.occurrences_breached)
        self.assertEqual(0, ac.get_for_tags(al.get_tags()))
        # max == 11 breaches; after construction plus three
        # set_occurrences() calls the per-tag counter reads 4 -- presumably
        # one increment per evaluation (TODO confirm against alert.Alert).
        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, 0, 11)
        self.assertEqual(True, al.occurrences_breached)
        al.set_occurrences()
        al.set_occurrences()
        al.set_occurrences()
        self.assertEqual(True, al.occurrences_breached)
        self.assertEqual(4, ac.get_for_tags(al.get_tags()))
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

View File

@@ -0,0 +1,33 @@
import unittest
import alert_factory
class Mock_Alert():
    """Alert double that simply records its constructor arguments."""
    def __init__(self, *args):
        self.args = args
class Mock_Logger() :
def __init__(self) :
self.info = self.log
self.warn = self.log
self.warning = self.log
self.error = self.log
self.debug = self.log
def log(self, *args, **kwargs) :
print(args, kwargs)
class Test_Alert_Factory(unittest.TestCase):
    """Alert_Factory.build must construct the (patched) Alert class."""
    def setUp(self):
        # Swap the real Alert class for the recording mock.
        self.was = alert_factory.Alert
        alert_factory.Alert = Mock_Alert
    def tearDown(self):
        # Restore the real Alert class.
        alert_factory.Alert = self.was
    def test(self):
        factory = alert_factory.Alert_Factory(None, Mock_Logger())
        built = factory.build(0, 5, None, "tagA, tagB", False, "tagC, tagD")
        self.assertIs(type(built), Mock_Alert)
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

View File

@@ -0,0 +1,8 @@
import unittest
class Test_Service(unittest.TestCase) :
    """Placeholder suite for the service module."""
    def test(self) :
        # TODO: implement; fails deliberately so the coverage gap is visible.
        raise Exception("not impl")
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

View File

@@ -0,0 +1,14 @@
import unittest
class Test_Threshold(unittest.TestCase):
    """The Threshold base class represents an unset limit: it never breaches."""
    def test(self):
        import threshold
        base = threshold.Threshold(5)
        self.assertFalse(base.can_breach())
        # Neither side of the limit counts as exceeding for the base class.
        for value in (7, 3):
            self.assertFalse(base.exceeds(value))
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

View File

@@ -0,0 +1,14 @@
import unittest
class Test_Threshold_Lower(unittest.TestCase):
    """Threshold_Lower(5) breaches, and values below 5 exceed it."""
    def test(self):
        import threshold_lower
        tl = threshold_lower.Threshold_Lower(5)
        # Bug fix: the original asserted on the *bound method* object
        # (`tl.can_breach`), which is always truthy; call it instead.
        self.assertTrue(tl.can_breach())
        self.assertTrue(tl.exceeds(3))
        self.assertFalse(tl.exceeds(7))
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

View File

@@ -0,0 +1,14 @@
import unittest
class Test_Threshold_Upper(unittest.TestCase):
    """Threshold_Upper(5) breaches, and values above 5 exceed it."""
    def test(self):
        import threshold_upper
        tl = threshold_upper.Threshold_Upper(5)
        # Bug fix: the original asserted on the *bound method* object
        # (`tl.can_breach`), which is always truthy; call it instead.
        self.assertTrue(tl.can_breach())
        self.assertTrue(tl.exceeds(7))
        self.assertFalse(tl.exceeds(3))
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

View File

@@ -0,0 +1,157 @@
import unittest
class Mock_Alert_Config():
    """Config double: warning thresholds unset, critical upper/lower fixed."""
    def __init__(self):
        self.upCrit = 10
        self.lowCrit = 1
    def get_threshold(self, upper, warn):
        """Return (value, is_set) for the requested threshold."""
        if warn:
            return None, False
        value = self.upCrit if upper else self.lowCrit
        return value, True
class Test_Thresholds(unittest.TestCase) :
    """Tests for thresholds.Thresholds driven by Mock_Alert_Config, which
    sets only the two critical thresholds (upper 10, lower 1); warning
    thresholds are unset and can therefore never breach."""
    def test_breached_both(self) :
        # min below the lower critical AND max above the upper critical:
        # every critical- and end-scoped query fires; warning queries do not.
        import thresholds
        alert_config = Mock_Alert_Config()
        t = thresholds.Thresholds(alert_config)
        t.set_breached(alert_config.lowCrit-1, alert_config.upCrit+1)
        should_fire = [
            t.critical_breached(),
            t.lower_breached(),
            t.upper_breached(),
            t.level_breached(t.CRITICAL),
            t.end_breached(t.LOWER),
            t.end_breached(t.UPPER),
            t.get_breached(),
            t.get_breached(level=t.CRITICAL),
            t.get_breached(end=t.LOWER),
            t.get_breached(end=t.UPPER),
        ]
        for i in range(len(should_fire)) :
            self.assertTrue(should_fire[i], i)
        should_not_fire = [
            t.warning_breached(),
            t.level_breached(t.WARNING),
            t.get_breached(level=t.WARNING),
        ]
        for i in range(len(should_not_fire)) :
            self.assertFalse(should_not_fire[i], i)
    def test_breached_lower(self) :
        # Only the lower critical threshold is crossed.
        import thresholds
        alert_config = Mock_Alert_Config()
        t = thresholds.Thresholds(alert_config)
        t.set_breached(alert_config.lowCrit-1, alert_config.upCrit)
        should_fire = [
            t.critical_breached(),
            t.lower_breached(),
            t.level_breached(t.CRITICAL),
            t.end_breached(t.LOWER),
            t.get_breached(),
            t.get_breached(level=t.CRITICAL),
            t.get_breached(end=t.LOWER),
        ]
        for i in range(len(should_fire)) :
            self.assertTrue(should_fire[i], i)
        should_not_fire = [
            t.warning_breached(),
            t.upper_breached(),
            t.level_breached(t.WARNING),
            t.end_breached(t.UPPER),
            t.get_breached(level=t.WARNING),
            t.get_breached(end=t.UPPER),
        ]
        for i in range(len(should_not_fire)) :
            self.assertFalse(should_not_fire[i], i)
    def test_breached_upper(self) :
        # Only the upper critical threshold is crossed.
        import thresholds
        alert_config = Mock_Alert_Config()
        t = thresholds.Thresholds(alert_config)
        t.set_breached(alert_config.lowCrit, alert_config.upCrit+1)
        should_fire = [
            t.critical_breached(),
            t.upper_breached(),
            t.level_breached(t.CRITICAL),
            t.end_breached(t.UPPER),
            t.get_breached(),
            t.get_breached(level=t.CRITICAL),
            t.get_breached(end=t.UPPER),
        ]
        for i in range(len(should_fire)) :
            self.assertTrue(should_fire[i], i)
        for i in [
            t.warning_breached(),
            t.lower_breached(),
            t.level_breached(t.WARNING),
            t.end_breached(t.LOWER),
            t.get_breached(level=t.WARNING),
            t.get_breached(end=t.LOWER),
        ] :
            self.assertFalse(i)
    def test_breached_notset(self) :
        # Before set_breached() runs, nothing reports as breached.
        import thresholds
        alert_config = Mock_Alert_Config()
        t = thresholds.Thresholds(alert_config)
        for i in [
            t.warning_breached(),
            t.critical_breached(),
            t.upper_breached(),
            t.lower_breached(),
            t.level_breached(t.CRITICAL),
            t.level_breached(t.WARNING),
            t.end_breached(t.UPPER),
            t.end_breached(t.LOWER),
            t.get_breached(),
            t.get_breached(level=t.CRITICAL),
            t.get_breached(level=t.WARNING),
            t.get_breached(end=t.UPPER),
            t.get_breached(end=t.LOWER),
        ] :
            self.assertFalse(i)
    def test_get_matching(self) :
        import thresholds
        alert_config = Mock_Alert_Config()
        t = thresholds.Thresholds(alert_config)
        self.assertEqual(4, len([i for i in t.get_thresholds_matching()]))
        self.assertEqual(2, len([i for i in t.get_thresholds_matching(level=t.CRITICAL)]))
        self.assertEqual(2, len([i for i in t.get_thresholds_matching(level=t.WARNING)]))
        self.assertEqual(2, len([i for i in t.get_thresholds_matching(end=t.UPPER)]))
        self.assertEqual(2, len([i for i in t.get_thresholds_matching(end=t.LOWER)]))
        # Bug fix: the original passed level constants as end= and vice
        # versa (e.g. end=t.CRITICAL, level=t.LOWER); it only passed because
        # WARNING/UPPER and CRITICAL/LOWER share boolean values.
        self.assertEqual(1, len([i for i in t.get_thresholds_matching(level=t.CRITICAL, end=t.LOWER)]))
        self.assertEqual(1, len([i for i in t.get_thresholds_matching(level=t.CRITICAL, end=t.UPPER)]))
        self.assertEqual(1, len([i for i in t.get_thresholds_matching(level=t.WARNING, end=t.LOWER)]))
        self.assertEqual(1, len([i for i in t.get_thresholds_matching(level=t.WARNING, end=t.UPPER)]))
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

View File

@@ -0,0 +1,19 @@
class Threshold():
    """Inert base threshold.

    Holds a limit value but never considers it exceeded; Threshold_Upper /
    Threshold_Lower override exceeds() and can_breach() to add a direction.
    """
    def __init__(self, threshold):
        self.threshold = threshold  # the configured limit (may be None)
        self.breached = False       # last result recorded by set_breached()
    def can_breach(self):
        """The base class represents an unset threshold: it cannot breach."""
        return False
    def exceeds(self, value):
        """No value exceeds an unset threshold."""
        return False
    def set_breached(self, value):
        """Record whether *value* exceeds the limit (always False here)."""
        self.breached = self.exceeds(value)
    def get_breached(self):
        """Return the result of the most recent set_breached() call."""
        return self.breached
    def get_threshold(self):
        """Return the configured limit value."""
        return self.threshold

View File

@@ -0,0 +1,8 @@
from threshold import Threshold
class Threshold_Lower(Threshold):
    """Breachable lower bound: values strictly below the limit exceed it."""
    def can_breach(self):
        return True
    def exceeds(self, value):
        return value < self.threshold

View File

@@ -0,0 +1,8 @@
from threshold import Threshold
class Threshold_Upper(Threshold):
    """Breachable upper bound: values strictly above the limit exceed it."""
    def can_breach(self):
        return True
    def exceeds(self, value):
        return value > self.threshold

View File

@@ -0,0 +1,67 @@
from threshold_upper import Threshold_Upper
from threshold_lower import Threshold_Lower
from threshold import Threshold
class Thresholds() :
    """Holds the four (level, end) threshold cells for one alert config.

    Levels and ends are boolean-encoded: WARNING/CRITICAL select the
    severity, UPPER/LOWER the direction.  A cell whose value is absent from
    the alert config is backed by the inert Threshold base class, which
    never breaches.
    """
    WARNING = True
    CRITICAL = False
    UPPER = True
    LOWER = False
    def __init__(self, alert_config) :
        self.alert_config = alert_config
        self.thresholds = {}
        for level in [ Thresholds.WARNING, Thresholds.CRITICAL ] :
            self.thresholds[level] = {}
            for end in [ Thresholds.UPPER, Thresholds.LOWER ] :
                constructor = Threshold_Upper
                if end == Thresholds.LOWER :
                    constructor = Threshold_Lower
                self.thresholds[level][end] = self.create_threshold(end, level, constructor)
    def create_threshold(self, isUpper, isWarning, constructor) :
        """Build one cell; fall back to the inert Threshold when the config
        reports no value for this (end, level) combination."""
        value, has = self.alert_config.get_threshold(isUpper, isWarning)
        if not has :
            constructor = Threshold
        return constructor(value)
    def warning_breached(self) :
        return self.level_breached(Thresholds.WARNING)
    def critical_breached(self) :
        return self.level_breached(Thresholds.CRITICAL)
    def upper_breached(self) :
        return self.end_breached(Thresholds.UPPER)
    def lower_breached(self) :
        return self.end_breached(Thresholds.LOWER)
    def level_breached(self, level) :
        return self.get_breached(level=level)
    def end_breached(self, end) :
        return self.get_breached(end=end)
    def can_breach(self) :
        """True when at least one cell holds a real (directional) threshold.

        Bug fix: the original called self.thresholds.get_thresholds_matching(),
        but self.thresholds is a plain dict -- the generator is a method of
        this class, so every call to can_breach() raised AttributeError.
        """
        can_breach = [t for t in self.get_thresholds_matching() if not type(t) is Threshold]
        return len(can_breach) > 0
    def get_breached(self, level=None, end=None) :
        """True if any cell matching the optional level/end filter breached."""
        for threshold in self.get_thresholds_matching(level=level, end=end) :
            if threshold.get_breached() :
                return True
        return False
    def set_breached(self, min_value, max_value) :
        """Evaluate the observed extremes: lower cells see min_value, upper
        cells see max_value."""
        for threshold in self.get_thresholds_matching(end=Thresholds.LOWER) :
            threshold.set_breached(min_value)
        for threshold in self.get_thresholds_matching(end=Thresholds.UPPER) :
            threshold.set_breached(max_value)
    def get_thresholds_matching(self, level=None, end=None) :
        """Yield cells matching the optional level/end filter (None = all)."""
        for l in self.thresholds :
            if level is None or l == level :
                for e in self.thresholds[l] :
                    if end is None or e == end :
                        yield self.thresholds[l][e]

View File

@@ -0,0 +1,14 @@
import unittest
class Test_Alert_Config(unittest.TestCase):
    """Alert_Config must reject a missing config and expose its id."""
    def test(self):
        from alert_config import Alert_Config
        # A nil yaml_config cannot be subscripted and must raise.
        with self.assertRaises(Exception):
            Alert_Config(None)
        self.assertEqual("a", Alert_Config({"id": "a"}).id)
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

View File

@@ -0,0 +1,54 @@
import unittest
import alert_config_list
class Mock_Alert_Config():
    """Minimal config double carrying only an id."""
    def __init__(self, id):
        self.id = id
class Test_Alert_Config_List(unittest.TestCase):
    """Tests for Alert_Config_List with Alert_Config patched to a plain mock."""
    def setUp(self) :
        # Patch the element class so the list stores lightweight mocks.
        self.was = alert_config_list.Alert_Config
        alert_config_list.Alert_Config = Mock_Alert_Config
        self.al = alert_config_list.Alert_Config_List()
    def tearDown(self) :
        alert_config_list.Alert_Config = self.was
        self.al = None
    def test_add(self) :
        # add() must de-duplicate by id and accept a single config, a plain
        # list of configs, or another Alert_Config_List.
        self.al.add(Mock_Alert_Config("a"))
        self.assertEqual(len(self.al), 1)
        self.al.add([Mock_Alert_Config("a")])
        self.assertEqual(len(self.al), 1)
        self.al.add([Mock_Alert_Config("b")])
        self.assertEqual(len(self.al), 2)
        self.al.add(Mock_Alert_Config("c"))
        self.assertEqual(len(self.al), 3)
        other = alert_config_list.Alert_Config_List()
        other.add(Mock_Alert_Config("d"))
        self.al.add(other)
        self.assertEqual(len(self.al), 4)
    def test_compare(self) :
        # compare(new) classifies ids: only-in-new -> added, only-in-self ->
        # removed, present in both -> modified.
        self.al.add(Mock_Alert_Config("a"))
        self.al.add(Mock_Alert_Config("b"))
        self.al.add(Mock_Alert_Config("c"))
        new = alert_config_list.Alert_Config_List()
        new.add(Mock_Alert_Config("a"))
        new.add(Mock_Alert_Config("y"))
        new.add(Mock_Alert_Config("z"))
        added, removed, modified = self.al.compare(new)
        if not "y" in added or not "z" in added :
            self.fail("added is missing elements")
        if not "b" in removed or not "c" in removed :
            self.fail("removed is missing elements")
        if not "a" in modified :
            self.fail("modified is missing elements")
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

34
AoM_Service/library/test_job.py Executable file
View File

@@ -0,0 +1,34 @@
import unittest
import job
class Mock_Subprocess():
    """Process double recording whether call()/join() were invoked."""
    # Class-level defaults; instances shadow them on first use.
    called = False
    joined = False
    pid = None
    def call(self, *args, **kwargs):
        self.called = True
    def join(self, *args, **kwargs):
        self.joined = True
class Test_Job(unittest.TestCase):
    """Job.kill must join its process and invoke the patched subprocess."""
    def setUp(self):
        # Replace the module-level subprocess with a recording double.
        self.was = job.subprocess
        self.subprocess = Mock_Subprocess()
        job.subprocess = self.subprocess
    def tearDown(self):
        job.subprocess = self.was
    def test(self):
        proc = Mock_Subprocess()
        j = job.Job("id", proc)
        j.kill()
        self.assertEqual(True, proc.joined)
        self.assertEqual(True, self.subprocess.called)
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

View File

@@ -0,0 +1,50 @@
import unittest
import job_list
class Mock_Job():
    """Job double: keeps its id, ignores the process, kill() is a no-op."""
    def __init__(self, id, p):
        self.id = id
    def kill(self):
        return None
class Test_Job_List(unittest.TestCase):
    """Tests for job_list.Job_List with Job patched to a mock."""
    def setUp(self) :
        self.was = job_list.Job
        job_list.Job = Mock_Job
    def tearDown(self) :
        job_list.Job = self.was
    def test_add(self) :
        jl = job_list.Job_List()
        self.assertEqual(len(jl), 0)
        # None must be rejected rather than silently stored.
        try :
            jl.add(None)
            self.fail("can add nil to job_list")
        except Exception :
            pass
        # add() de-duplicates by job id and also accepts another Job_List.
        jl.add(Mock_Job("a", "a"))
        self.assertEqual(len(jl), 1)
        jl.add(Mock_Job("a", "a"))
        self.assertEqual(len(jl), 1)
        jl.add(Mock_Job("b", "b"))
        self.assertEqual(len(jl), 2)
        other = job_list.Job_List()
        other.add(Mock_Job("b", "b"))
        other.add(Mock_Job("c", "c"))
        jl.add(other)
        self.assertEqual(len(jl), 3)
        # kill(id) removes the matching job from the list.
        jl.kill("a")
        self.assertEqual(len(jl), 2)
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

View File

@@ -0,0 +1,31 @@
import unittest
import process
class Mock_Multiprocessing():
    """multiprocessing.Process double recording its constructor arguments."""
    def __init__(self, *args, **kwargs):
        self.args = args
        self.kwargs = kwargs
    def get_target(self):
        """Mirror Process.get_target; the mock has no target."""
        return None
class Mock_Alert_Config():
    """Minimal config double carrying only an id."""
    def __init__(self, id):
        self.id = id
class Test_Process(unittest.TestCase):
    """process.Process must construct cleanly with multiprocessing patched."""
    def setUp(self):
        self.was = process.multiprocessing.Process
        process.multiprocessing.Process = Mock_Multiprocessing
    def tearDown(self):
        process.multiprocessing.Process = self.was
    def test(self):
        # Process is parameterised over get_target(); give it a trivial one.
        class MockProcess(process.Process):
            def get_target(self):
                return None
        MockProcess(Mock_Alert_Config("a"), {}, None, True)
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

View File

@@ -0,0 +1,36 @@
import unittest
import process_factory
class Mock_Process_Prometheus():
    """Stand-in for Process_Prometheus; accepts and ignores any arguments."""
    def __init__(self, *args, **kwargs):
        pass
class Mock_Process_Kairos():
    """Stand-in for Process_Kairos; accepts and ignores any arguments."""
    def __init__(self, *args, **kwargs):
        pass
class Mock_Alert_Config():
    """Config double whose type() method echoes the constructor argument."""
    def __init__(self, type):
        self.t = type
    def type(self):
        return self.t
class Test_Process_Factory(unittest.TestCase):
    """build() dispatches on the config type: 'prometheus' vs anything else."""
    def setUp(self):
        self.was_prom = process_factory.process_prometheus.Process_Prometheus
        self.was_kai = process_factory.process_kairos.Process_Kairos
        process_factory.process_prometheus.Process_Prometheus = Mock_Process_Prometheus
        process_factory.process_kairos.Process_Kairos = Mock_Process_Kairos
    def tearDown(self):
        process_factory.process_prometheus.Process_Prometheus = self.was_prom
        process_factory.process_kairos.Process_Kairos = self.was_kai
    def test(self):
        factory = process_factory.Process_Factory(None, None, None)
        # Any non-prometheus type falls back to the kairos process.
        self.assertIs(type(factory.build(Mock_Alert_Config("a"))), Mock_Process_Kairos)
        self.assertIs(type(factory.build(Mock_Alert_Config("prometheus"))), Mock_Process_Prometheus)
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

View File

@@ -0,0 +1,15 @@
import unittest
class Mock_Alert_Config():
    """Minimal config double carrying only an id."""
    def __init__(self, id):
        self.id = id
class Test_Process_Kairos(unittest.TestCase):
    """Process_Kairos must target service.check_kairosdb_alert."""
    def test(self):
        import process_kairos
        from serviceapp import service
        proc = process_kairos.Process_Kairos(Mock_Alert_Config("a"), None, None, None)
        self.assertIs(proc.get_target(), service.check_kairosdb_alert)
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

View File

@@ -0,0 +1,15 @@
import unittest
class Mock_Alert_Config():
    """Minimal config double carrying only an id."""
    def __init__(self, id):
        self.id = id
class Test_Process_Prometheus(unittest.TestCase):
    """Process_Prometheus must target service.check_prometheus_alert."""
    def test(self):
        import process_prometheus
        from serviceapp import service
        proc = process_prometheus.Process_Prometheus(Mock_Alert_Config("a"), None, None, None)
        self.assertIs(proc.get_target(), service.check_prometheus_alert)
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

View File

@@ -0,0 +1,100 @@
import unittest
from serviceapp import service as serviceapp
import time
import config
import service
class Mock_ServiceApp_Service():
    """serviceapp double: records construction args; send_stat is a no-op."""
    def __init__(self, *args, **kwargs):
        self.args = args
        self.kwargs = kwargs
    def send_stat(self, *args, **kwargs):
        return None
class Mock_Logger() :
def __init__(self) :
self.lines = []
self.info = self.log
self.warn = self.log
self.warning = self.log
self.debug = self.log
self.error = self.log
def log(self, *args, **kwargs) :
self.lines.append("{}, {}".format(args, kwargs))
print(self.lines[-1])
def Mock_Sleep(t):
    """time.sleep replacement: returns immediately."""
    return None
def Mock_Get_Healthy(*args, **kwargs):
    """config.get_healthy_nodes_and_index replacement: index 0, one node."""
    return (0, 1)
def Mock_Distribute_Configs(*args, **kwargs):
    """config.distribute_configs replacement: always reports success."""
    return True
def Mock_Is_Valid(*args, **kwargs):
    """config.is_valid replacement: every config is considered valid."""
    return True
def ignore_warnings(test_func):
    """Decorator for test methods: run the wrapped method with all warnings
    suppressed.  The wrapper discards test_func's return value, which is
    fine for unittest methods."""
    import warnings
    def wrapper(self, *args, **kwargs):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            test_func(self, *args, **kwargs)
    return wrapper
class Test_Service(unittest.TestCase) :
    """Smoke test: Service.start runs one loop iteration with all external
    collaborators (serviceapp checks, config helpers, sleeping) patched out."""
    def setUp(self) :
        # Save originals, then patch module attributes with inert mocks.
        self.mock_serviceapp_service = Mock_ServiceApp_Service
        self.was_k = serviceapp.check_kairosdb_alert
        self.was_p = serviceapp.check_prometheus_alert
        self.was_service = service.service
        self.was_sleep = time.sleep
        self.was_get_healthy = config.get_healthy_nodes_and_index
        self.was_distribute = config.distribute_configs
        self.was_is_valid = config.is_valid
        serviceapp.check_kairosdb_alert = self.mock_serviceapp_service
        serviceapp.check_prometheus_alert = self.mock_serviceapp_service
        config.get_healthy_nodes_and_index = Mock_Get_Healthy
        config.distribute_configs = Mock_Distribute_Configs
        config.is_valid = Mock_Is_Valid
        serviceapp.sleep = Mock_Sleep
        service.sleep = Mock_Sleep
        time.sleep = Mock_Sleep
    def tearDown(self) :
        # NOTE(review): this rebinds the *name* `serviceapp` to
        # self.was_service instead of restoring service.service; the
        # attribute restores below then act on that object, so the imported
        # serviceapp module may be left patched.  Looks like a bug -- confirm.
        serviceapp = self.was_service
        serviceapp.check_kairosdb_alert = self.was_k
        serviceapp.check_prometheus_alert = self.was_p
        config.get_healthy_nodes_and_index = self.was_get_healthy
        config.distribute_configs = self.was_distribute
        config.is_valid = self.was_is_valid
        time.sleep = self.was_sleep
        serviceapp.sleep = self.was_sleep
        service.sleep = self.was_sleep
    @ignore_warnings
    def test(self) :
        import service
        logger = Mock_Logger()
        s = service.Service(logger, 100, "HOST", {
            "alert_folder": "./testdata",
            "alert_routing_config": {},
        })
        # Make is_running() return True exactly once so the service loop
        # executes a single iteration and then stops.
        global first
        first = True
        def f() :
            global first
            is_first = first
            first = False
            return is_first
        def purge_stale(*args) :
            return
        s.is_running = f
        s.purge_stale = purge_stale
        s.start()
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

20
AoM_Service/library/testdata/engine.yaml vendored Executable file
View File

@@ -0,0 +1,20 @@
---
id: sleeper_agents_milleniumfalcon_engine_failing
service: core
alerts:
slack:
- '#breel_testing_alerts'
vo:
- gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
suppressed_occurrences_threshold: 24
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_engine_failing) by (dc)
tags:
- dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
service_dependencies: ['fuel']

18
AoM_Service/library/testdata/fuel.yaml vendored Executable file
View File

@@ -0,0 +1,18 @@
---
id: sleeper_agents_milleniumfalcon_fuellevel_low
service: fuel
alerts:
slack:
- '#breel_testing_alerts'
vo:
- gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_fuellevel_low) by (dc)
tags:
- dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1

20
AoM_Service/library/testdata/lightspeed.yaml vendored Executable file
View File

@@ -0,0 +1,20 @@
---
id: sleeper_agents_milleniumfalcon_lightspeed_unavailable
service: captain
alerts:
slack:
- '#breel_testing_alerts'
vo:
- gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
suppressed_occurrences_threshold: 48
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_lightspeed_unavailable) by (dc)
tags:
- dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
service_dependencies: ['core']

20
AoM_Service/library/testdata/shields.yaml vendored Executable file
View File

@@ -0,0 +1,20 @@
---
id: sleeper_agents_milleniumfalcon_shields_unavailable
service: core
alerts:
slack:
- '#breel_testing_alerts'
vo:
- gobs-mm
critical_upper_threshold: 1.0
interval: 5
suppressed_occurrences_threshold: 54
start_time: '-60'
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_shields_unavailable) by (dc)
tags:
- dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
service_dependencies: ['fuel']