This commit is contained in:
bel
2021-09-12 22:16:11 -06:00
commit ceeb6f0385
129 changed files with 9221 additions and 0 deletions

15
sleeper_agents_aom_engine/.gitignore vendored Executable file
View File

@@ -0,0 +1,15 @@
# Created by .ignore support plugin (hsz.mobi)
### Vagrant template
.vagrant/
.idea/
build/results
logs/
*.pyc
.dockerignore
Dockerfile
build/builder
site-packages.tar.gz
alert_configs
AoM_Configs

3
sleeper_agents_aom_engine/.gitmodules vendored Executable file
View File

@@ -0,0 +1,3 @@
[submodule "AlertOnMetrics"]
path = AoM_Configs
url = ssh://git@gitlab-app.eng.qops.net:10022/sleeper-agents/AlertOnMetrics.git

View File

@@ -0,0 +1,67 @@
#!/usr/bin/env groovy
// Deploy pipeline: no build, tests ran during the merge request; publishes
// and triggers the Rundeck deploy job only on origin/master.
pipeline {
    agent {label 'nomad-builder'}
    environment {
        DOCKER_HOST = '127.0.0.1:2375'
        // Nomad places the Jenkins workspace under the task's alloc dir.
        WORKSPACE_PATH = "/var/lib/nomad/alloc/${NOMAD_ALLOC_ID}/${NOMAD_TASK_NAME}${WORKSPACE}"
    }
    stages {
        stage('Info') {
            steps {
                sh script: 'hostname'
                echo "WORKSPACE_PATH: $WORKSPACE_PATH"
            }
        }
        stage('Build') {
            steps {
                echo "No build required"
            }
        }
        stage('Test') {
            steps {
                echo "Test done during merge request"
                //sh script: 'cd build; ./test_changed.sh "${WORKSPACE_PATH}"'
            }
        }
        stage('Deploy') {
            steps {
                script {
                    if ("$GIT_BRANCH" == "origin/master"){
                        echo "Running publish script"
                        sh script: './publish.sh'
                        echo "Triggering Rundeck job"
                        script {
                            step([$class: 'RundeckNotifier', includeRundeckLogs: true, jobId: 'c5323400-0d97-4488-8cf2-1d736a5f7fb9', nodeFilters: '', options: '', rundeckInstance: 'team-rundeck -- techops', shouldFailTheBuild: true, shouldWaitForRundeckJob: true, tags: '', tailLog: false])
                        }
                    }
                    else {
                        echo "No deploy step required."
                    }
                }
            }
        }
    }
    post {
        success {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test passed, update commit with green checkbox
            }
            // Notify Eng Viz of successful build
            // slackSend color: 'good', message: "Passed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
        failure {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test failed, update commit status with red x
                error("Build failed, check ${BUILD_URL} for details.")
            }
            // On failure send an email to Eng Vis.
            // BUGFIX: Groovy only interpolates ${...} inside double-quoted
            // strings; the single-quoted originals mailed the literal text
            // '${BUILD_URL}' / '${JOB_NAME}'. Also "or details" -> "for details".
            mail body: "Please check ${BUILD_URL} for details.",
                subject: "Jenkins job ${JOB_NAME} build #${BUILD_NUMBER} failed",
                from: 'Jenkins',
                to: 'eng-visibility@qualtrics.com'
            // Finally send a warning message to Eng Vis slack channel
            slackSend color: 'warn', message: "Failed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
    }
}

View File

@@ -0,0 +1,58 @@
#!/usr/bin/env groovy
// Merge-request pipeline: builds the AOM container, runs it once in TEST
// mode, then throws the image away. No deploy happens from an MR.
pipeline {
    agent {label 'nomad-builder'}
    environment {
        DOCKER_HOST = '127.0.0.1:2375'
        // Nomad places the Jenkins workspace under the task's alloc dir.
        WORKSPACE_PATH = "/var/lib/nomad/alloc/${NOMAD_ALLOC_ID}/${NOMAD_TASK_NAME}${WORKSPACE}"
    }
    stages {
        stage('Info') {
            steps {
                sh script: 'hostname'
                echo "WORKSPACE_PATH: $WORKSPACE_PATH"
            }
        }
        stage('Build') {
            steps {
                echo "Building AOM container"
                sh script: 'docker build . -t aom_test_container'
            }
        }
        stage('Test') {
            steps {
                echo "Launching container on test mode. It will take a few minutes."
                sh script: 'docker run -e TEST=true -h $(hostname) --add-host=\"telegraf:$(nslookup jenkins.eng.qops.net|grep Server | awk \'{print $2}\')\" aom_test_container'
                echo "Removing docker image and container"
                sh script: 'docker rmi -f aom_test_container'
            }
        }
        stage('Deploy') {
            steps {
                echo "No deploy step required for Merge Request"
            }
        }
    }
    post {
        success {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test passed, update commit with green checkbox
            }
            // Notify Eng Viz of successful build
            // slackSend color: 'good', message: "Passed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
        failure {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test failed, update commit status with red x
                error("Build failed, check ${BUILD_URL} for details.")
            }
            // On failure send an email to Eng Vis.
            // BUGFIX: Groovy only interpolates ${...} inside double-quoted
            // strings; the single-quoted originals mailed the literal text
            // '${BUILD_URL}' / '${JOB_NAME}'. Also "or details" -> "for details".
            mail body: "Please check ${BUILD_URL} for details.",
                subject: "Jenkins job ${JOB_NAME} build #${BUILD_NUMBER} failed",
                from: 'Jenkins',
                to: 'eng-visibility@qualtrics.com'
            // Finally send a warning message to Eng Vis slack channel
            // slackSend color: 'warn', message: "Failed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
    }
}

View File

@@ -0,0 +1,8 @@
# IMPORTANT NOTICE:
Alert configurations have been moved to
[AlertOnMetrics](https://gitlab-app.eng.qops.net/engvis/AlertOnMetricsConfigs).
This will allow more flexibility for the project. Merge requests will
be automatically validated, merged, and deployed if they pass the
validation stage.

View File

@@ -0,0 +1,240 @@
#!/usr/bin/python3
""" Alert On Metrics Project"""
import logging
import multiprocessing
import json
import base64
import os
import subprocess
from time import time, sleep
import requests
import yaml
import traceback
from sanic import Sanic, response
from library.args import get_service_args
from serviceapp import service
from library.config import glob_the_configs
from library.logger import AlertLogging
# Module-wide logger: console INFO stream, WARNING+ log file, and a
# DEBUG-only console stream.
LOG = AlertLogging('aom')
LOG.start()
LOG.start_log_file("logs/aom_service.log")
LOG.start_debug()
APP = Sanic()
# Cross-process shared counters, visible to the background service job.
SERVICE_JOB = multiprocessing.Value('i', 0)  # non-zero while the background job is running
NUM_JOBS = multiprocessing.Value('i', 0)     # number of live per-alert processes
LEADERSHIP = multiprocessing.Value('i', 0)
# Leader-election state. Only LEADER_OVERRIDE/HOSTNAME/CONSUL_URL are set in
# __main__ today; the rest belongs to the commented-out leader() flow below.
LEADER_STATUS = None
LEADER_TIME = None
CONSUL_URL = None
LEADER_OVERRIDE = None
HOSTNAME = None
SERVICE_CONFIG = None
# move to library
def dict_compare(d1, d2):
    """Diff two dicts by key.

    Returns a tuple ``(added, removed, modified)`` where *added* holds keys
    present only in ``d1``, *removed* holds keys present only in ``d2``, and
    *modified* holds shared keys whose values differ.
    """
    left, right = set(d1), set(d2)
    changed = {key for key in left & right if d1[key] != d2[key]}
    return left - right, right - left, changed
@APP.route("/")
async def index(_):
"""
Return total number of jobs
"""
global NUM_JOBS
return response.json({"job_count": NUM_JOBS.value})
@APP.route('/healthcheck')
async def health(request):
    """
    Flask healthcheck so that consul and friends work, see this as a service
    Returns:
        json object of status: ok
    """
    LOG.debug("healthcheck")
    # NOTE: a fresh Process object is constructed on every healthcheck hit,
    # but it is only started when no background job is marked as running.
    service_process = multiprocessing.Process(target=start_service, \
        args=(LOG, SERVICE_CONFIG['alert_reload_interval']), \
        name="service", daemon=False)
    # TRY TO START SERVICE, IF LEADER AND NOT RUNNING
    if SERVICE_JOB.value == 0:
        LOG.info("Starting alerts background job")
        # Flag is bumped before start() so overlapping healthcheck requests
        # do not double-start the job.
        SERVICE_JOB.value += 1
        service_process.start()#start_service(log)
    return response.json({"status": "ok"}, 200)
# @APP.route("/override")
# async def override(request):
# """
# Sets the LEADER_OVERRIDE global parameter to force an override
# """
# global LEADER_OVERRIDE
# if request.args.get('enable') == 'true':
# LEADER_OVERRIDE = True
# elif request.args.get('enable') == 'false':
# LEADER_OVERRIDE = False
# return response.json({"override": LEADER_OVERRIDE})
# def leader():
# """
# Needs to be implemented that goes out to consul and checks if node is leader,
# or if there is no leader volunteers itself.
# Returns:
# bool of True or False.... once the logic gets worked out
# """
# global LEADER_STATUS, LEADER_TIME
# # CHECK IF THERE IS AN ARGUMENT FOR OVERRIDING THE CHECK LEADER
# if LEADER_OVERRIDE is True:
# return True
# # CHECK IF LEADER_TIME IS SET AND THAT IT'S LESS THAN 30 SECONDS FROM LAST SET
# if LEADER_TIME is None or time() - LEADER_TIME > 60:
# LOG.info("Cache has expired or was not set")
# box_hostname = os.environ['HOSTNAME'] if HOSTNAME is None else HOSTNAME
# LOG.info("Getting Leader Election status")
# # RIGHT NOW IN THE CONFIG THIS IS HARD SET TO THE CONSUL1-APP.ENG.OPS.NET
# try:
# r = requests.get(CONSUL_URL, timeout=60)
# assert r.status_code == 200, "Failed to get back a 200 from consul."
# LOG.info("Verify that the Value is {}".format(box_hostname))
# # THE VALUE BACK IS A BASE64 ENCODED BYTE, THAT NEEDS TO BE DECODED,
# # TURNED TO A STRING, THEN TO A DICT
# value = json.loads(base64.b64decode(r.json()[0]['Value']).decode('utf-8'))
# # CACHE THE VALUE AND TIMESTAMP
# if value['HostName'] == box_hostname:
# LEADER_STATUS = True
# LEADER_TIME = time()
# return True
# else:
# return False
# except TimeoutError:
# LOG.error("Timed out connecting to Consul")
# return LEADER_STATUS
# else:
# return LEADER_STATUS
def start_service(log, reload_interval):
    """
    Background job: reloads alert configs in a loop and keeps one worker
    process alive per alert config owned by this host.

    Args:
        log: logger shared with the web app
        reload_interval: seconds to sleep between config reloads
    Returns:
        False when the job exits (shutdown / lost-leadership path).
    """
    jobs = []
    alert_list = []
    alert_hash = {}
    box_hostname = os.environ['HOSTNAME'] if HOSTNAME is None else HOSTNAME
    # Presence of any TEST env var flips the workers into non-production mode.
    production_mode = not "TEST" in os.environ
    # WAIT FOR LEADER ELECTION TO PASS
    # while not leader():
    #     return False
    # # GLOB ALL THE CONFIG FILES TO BUILD POOL OF ALERTS
    log.info("Waiting 15s for Consul service to pass")
    #sleep(15)
    while True:
        try:
            alert_list = glob_the_configs(SERVICE_CONFIG['alert_folder'],
                                          SERVICE_CONFIG['alert_routing_lookup'],
                                          'http://consul.service.consul:8500',
                                          box_hostname, log)
        except Exception:
            log.error("Failed to load config files: {}".format(traceback.format_exc()))
        log.info("Found {} alerts".format(len(alert_list)))
        # De-duplicate by alert id; the first config seen wins.
        new_alert_hash = {}
        for alert_config in alert_list:
            if alert_config['id'] in new_alert_hash:
                log.info("Duplicate alert id found: {}. "
                         "Ignoring one of them.".format(alert_config['id']))
            else:
                new_alert_hash[alert_config['id']] = alert_config
        added, removed, modified = dict_compare(new_alert_hash, alert_hash)
        log.info("Added alerts {}".format(added))
        log.info("Removed alerts {}".format(removed))
        log.info("Modified alerts {}".format(modified))
        # PROCESSES TO KILL: anything removed, plus modified ones (which are
        # restarted below with the new config).
        for al_config in removed.union(modified):
            position = None
            for i, job in enumerate(jobs):
                if job.name == al_config and job.is_alive():
                    position = i
                    break
            if position is None:
                # BUGFIX: previously jobs[None] raised TypeError when no live
                # process matched the config; skip instead.
                log.info("No running process found for config: {}".format(al_config))
                continue
            log.info("Stopping config: {}".format(jobs[position].name))
            # kill -9 instead of job.terminate(): terminate() stops the server.
            subprocess.call(["/bin/kill", "-9", "{}".format(jobs[position].pid)])
            jobs[position].join()
            NUM_JOBS.value -= 1
            log.info("Process stopped succesfully")
            jobs.pop(position)
        # PROCESSES TO START, ordered by resolved-dependency count so alerts
        # with fewer dependencies start first.
        # BUGFIX: the sorted list was previously computed and then ignored —
        # the loop iterated the unsorted set.
        alert_configurations = sorted(
            added.union(modified),
            key=lambda x: len(new_alert_hash[x].get('resolvedDependencies').getDependencies()))
        for al_config in alert_configurations:
            if new_alert_hash[al_config].get('query_type') == 'prometheus':
                p = multiprocessing.Process(target=service.check_prometheus_alert,
                                            args=(new_alert_hash[al_config], SERVICE_CONFIG, log, production_mode),
                                            name=al_config, daemon=True)
            else:
                p = multiprocessing.Process(target=service.check_kairosdb_alert,
                                            args=(new_alert_hash[al_config], SERVICE_CONFIG, log, production_mode),
                                            name=al_config, daemon=True)
            jobs.append(p)
            log.info("Starting new config: {}".format(p.name))
            p.start()
            NUM_JOBS.value += 1
        # store current list
        alert_hash = new_alert_hash.copy()
        log.info("Total number of jobs: {}".format(NUM_JOBS.value))
        service.send_stat('total_jobs', NUM_JOBS.value, dict(), statprefix='aom')
        if added:
            service.send_stat('new_jobs', len(added), dict(), statprefix='aom')
        if modified:
            service.send_stat('modified_jobs', len(modified), dict(), statprefix='aom')
        if removed:
            service.send_stat('removed_jobs', len(removed), dict(), statprefix='aom')
        sleep(reload_interval)
    # Unreachable today (the loop never breaks); kept for the planned
    # leader-election flow.
    log.info("No longer leader. Exiting alerts background job")
    for job in jobs:
        # job.terminate() causes the server to stop
        subprocess.call(["/bin/kill", "-9", "{}".format(job.pid)])
        NUM_JOBS.value -= 1
    SERVICE_JOB.value = 0
    return False
if __name__ == "__main__":
# GET ARGS AND START LOGGING
ARGS = get_service_args()
logging.setLoggerClass(AlertLogging)
LOG.info("Starting Service")
# GET SERVICE CONFIG
LEADER_OVERRIDE = ARGS['override']
HOSTNAME = ARGS['hostname']
SERVICE_CONFIG = yaml.safe_load(open('service.yaml', 'r').read())
if ARGS['alert_configs'] is not None:
SERVICE_CONFIG['alert_folder'] = ARGS['alert_configs']
if ARGS['alert_routing_lookup'] is not None:
SERVICE_CONFIG['alert_routing_lookup'] = ARGS['alert_routing_lookup']
# SET CONSUL URL FOR LEADER CHECK
CONSUL_URL = SERVICE_CONFIG['consul_url']
# START THE MAIN SERVICE
APP.run(host="0.0.0.0", port=ARGS['port'])

View File

@@ -0,0 +1,121 @@
import json
import time
import requests
import yaml
# Load the service config to locate the KairosDB endpoint.
# BUGFIX: yaml.load() without an explicit Loader is unsafe and deprecated;
# use safe_load, and close the file handle via a context manager.
with open('service.yaml', 'r') as _service_file:
    service_config = yaml.safe_load(_service_file.read())
kairos_url = service_config['kairosdb_url'] + "api/v1/datapoints/"
kairos_query = kairos_url + "query"
# Datapoints posted to KairosDB; filled in by main().
metrics_list = []
# Alert states whose emitted counters are verified at the end of the run.
status1 = "RECOVERY"
status2 = "WARNING"
status3 = "CRITICAL"
# JSON envelope wrapped around the datapoint list posted to KairosDB.
json_string1 = """{"name": "aom_test_metric","datapoints": """
json_string2 = ""","tags": {"host": "aom_host","data_center": "AOM"},"ttl": 500}"""
# WRITE ALERT CONFIG FILE
alert_file = {'alerts': {'sensu': {'slack': 'aom_test_channel'}},
              'critical_lower_threshold': 100,
              'critical_upper_threshold': 5000,
              'id': 'test_metric',
              'interval': 30,
              'occurrences_threshold': 1,
              'query': {'cache_time': 0,
                        'end_relative': {'unit': 'seconds', 'value': '30'},
                        'metrics': [{'name': 'aom_test_metric', 'tags': {}}],
                        'start_relative': {'unit': 'seconds', 'value': '60'}},
              'tags': {},
              'url': 'AOM_TESTING',
              'warning_lower_threshold': 1000,
              'warning_upper_threshold': 2000}
# Query template: <query_intro><STATUS><query_outro> sums the
# telegraf.aom_<STATUS>_value counter emitted for the test alert.
query_intro = """{
    "metrics": [
        {
            "tags": {
                "alert": [
                    "test_metric"
                ]
            },
            "name": "telegraf.aom_"""
query_outro = """_value",
            "aggregators": [
                {
                    "name": "sum",
                    "align_sampling": true,
                    "sampling": {
                        "value": "9",
                        "unit": "minutes"
                    },
                    "align_start_time": false
                }
            ]
        }
    ],
    "cache_time": 0,
    "start_relative": {
        "value": "8",
        "unit": "minutes"
    }
}"""
def main():
    """End-to-end smoke test: write a test alert config, push known
    datapoints into KairosDB, wait for AOM to evaluate them, then check the
    RECOVERY/WARNING/CRITICAL counters it emitted.

    Returns:
        True on success, False on any failure.
    """
    # noinspection PyBroadException
    try:
        with open('alert_configs/test.yaml', 'w') as yaml_file:
            yaml.dump(alert_file, yaml_file, default_flow_style=False)
    except Exception:
        print("Error writing alert config file")
        return False
    # Seven datapoints ~32s apart, chosen to cross the warning (1000/2000)
    # and critical (100/5000) thresholds in a known sequence.
    now = int(time.time() * 1000)
    for value in (1501, 202, 23, 1504, 2005, 5006, 1507):
        metrics_list.append([now, value])
        now += 32000
    full_string = json_string1 + str(metrics_list) + json_string2
    try:
        ret = requests.post(kairos_url, data=json.dumps(json.loads(full_string)), timeout=200)
        assert ret.status_code == 204, "Wrong status code received from KairosDB"
    except AssertionError as e:
        print("Error: {}".format(str(e)))
        # BUGFIX: previously fell through and kept running after a failed write.
        return False
    except Exception as e:
        print("Problem talking to KairosDB: {}".format(str(e)))
        return False
    print("Metrics sent to KairosDB. Check alerts in the #aom_test_channel in Slack")
    # Give AOM time to evaluate the alert several times.
    time.sleep(360)
    try:
        ret = requests.post(kairos_query, data=json.dumps(json.loads(query_intro + status1 + query_outro)), timeout=200)
        print("Recovery {}".format(dict(ret.json())['queries'][0]['results'][0]['values'][0][1]))
        assert dict(ret.json())['queries'][0]['results'][0]['values'][0][1] == 2, "Wrong RECOVERY result"
        ret = requests.post(kairos_query, data=json.dumps(json.loads(query_intro + status2 + query_outro)), timeout=200)
        print("Warning {}".format(dict(ret.json())['queries'][0]['results'][0]['values'][0][1]))
        assert dict(ret.json())['queries'][0]['results'][0]['values'][0][1] == 2, "Wrong WARNING result"
        ret = requests.post(kairos_query, data=json.dumps(json.loads(query_intro + status3 + query_outro)), timeout=200)
        print("Critical {}".format(dict(ret.json())['queries'][0]['results'][0]['values'][0][1]))
        assert dict(ret.json())['queries'][0]['results'][0]['values'][0][1] == 4, "Wrong CRITICAL result"
    except AssertionError as e:
        print("Error: {}".format(str(e)))
        # BUGFIX: previously returned True even when the results were wrong.
        return False
    except Exception as e:
        print("Problem getting results from KairosDB: {}".format(str(e)))
        return False
    return True


if __name__ == '__main__':
    main()

View File

View File

@@ -0,0 +1,163 @@
# Contains the arg parser options.
"""Contains the arg parser options."""
import argparse
import sys
def get_builder_args():
    """
    Gets the arguments passed in to the aom_builder main call.

    :return: dict of parsed arguments (via args_to_dict)
    """
    parser = argparse.ArgumentParser(
        description="Generates a valid yaml file "
        "for alerting on metrics. If you are "
        "familiar with the yaml structure for an "
        "alert you don't have to use this builder,"
        " it's just convenient")
    # BUGFIX: several help strings below had dropped/garbled words
    # ("Kariosdb", "the check will This value", "cause an depending",
    # "logicUse" from a missing separator).
    parser.add_argument('-q', '--query', help="The Kairosdb query string to "
                        "use")
    parser.add_argument(
        '-i', '--interval', type=int, default=60, help="The "
        "interval that the check will run on. This value is in seconds")
    parser.add_argument('-t', '--threshold', '--upperthreshold', help="The "
                        "upper threshold is the value that when reached will "
                        "cause an alert depending on the threshold logic. "
                        "Use in conjunction with lower threshold to define a "
                        "normal band.")
    parser.add_argument(
        '-b',
        '--lowerthreshold',
        help="The lower threshold is the value that when reached will cause an "
        "alert depending on the threshold logic. "
        "Use in conjunction with upper threshold to define a normal band.")
    parser.add_argument(
        '-m',
        '--measure',
        choices=[
            'gt',
            'lt',
            'eq'],
        help="The measure to use to compare the "
        "threshold to the values of the alerts")
    parser.add_argument(
        '-a',
        '--alert_config',
        help='A valid Yaml representation of your alerting block')
    parser.add_argument(
        '-l',
        '--log_level',
        type=int,
        default=0,
        help="The log level for the aom_builder run. "
        "[0=Error, 1=Info, 2=Debug]")
    parser.add_argument(
        '-p',
        '--port',
        type=int,
        default=8080,
        help="The port to run the webapp on")
    return args_to_dict(parser)
def get_tester_service_args():
    """
    Builds the argument parser for aom_tester.py and returns the parsed
    arguments as a dict (via args_to_dict).
    """
    parser = argparse.ArgumentParser(
        description="Parameters to start the alerting on metrics dummy tester "
                    "service")
    parser.add_argument(
        '-l', '--log_level', type=int, default=0,
        help="The log level for the aom_service app"
             "[0=Error, 1=Info, 2=Debug]")
    parser.add_argument(
        '-a', '--alert_configs', default=None,
        help="If provided will override the folder location read from the "
             "config with the value passed in. Is helpful for testing and "
             "troubleshooting alerts")
    parser.add_argument(
        '--hostname',
        help="If provided, will override the actual hostname check with this "
             "value")
    parser.add_argument(
        '-p', '--port', type=int, default=8080,
        help="The port to run the webapp on")
    return args_to_dict(parser)
def get_service_args():
    """
    Builds the argument parser for aom_service.py and returns the parsed
    arguments as a dict (via args_to_dict).
    """
    parser = argparse.ArgumentParser(
        description="Parameters to start the alerting on metrics service")
    # (flags, keyword arguments) for every supported option, in help order.
    option_table = [
        (('-l', '--log_level'),
         dict(type=int, default=0,
              help="The log level for the aom_service app"
                   "[0=Error, 1=Info, 2=Debug]")),
        (('-a', '--alert_configs'),
         dict(default=None,
              help="If provided will override the folder location read from the "
                   "config with the value passed in. Is helpful for testing and "
                   "troubleshooting alerts")),
        (('--alert_routing_lookup',),
         dict(default=None,
              help="If provided will override the folder used to fetch the alerts "
                   "lookup configuration.")),
        (('-o', '--override'),
         dict(action='store_true',
              help="Overrides the check leader election value")),
        (('--hostname',),
         dict(help="If provided, will override the actual hostname check with this "
                   "value")),
        (('-p', '--port'),
         dict(type=int, default=8080,
              help="The port to run the webapp on")),
    ]
    for flags, kwargs in option_table:
        parser.add_argument(*flags, **kwargs)
    return args_to_dict(parser)
def args_to_dict(parsed_args):
    """
    Converts the argument parser object to a dict.

    Args:
        parsed_args: an argparse.ArgumentParser ready to parse sys.argv
    Returns:
        Dictionary mapping each argument name to its parsed value.
    """
    try:
        namespace = parsed_args.parse_args()
    except argparse.ArgumentError:
        # On a parser error show the usage text and bail out.
        parsed_args.print_help()
        sys.exit(1)
    # RETURN A DICT OF ARGUMENTS
    return {name: getattr(namespace, name) for name in vars(namespace)}

View File

@@ -0,0 +1,277 @@
# config.py
"""Functions for loading alert configuration files"""
import glob
import os
import json
import hashlib
import yaml
import requests
import traceback
# import logging
# logger = logging.getLogger(__name__)
DEPENDENCIES_KEY = 'dependencies'
class AlertWithDependencies:
    """Book-keeping node for one alert and its (transitively resolved)
    dependency ids, used while walking the dependency graph."""

    def __init__(self, alertId, dependencies):
        self.alertId = alertId
        # Visit marker used by the dependency-graph walk.
        self.beenProcessed = False
        self.dependencies = [] if dependencies is None else list(dependencies)

    def addAllDependencies(self, moreDependencies):
        """Append every id from *moreDependencies*; None is a no-op."""
        if moreDependencies is not None:
            self.dependencies += moreDependencies

    def getDependencies(self):
        return self.dependencies

    def getAlertId(self):
        return self.alertId

    def hasBeenProcessed(self):
        return self.beenProcessed

    def visit(self):
        """Mark this alert as processed by the graph walk."""
        self.beenProcessed = True
def md5(fname):
    """Return the hex MD5 digest of the file at *fname*, read in 4 KiB chunks."""
    digest = hashlib.md5()
    with open(fname, "rb") as handle:
        chunk = handle.read(4096)
        while chunk:
            digest.update(chunk)
            chunk = handle.read(4096)
    return digest.hexdigest()
def get_healthy_nodes_and_index(consul_url, hostname, logger):
    """Find AOM healthy nodes on consul.

    Queries the consul catalog for every node registered for the
    alert-on-metrics service, keeps those whose AOM healthcheck is passing,
    and returns (index of *hostname* in the sorted healthy list, healthy
    count). The index is -1 when this host is unhealthy or consul times out.
    """
    # BUGFIX: initialise before the try block — previously a consul timeout
    # before these assignments made the return statement raise NameError.
    host_index = -1
    healthy_nodes = []
    try:
        # getting all registered nodes from consul
        r = requests.get(
            consul_url +
            '/v1/catalog/service/alert-on-metrics',
            timeout=60)
        assert r.status_code == 200, "Failed to get back a 200 from consul catalog"
        node_list = [elem.get('Node') for elem in json.loads(r.text)]
        # Retrieving healthy nodes
        for node in node_list:
            r2 = requests.get(
                consul_url +
                '/v1/health/node/' +
                node,
                timeout=60)
            # BUGFIX: previously re-asserted on the catalog response (r)
            # instead of the health response (r2).
            assert r2.status_code == 200, "Failed to get back a 200 from consul health"
            for check in json.loads(r2.text):
                if (check.get('CheckID') == 'check_healthcheck_alert-on-metrics_alert-on-metrics' and
                        check.get('Status') == 'passing'):
                    healthy_nodes.append(node)
        try:
            healthy_nodes.sort()
            host_index = healthy_nodes.index(hostname)
        except ValueError:
            logger.error("Host is not healthy")
    except TimeoutError:
        # NOTE(review): requests raises requests.exceptions.Timeout, which is
        # not a TimeoutError subclass — confirm this handler ever fires.
        logger.error("Timed out connecting to Consul")
    return host_index, len(healthy_nodes)
def distribute_configs(filename, host_index, module, logger):
    """Uses md5 of alert config to split the files among healthy servers.

    The file's md5 (mod the number of healthy nodes) picks a stable owner;
    returns True when that owner is this host.
    """
    if module == 0:
        logger.error("No healthy nodes for the service")
        return False
    if host_index == -1:
        logger.error("Host is unhealthy")
        return False
    # Same file always hashes to the same healthy node.
    return int(md5(filename), 16) % module == host_index
def is_valid(alert_config, logger):
    """Checks if alert has all required fields"""
    # Validation is assertion-driven: the first failing assert aborts the
    # check and its message is logged below. Any other exception (e.g.
    # KeyError from a missing field) also marks the config invalid.
    try:
        assert alert_config['alerts'], "No Alerts configured, this is a dead config"
        assert alert_config['query'], "No Query, this is a dead config"
        #assert alert_config['interval'] >= 30, "Intervals less than 30 are invalid"
        assert alert_config['id'], "Alert ID is empty, this is a dead config"
        if DEPENDENCIES_KEY in alert_config:
            assert isinstance(alert_config[DEPENDENCIES_KEY], list), "Dependencies is specified but isn't a list"
        if alert_config.get('query_type') == 'prometheus':
            # Prometheus queries are plain query strings.
            assert isinstance(
                alert_config['query'], str), "Invalid Prometheus query"
        else:
            # KairosDB queries are JSON-like dicts.
            assert isinstance(
                alert_config['query'], dict), "Kairosdb Query string cannot be validated as proper JSON"
            defined_tags = set(alert_config['query']['metrics'][0]['tags'].keys()).union(
                {'', 'dc', 'fqdn'})
            # IF THERE IS AGGREGATION WE HAVE TO ADD THESE TAGS
            if 'group_by' in alert_config['query']['metrics'][0]:
                defined_tags.update(
                    set(alert_config['query']['metrics'][0]['group_by'][0]['tags']))
            # NOTE(review): defined_tags is only consumed by the commented-out
            # warning below; it is currently computed and discarded.
            # for undefined_tag in set(alert_config['tags']).difference(defined_tags):
            #     print("WARNING! {} tag is not defined on the query. Please make sure it does exist to "\
            #         "prevent empty results".format(undefined_tag))
        # OUR MINIMUM THRESHOLD NEED
        assert 'critical_lower_threshold' in alert_config or 'critical_upper_threshold' in alert_config or \
            'warning_lower_threshold' in alert_config or 'warning_upper_threshold' in alert_config, \
            "Config must have at least one threshold set."
        # JUST MAKE SURE YOU ARE NOT DOING SOMETHING STUPID WITH WARNING COMING
        # AFTER CRITICAL
        if 'warning_lower_threshold' in alert_config and 'critical_lower_threshold' in alert_config:
            assert alert_config['critical_lower_threshold'] < alert_config['warning_lower_threshold'], \
                "Lower Critical must be less than Lower Warning"
        if 'warning_upper_threshold' in alert_config and 'critical_upper_threshold' in alert_config:
            assert alert_config['critical_upper_threshold'] > alert_config['warning_upper_threshold'], \
                "Upper Critical must be greater than Upper Warning"
        # Routing-lookup blocks need a default route, a lookup source
        # (inline or file) and string tag names.
        if 'lookup' in alert_config['alerts']:
            assert 'default' in alert_config['alerts']['lookup'], 'No default alert configured for the lookup configuration'
            assert 'lookup_file' in alert_config['alerts']['lookup'] or 'lookups' in alert_config['alerts'][
                'lookup'], 'No lookup configured either in the alert configuration or in a separated file'
            assert 'tags' in alert_config['alerts']['lookup'], 'No tags configured for the lookup configuration'
            assert all(
                isinstance(
                    tag, str) for tag in alert_config['alerts']['lookup']['tags']), 'Tags must be valid string'
        # if 'occurrences_threshold' in alert_config:
        #     assert alert_config['occurrences_threshold'] >= 1, \
        #         "Having an occurrences value less than 2 is assumed and pointless to specify"
    except Exception as e:
        logger.warning("Invalid config file: {}".format(str(e)))
        return False
    return True
def is_valid_alert_routing_lookup(alert_routing_lookup, alert, logger):
    """Check if routing lookup is properly configured.

    Every entry must carry an 'alert' block and a 'tags' mapping whose keys
    are string tags declared in the alert's own lookup configuration.
    Returns True when all entries validate; logs a warning and returns False
    otherwise.
    """
    try:
        assert alert_routing_lookup, "No lookup values configured, the configuration is empty."
        declared_tags = None
        for entry in alert_routing_lookup:
            assert 'alert' in entry, "No alert defined for this configuration."
            assert 'tags' in entry, "No tags value defined for this configuration."
            if declared_tags is None:
                declared_tags = alert['alerts']['lookup']['tags']
            for tag in entry['tags']:
                assert tag in declared_tags, "The tag {} is not part of the configuration".format(
                    tag)
            assert all(isinstance(tag, str)
                       for tag in entry['tags']), "Tags must be valid string"
    except AssertionError as err:
        logger.warning("Invalid alert routing config file: {}".format(str(err)))
        return False
    return True
# noinspection PyBroadException
def glob_the_configs(
        config_path,
        lookup_config_path,
        consul_url,
        hostname,
        logger):
    """
    Load, validate and distribute every alert config under *config_path*.

    Args:
        config_path (string): relative path to the configs
        lookup_config_path (string): folder holding routing-lookup files
        consul_url (string): url to consul service
        hostname (string): this host's name, used for config distribution
        logger:
    Returns:
        List of configs owned by this host, each annotated with
        'alert_routing_lookup' (when a lookup is configured) and
        'resolvedDependencies' (an AlertWithDependencies node).
    """
    invalid_configs = 0
    alert_list = []
    # Work out which slice of the configs this host owns.
    host_index, module = get_healthy_nodes_and_index(
        consul_url, hostname, logger)
    alertToAlertWithDependencies = {}
    for config_file in glob.glob(config_path + "/**/*.yaml", recursive=True):
        logger.debug("Found {} config".format(config_file))
        # LOAD CONFIG
        if distribute_configs(
                config_file,
                host_index,
                module,
                logger):
            try:
                alert = yaml.safe_load(open(config_file, 'rb').read())
                if is_valid(alert, logger):
                    if 'lookup' in alert['alerts']:
                        alert_routing_lookup = []
                        is_valid_lookup = True
                        if 'lookup_file' in alert['alerts']['lookup']:
                            # Routing table lives in a separate file under
                            # lookup_config_path.
                            lookup_path = "{}/{}".format(
                                lookup_config_path, alert['alerts']['lookup']['lookup_file'])
                            if os.path.isfile(lookup_path):
                                alert_routing_lookup = yaml.safe_load(
                                    open(lookup_path, 'rb').read())
                            else:
                                is_valid_lookup = False
                        else:
                            # Routing table is inlined in the alert config.
                            alert_routing_lookup = alert['alerts']['lookup']['lookups']
                        is_valid_lookup = is_valid_lookup and is_valid_alert_routing_lookup(
                            alert_routing_lookup, alert, logger)
                        if is_valid_lookup:
                            # Index routing entries by their tag-value tuple
                            # (in declared tag order) for direct lookup later.
                            alerts_per_tags = {}
                            for alert_configuration in alert_routing_lookup:
                                key = []
                                for tag in alert['alerts']['lookup']['tags']:
                                    key.append(
                                        alert_configuration['tags'].get(tag))
                                alerts_per_tags[tuple(
                                    key)] = alert_configuration['alert']
                            alert['alert_routing_lookup'] = alerts_per_tags
                        else:
                            invalid_configs += 1
                            continue
                    alertWithDependencies = AlertWithDependencies(alert['id'], alert[DEPENDENCIES_KEY] if DEPENDENCIES_KEY in alert else None)
                    alertToAlertWithDependencies[alert['id']] = alertWithDependencies
                    alert['resolvedDependencies'] = alertWithDependencies
                    alert_list.append(alert)
                else:
                    invalid_configs += 1
            except BaseException:
                logger.error("Error parsing {} config: {}".format(config_file, traceback.format_exc()))
    # validate the dependencies and flesh out the dependency graphs
    logger.debug("Iterating over dependencies")
    for alertId, alertWithDependencies in alertToAlertWithDependencies.items():
        validateDependencies(alertId, alertWithDependencies, alertToAlertWithDependencies, logger)
    logger.info("Invalid configs: {}".format(invalid_configs))
    # NOTE(review): imported locally — presumably to avoid a circular import
    # with serviceapp.service; confirm before moving to the top of the file.
    from serviceapp import service
    service.send_stat(
        'invalid_configs',
        invalid_configs,
        dict(),
        statprefix='aom')
    logger.info("Loaded {} configs".format(len(alert_list)))
    return alert_list
def validateDependencies(alertId, alertWithDependencies, allAlerts, logger):
    """
    Recursively resolve *alertId*'s dependency ids in place.

    Unknown dependency ids are logged and skipped; known ones are resolved
    depth-first and their transitive dependencies are appended to this
    alert's list. Returns the (possibly extended) dependency list, or None
    when the alert has no dependencies.
    """
    if len(alertWithDependencies.getDependencies()) > 0:
        if not alertWithDependencies.hasBeenProcessed():
            # Mark before recursing so dependency cycles terminate.
            alertWithDependencies.visit()
            # Snapshot the list: addAllDependencies below grows the real one
            # while we are still iterating.
            dependencies = list(alertWithDependencies.getDependencies())
            for dependentId in dependencies:
                if dependentId not in allAlerts:
                    logger.info("Invalid dependency of {}: {}".format(alertId, dependentId))
                else:
                    alertWithDependencies.addAllDependencies(validateDependencies(dependentId, allAlerts[dependentId], allAlerts, logger))
        # Already-processed alerts return their cached list so siblings that
        # share a dependency still pick up its transitive closure.
        logger.debug("returning alert {} with dependencies {}".format(alertId, alertWithDependencies.getDependencies()))
        return alertWithDependencies.getDependencies()
    else:
        return None

View File

@@ -0,0 +1,122 @@
# logger.py
""" Logging configuration """
import logging
import logging.handlers
import os
# Quiet noisy third-party loggers: only ERROR and above get through.
logging.getLogger('requests').setLevel(logging.ERROR)
logging.getLogger('urllib3').setLevel(logging.ERROR)
logging.getLogger('werkzeug').setLevel(logging.ERROR)
class SingleLevelFilter(logging.Filter):
    """Logging filter keyed to exactly one level.

    With ``reject=False`` only records at *passlevel* pass the filter; with
    ``reject=True`` every record except those at *passlevel* passes.
    """

    def __init__(self, passlevel, reject):
        """Store the target level and whether to invert the match."""
        self.passlevel = passlevel
        self.reject = reject

    def filter(self, record):
        """Return True when *record* should be emitted by the handler."""
        matches = record.levelno == self.passlevel
        return not matches if self.reject else matches
class AlertLogging(logging.Logger):
    """
    Class Object to handle the logging of the alert on metrics service
    starts at Error level and can flip on (and add) an additional log file and
    Debug logger as needed.
    """
    def __init__(self, name):
        """
        Inits the formaters and logger
        Args:
            name: logger name, passed through to logging.Logger
        """
        self.name = name
        # Verbose format (module:line) used by the debug and file handlers.
        self.debug_formatter = logging.Formatter(
            "%(asctime)s - [%(levelname)s] - [%(module)s:%(lineno)d] - "
            "%(message)s", "%m-%d %H:%M:%S")
        self.standard_formatter = logging.Formatter(
            "%(asctime)s - [%(levelname)s] - %(message)s", "%m-%d %H:%M:%S")
        # NOTE(review): this getLogger() call discards its result —
        # presumably it only forces the root logger to exist; confirm.
        logging.getLogger()
        logging.Logger.__init__(self, name, logging.DEBUG)
        # Side effect: after constructing one AlertLogging, subsequent
        # logging.getLogger() calls also produce AlertLogging instances.
        logging.setLoggerClass(AlertLogging)
    def start(self):
        """
        Attach an INFO-level console handler with the standard format.
        Returns:
            self, so calls can be chained.
        """
        info_handler = logging.StreamHandler()
        info_handler.setLevel(logging.INFO)
        info_handler.setFormatter(self.standard_formatter)
        self.addHandler(info_handler)
        return self
    def start_log_file(self, file_path, mode='a'):
        """
        Creates a separate log file handler
        Args:
            file_path: path to the log file
            mode: the type of mode to open the file handler with
        Returns:
            None
        """
        self.log_path = file_path
        # Create the log directory on demand.
        work_folder = os.path.dirname(file_path)
        if work_folder and not os.path.exists(work_folder):
            os.makedirs(work_folder)
        self.log_handler = logging.FileHandler(file_path, mode)
        # File output only records WARNING and above.
        self.log_handler.setLevel(logging.WARNING)
        self.log_handler.setFormatter(self.debug_formatter)
        self.addHandler(self.log_handler)
    def stop_log_file(self):
        """
        Closes Log file and sets the handler to None
        Returns:
            None
        """
        self.log_handler.close()
        self.removeHandler(self.log_handler)
        self.log_handler = None
    def start_debug(self):
        """
        Attach a console handler that emits only DEBUG-level records
        (filtered via SingleLevelFilter).
        Returns:
            None
        """
        self.debug_handler = logging.StreamHandler()
        self.debug_handler.setLevel(logging.DEBUG)
        self.debug_handler.addFilter(SingleLevelFilter(logging.DEBUG, False))
        self.debug_handler.setFormatter(self.debug_formatter)
        self.addHandler(self.debug_handler)
    def stop_debug(self):
        """
        stop the debugger
        Returns:
            None
        """
        self.removeHandler(self.debug_handler)
        self.debug_handler = None

View File

@@ -0,0 +1,83 @@
from datetime import datetime, timedelta
from urllib.parse import urljoin
import requests
class PromAPI:
    """Minimal client for the Prometheus HTTP API (query / query_range / series)."""

    def __init__(self, endpoint='http://127.0.0.1:9090/'):
        """
        :param endpoint: base address of the Prometheus server
        """
        self.endpoint = endpoint

    @staticmethod
    def _to_timestamp(input_):
        """
        Convert *input_* to an ISO-8601 UTC timestamp string for Prometheus.

        Accepts a datetime, the literal 'now', or a number/numeric string of
        seconds: positive = absolute UNIX time, 0 = now, negative = seconds
        in the past relative to now.

        BUGFIX: datetime and positive-number inputs previously returned a
        float, so the callers' ``+ 'Z'`` concatenation raised TypeError;
        every branch now returns a string.
        """
        if type(input_) == datetime:
            return input_.isoformat('T')
        if input_ == 'now':
            return datetime.utcnow().isoformat('T')
        if type(input_) is str:
            input_ = float(input_)
        if type(input_) in [int, float]:
            if input_ > 0:
                # Absolute UNIX timestamp.
                return datetime.utcfromtimestamp(input_).isoformat('T')
            if input_ == 0:  # return now
                return datetime.utcnow().isoformat('T')
            if input_ < 0:
                # Relative offset into the past.
                return (datetime.utcnow() + timedelta(seconds=input_)).isoformat('T')

    def query(self, query='prometheus_build_info'):
        """Run an instant query and return the decoded JSON response."""
        return self._get(
            uri='/api/v1/query',
            params=dict(
                query=query
            )
        )

    def query_range(self, query='prometheus_build_info', start=-60, end='now', duration=60):
        """Run a range query; start/end accept anything _to_timestamp does,
        duration is the step in seconds."""
        params = {
            'query': query
        }
        if end is not None:
            params['end'] = self._to_timestamp(end) + 'Z'
        if start:
            params['start'] = self._to_timestamp(start) + 'Z'
        if duration:
            params['step'] = duration
        print(params)
        return self._get(
            uri='/api/v1/query_range',
            params=params
        )

    def series(self, match='prometheus_build_info', start=-86400, end='now'):
        """List the series matching *match* over the given time window."""
        params = {
            'match[]': match
        }
        if end is not None:
            params['end'] = self._to_timestamp(end) + 'Z'
        if start:
            params['start'] = self._to_timestamp(start) + 'Z'
        print(params)
        return self._get(
            uri='/api/v1/series',
            params=params
        )

    def _get(self, uri, params, method='GET'):
        """Issue the HTTP GET against the endpoint and decode the JSON body."""
        url = urljoin(self.endpoint, uri)
        assert method == 'GET'
        result = requests.get(
            url=url,
            params=params
        )
        return result.json()

View File

@@ -0,0 +1,47 @@
import unittest
import config
class TestAlertWithDependencies(unittest.TestCase):
    """Builds a small dependency graph (A->C, B->C, C->D, D) and checks that
    config.validateDependencies resolves the expected dependency counts."""

    def test_base(self):
        self.alertToAlertWithDependencies = {}
        self.alert_list = []
        for alert_id, deps in (("A", ["C"]), ("B", ["C"]), ("C", ["D"]), ("D", None)):
            self.make_alert(alert_id, deps)
        self.validate()
        for alert_id, expected in (("A", 2), ("B", 2), ("C", 1), ("D", 0)):
            self.checkDepLen(alert_id, expected)

    def make_alert(self, id, depends):
        """Register an alert dict plus its AlertWithDependencies wrapper."""
        alert = {
            'id': id,
            'dependencies': depends
        }
        wrapper = config.AlertWithDependencies(
            alert['id'], alert.get(config.DEPENDENCIES_KEY))
        self.alertToAlertWithDependencies[alert['id']] = wrapper
        alert['resolvedDependencies'] = wrapper
        self.alert_list.append(alert)

    def validate(self):
        """Run config.validateDependencies over every registered alert."""
        for alert_id, wrapper in self.alertToAlertWithDependencies.items():
            config.validateDependencies(
                alert_id, wrapper, self.alertToAlertWithDependencies, MockLogger())

    def checkDepLen(self, id, n):
        """Assert the resolved dependency count for alert `id`."""
        self.assertEqual(
            len(self.alertToAlertWithDependencies[id].getDependencies()), n)
class MockLogger():
    """No-op logger stand-in: accepts and discards info/debug/error calls."""

    def info(self, *args, **kwargs):
        pass

    def debug(self, *args, **kwargs):
        pass

    def error(self, *args, **kwargs):
        pass
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

View File

@@ -0,0 +1,42 @@
#!/bin/bash
# Build the alert-on-metrics Docker image and push it to the V2 registry,
# tagged with both the current git commit SHA and "latest".

GIT_COMMIT=$(git rev-parse HEAD)
if [[ -z "$GIT_COMMIT" ]]; then
    echo "--Missing required GIT_COMMIT var. Aborting..."
    exit 1
fi

# Setup useful vars
team="engvis"
app="alert-on-metrics-app"
registryV2="registry-app.eng.qops.net:5001"
pathV2="${registryV2}/${team}/${app}"
commitV2="${pathV2}:${GIT_COMMIT}"
latestV2="${pathV2}:latest"

# In case you use relative paths.
# Fix: the original used $BASH_SOURCE[0] unbraced, which expands to
# "$BASH_SOURCE" followed by the literal text "[0]".
DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
cd "$DIR"

echo "--Publishing $app $GIT_COMMIT"
echo "--Removing old image, so they don't accumulate"
# May fail harmlessly when no previous "latest" exists (runs before set -e).
docker rmi "$latestV2"

# Now fail if anything doesn't work
set -e

if [ -f "$app/build.sh" ]
then
    echo "--Running pre build steps"
    "$app/build.sh"
fi

docker build --pull=true --tag="$commitV2" --tag "$latestV2" .

echo "--Publishing app container"
docker push "$commitV2"
docker push "$latestV2"

View File

@@ -0,0 +1,48 @@
# Fetch-or-prompt helper for secrets stored in the macOS keychain.
# _get_and_save_secret NAME echoes the secret value, reading it (in order)
# from the environment, the keychain, or an interactive prompt (which also
# saves it back to the keychain).
function _get_and_save_secret() {
    function is_set() {
        local name="$1"
        eval "echo \$$name" | grep . > /dev/null
    }
    local name="$1"
    eval "$name=\${$name:-}"
    if ! is_set $name; then
        eval "$name=$(security find-generic-password -a $USER -s $name -w 2> /dev/null)"
        if ! is_set "$name"; then
            eval "read -s -p 'Enter $name: ' $name" >&2
            eval "security add-generic-password -a $USER -s $name -w \$$name" >&2
            echo "" >&2
        fi
    fi
    eval "echo \$$name"
}

# Wrapper that discards any prompt noise, keeping only the secret value line.
function get_and_save_secret() {
    _get_and_save_secret "$@" | tail -n 1
}

SENSU_API_USER="$(get_and_save_secret SENSU_API_USER)"
SENSU_API_PASS="$(get_and_save_secret SENSU_API_PASS)"
SLACK_API_TOKEN="$(get_and_save_secret SLACK_API_TOKEN)"

# Fix: the original echoed the secret values themselves to stderr; report
# only that they were loaded, never the values.
echo "SENSU_USER=(loaded)" >&2
echo "SENSU_PASS=(loaded)" >&2
echo "SLACK_TOKEN=(loaded)" >&2

# Refresh alert configs from the AoM_Configs submodule.
git submodule update --remote
rm -rf alert_configs
cp -r AoM_Configs/alert_configs .

# Build and (re)start the dev container, then tail its logs.
docker build -t aom:dev .
docker rm -f aom
docker run \
    -e SLACK_API_TOKEN=${SLACK_API_TOKEN} \
    -e API_USER=$SENSU_API_USER \
    -e API_PASS=$SENSU_API_PASS \
    --rm \
    -d \
    -p 8080:8080 \
    --add-host telegraf:10.4.13.53 \
    --name aom \
    --add-host consul.service.consul:127.0.0.1 \
    -h 127.0.0.1 \
    aom:dev &
until curl localhost:8080/healthcheck; do sleep 1; done
docker logs -f aom

View File

@@ -0,0 +1,14 @@
import os
import logging
from receiver import SlackReceiver
from sender import SlackSender
if __name__ == "__main__":
    # Wire up the Slack bot: a sender that answers commands (posting graphs
    # or text back to Slack) and a receiver that listens on the RTM stream
    # and forwards parsed commands to the sender.
    log = logging.getLogger()
    log.setLevel(logging.DEBUG)
    log.addHandler(logging.StreamHandler())
    # Fails fast with KeyError if the token is not configured in the env.
    slack_token = os.environ["SLACK_API_TOKEN"]
    sender = SlackSender(slack_token, log)
    receiver = SlackReceiver(slack_token, log, sender.respond)
    receiver.start()

View File

@@ -0,0 +1,12 @@
class SlackMessage():
    """Lightweight wrapper that lifts the fields of a Slack RTM payload's
    'data' mapping onto attributes of this object.

    Raises AssertionError (exception type kept for caller compatibility)
    when any of the required fields - text, user, channel - is missing.
    """

    def __init__(self, payload):
        data = payload['data']
        for key in data:
            # Fix: read the mapping entry first. The original tried
            # getattr(data, key) first, which silently bound dict methods
            # (e.g. for a key named "keys") instead of the payload value.
            try:
                setattr(self, key, data[key])
            except Exception:
                setattr(self, key, getattr(data, key))
        attrs = dir(self)
        assert "text" in attrs, "no text in message"
        assert "user" in attrs, "no user in message"
        assert "channel" in attrs, "no channel in message"

View File

@@ -0,0 +1,52 @@
import slack
import os
import re
import ssl as ssl_lib
import certifi
from message import SlackMessage
class SlackReceiver():
    """Listens on the Slack RTM stream, filters out irrelevant events, and
    forwards parsed commands (channel, id, interval, step) to a callback."""

    def __init__(self, token, log, callback):
        self.token = token
        self.log = log
        self.callback = callback
        self.ssl_context = ssl_lib.create_default_context(cafile=certifi.where())

    def start(self):
        """Connect the RTM client and block, dispatching message events."""
        self.rtm_client = slack.RTMClient(token=self.token, ssl=self.ssl_context)

        @slack.RTMClient.run_on(event="message")
        def _on_message(**payload):
            msg = self.parse(payload)
            if msg is not None:
                self.receive(msg)

        print("Starting")
        self.rtm_client.start()

    def parse(self, payload):
        """Turn a raw RTM payload into a SlackMessage, or None to ignore it."""
        self.log.debug("slack message received: {}".format(payload))
        if 'data' in payload:
            data = payload['data']
            if data.get('bot_id') == 'BNYAX72BB':
                # it's the bot's response, ignore it
                return None
            if data.get('user') == 'UNS0QKMMY':
                # it's the bot uploading files, ignore it
                return None
            unpinged = 'text' in data and '<@UNS0QKMMY>' not in data['text']
            not_direct = 'channel' in data and not data['channel'].startswith('DP')
            if unpinged and not_direct:
                # message in a channel and the bot wasn't pinged, or was not a direct message - ignore
                self.log.debug("received message, but I wasn't pinged or DM'ed")
                return None
            if 'text' in data and '<@UNS0QKMMY>' in data['text']:
                # remove the ping text
                data['text'] = re.sub('\\<@UNS0QKMMY\\>', '', data['text'])
        return SlackMessage(payload)

    def receive(self, msg):
        """Split the message text into (id, interval, step) and dispatch."""
        parts = msg.text.split()
        command_id = parts[0] if len(parts) > 0 else None
        interval = parts[1] if len(parts) > 1 else None
        step = parts[2] if len(parts) > 2 else None
        self.callback(msg.channel, command_id, interval, step)

View File

@@ -0,0 +1,131 @@
import os
import sys
import requests
import traceback
import time
import io
import binascii
import logging
import matplotlib.pyplot as plt
import numpy as np
import datetime
import re
rootDirectory = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(rootDirectory)
from library.config import glob_the_configs
from library.prom_api import PromAPI
# NOTE(review): module-level constant; appears unused within this module -
# TODO confirm before removing.
HOSTNAME = None
class SlackSender:
    """Answers Slack commands: looks up an alert config by id, runs its
    Prometheus query, and posts either a rendered graph or the raw query
    results back to the requesting channel."""
    def __init__(self, token, log):
        """
        Args:
            token: Slack API bearer token used for chat.postMessage and
                files.upload.
            log: logger shared with the rest of the bot.
        """
        self.token = token
        self.alertList = []
        self.log = log
        try:
            # Load every alert config known to the service; on failure the
            # bot keeps running with an empty alert list.
            self.alertList = glob_the_configs(rootDirectory, \
                rootDirectory + "/AoM_Configs/alert_routing_lookup", \
                'http://consul.service.consul:8500', '127.0.0.1', log)
        except Exception:
            log.error("Failed to load config files: {}".format(traceback.format_exc()))
    def respond(self, channel, alertId, interval, step):
        """
        Handle one parsed Slack command.

        Args:
            channel: Slack channel id to reply to.
            alertId: id of the alert config whose query should be run.
            interval: optional Go-style lookback (e.g. "5m"); overrides the
                config's start/end window when parseable.
            step: optional sample step, passed through as the query interval.
        """
        self.log.debug("incomming message: channel: {} alert ID: {} interval: {} step: {}".format(channel, alertId, interval, step))
        matchingAlert = next((alert for alert in self.alertList if alert['id'] == alertId), None)
        if not matchingAlert is None:
            query_args = {
                'interval' : matchingAlert['interval'],
                'start_time' : matchingAlert['start_time'],
                'end_time' : matchingAlert['end_time'],
                'query' : matchingAlert['query'],
            }
            prom_api = PromAPI(endpoint=matchingAlert['prometheus_url'])
            if interval :
                try:
                    # Override the config window with "last <interval>",
                    # clamped to at least one minute.
                    dur = parse_go_duration(interval)
                    if dur < 60 :
                        dur = 60
                    end = 0
                    start = -1 * dur
                    query_args['start_time'] = start
                    query_args['end_time'] = end
                except Exception:
                    # Unparseable interval: keep the config's window.
                    pass
            if step :
                query_args['interval'] = step
            self.log.debug("QUERY_ARGS {} FROM {} {}".format(query_args, interval, step))
            ret = prom_api.query_range(
                query=query_args['query'],
                start=query_args['start_time'],
                end=query_args['end_time'],
                duration=query_args['interval'])
            # Only graph when the response holds at least one series with at
            # least one sample; anything else is sent back as raw text.
            if 'status' in ret and ret['status'] == 'success' and 'data' in ret and 'result' in ret['data'] and len(ret['data']['result']) > 0 and 'values' in ret['data']['result'][0] and ret['data']['result'][0]['values'] is not None and len(ret['data']['result'][0]['values']) > 0:
                # Map sample timestamp -> value (first series only).
                resultsForGraph = {}
                for row in ret['data']['result'][0]['values']:
                    resultsForGraph[row[0]] = row[1]
                # Re-key by human-readable clock time for the x axis.
                finalResults = {}
                for res in resultsForGraph:
                    finalResults[time.strftime('%H:%M:%S', time.localtime(res))] = float(resultsForGraph[res])
                plt.clf()
                plt.plot(list(finalResults.keys()), list(finalResults.values()))
                plt.suptitle(alertId + " (all times UTC)")
                # Thin the x-axis labels down to roughly five ticks.
                if len(finalResults.keys()) > 5:
                    tickTuples = [(index, x) for index, x in enumerate(finalResults.keys()) if index % int(len(finalResults.keys()) / 5) == 0]
                    tickList = []
                    for pair in tickTuples:
                        tickList.append(pair[1])
                    plt.xticks(ticks = tickList, rotation='vertical')
                else:
                    plt.xticks(rotation='vertical')
                plt.ylim(bottom = 0)
                plt.subplots_adjust(bottom=0.2)
                # Render to an in-memory PNG and upload it to Slack.
                pngData = io.BytesIO()
                fig = plt.gcf()
                fig.savefig(pngData, format = 'png')
                self.sendGraph(channel, pngData)
            else:
                self.log.debug("didn't meet criteria")
                self.sendQueryResults(channel, ret)
        else:
            self.sendQueryResults(channel, "Sorry, I couldn't find a matching alert with ID {}".format(alertId))
    def sendQueryResults(self, channelId, queryResults):
        """Post a plain-text message to the given channel."""
        response = requests.post('https://slack.com/api/chat.postMessage',
            headers = {
                'Authorization': "Bearer " + self.token,
                'Content-Type': 'application/json; charset=utf-8'
            },
            json = { 'text': queryResults, 'channel': channelId }
        )
        self.log.debug("slack response: {}".format(response.text))
    def sendGraph(self, channelId, rawData):
        """Upload an in-memory PNG (BytesIO) to the given channel."""
        request = requests.Request('POST', 'https://slack.com/api/files.upload',
            data = { 'token': self.token, 'filetype': 'png', 'channels': channelId },
            files = { 'file': ('graph.png', rawData.getvalue(), 'image/png')}
        ).prepare()
        self.log.debug("headers to send to Slack: {}".format('\r\n'.join('{}: {}'.format(k, v) for k, v in request.headers.items())))
        self.log.debug("body to send to Slack: {}".format(len(request.body)))
        response = requests.Session().send(request)
        self.log.debug("slack response: {}".format(response.text))
    def setAlertList(self, newAlertList):
        """Replace the cached alert config list."""
        # TODO can rework to be a dictionary for faster lookup if necessary
        self.alertList = newAlertList
def parse_go_duration(duration):
    """Convert a Go-style duration string ("30s", "5m", "2h") to seconds.

    Only the s/m/h units are supported; anything else (including bare
    numbers) raises Exception("invalid duration ...").
    """
    duration = str(duration)
    bad = Exception("invalid duration "+duration)
    if not re.match("^[0-9]+[a-z]$", duration):
        raise bad
    scale = {"s": 1, "m": 60, "h": 60*60}.get(duration[-1:])
    if scale is None:
        raise bad
    return scale * int(duration[:-1])

View File

@@ -0,0 +1,14 @@
#! /bin/bash
# Resolve SLACK_API_TOKEN (env -> macOS keychain -> interactive prompt),
# refresh the keychain copy, and launch the Slack bot.

SLACK_API_TOKEN=${SLACK_API_TOKEN:-}
if [ -z "$SLACK_API_TOKEN" ]; then
    SLACK_API_TOKEN=$(security find-generic-password -a "$USER" -s SLACK_API_TOKEN -w 2> /dev/null)
    if [ -z "$SLACK_API_TOKEN" ]; then
        read -s -p "Enter SLACK_API_TOKEN" SLACK_API_TOKEN
        echo ""
    fi
fi

# Re-store the token. Fix: quote the value so tokens containing spaces or
# glob characters are not mangled by word splitting.
security delete-generic-password -a "$USER" -s SLACK_API_TOKEN 2> /dev/null 1>&2
security add-generic-password -a "$USER" -s SLACK_API_TOKEN -w "$SLACK_API_TOKEN" 1>&2

SLACK_BOT_TOKEN=$SLACK_API_TOKEN python3 ./main.py

View File

@@ -0,0 +1,16 @@
import unittest
from message import SlackMessage
class Test_SlackMessage(unittest.TestCase):
    """Validates that SlackMessage rejects payloads missing required fields."""

    def test_basic(self):
        # Fix: the original called self.fail() inside the try block, so the
        # AssertionError it raised was swallowed by "except Exception" and
        # the test could never fail. assertRaises scopes the expectation to
        # the construction only.
        with self.assertRaises(Exception):
            SlackMessage(dict(data={"hello": "world"}))
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

View File

@@ -0,0 +1,14 @@
import unittest
from receiver import SlackReceiver
from message import SlackMessage
class Test_SlackReceiver(unittest.TestCase):
    """End-to-end check of SlackReceiver.parse + receive on a direct message.

    Fixes over the original test: SlackReceiver takes (token, log, callback),
    not two arguments, and parse takes a payload dict positionally - the old
    test raised TypeError before asserting anything. A channel starting with
    'DP' (a DM) is needed for parse to accept a message without a ping.
    """

    def test_basic(self):
        class _NullLog:
            # Minimal logger double; the receiver only calls .debug().
            def debug(self, *args, **kwargs):
                pass

        calls = []
        receiver = SlackReceiver("token", _NullLog(),
                                 lambda *args: calls.append(args))
        msg = receiver.parse(
            {"data": {"user": "u", "text": "text", "channel": "DP123"}})
        self.assertTrue(isinstance(msg, SlackMessage))
        receiver.receive(msg)
        self.assertEqual(calls, [("DP123", "text", None, None)])
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

View File

@@ -0,0 +1,35 @@
import unittest
import sender
class Test_Compute_Relative_Time(unittest.TestCase):
    """Table-driven check of sender.parse_go_duration for s/m/h units."""

    def test_basic(self):
        class MockDateTime():
            # NOTE(review): parse_go_duration never consults datetime, so
            # this patching appears vestigial; preserved so behavior is
            # identical to the original test.
            t = 1000000

            def __init__(self):
                self.datetime = self

            def fromtimestamp(self, t):
                self.t = t

            def timestamp(self):
                return self.t

            def utcnow(self):
                return self

        mock_date_time = MockDateTime()
        saved = sender.datetime
        sender.datetime = mock_date_time
        cases = [
            ("1s", 1), ("5s", 5), ("0s", 0),
            ("0m", 0), ("1m", 60), ("9m", 9 * 60),
            ("0h", 0), ("1h", 60 * 60), ("9h", 9 * 60 * 60),
        ]
        for duration, expected in cases:
            self.case(duration, expected)
        sender.datetime = saved

    def case(self, duration, expected):
        """Assert one duration-string -> seconds conversion."""
        self.assertEqual(sender.parse_go_duration(duration), expected)
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

View File

@@ -0,0 +1,11 @@
PyYAML
pip
setuptools
requests
pyaml
sanic
statsd-tags
redis
certifi
slackclient
matplotlib

View File

@@ -0,0 +1,62 @@
#!/bin/ash
# Container entrypoint: start supporting daemons, template service.yaml from
# the environment, then run the AoM service (or the test suite when TEST is
# set).

# Keep redis alive; restart it if it ever exits.
(
while true; do
    redis-server
    sleep 10
done
) &

/usr/src/app/echo-server &
/usr/src/app/echo-server -p 443 &
/usr/src/app/consul &

# Default values
KAIROSDB_URL=${KAIROSDB_URL:-http://kairosdb-metrics.service.eng.consul:8080/}
SMTP_SERVER=${SMTP_SERVER:-internal-smtp1-app.eng.qops.net:2525}
SENSU_URL=${SENSU_URL:-https://sensu-api.eng.qops.net:443/results}
# Fix: removed commented-out defaults that embedded a real Slack token and
# production endpoints; secrets must come from the environment only.
SLACK_TOKEN=${SLACK_TOKEN:-na}
VICTOROPS_URL=${VICTOROPS_URL:-http://localhost:41912/}
CONSUL_URL=${CONSUL_URL:-http://localhost:41912/}
AOM_GRAFANA_URL=${AOM_GRAFANA_URL:-http://localhost:41912/}
UCHIWA_URL=${UCHIWA_URL:-http://localhost:41912/}
export AOM_GRAFANA_URL

# Update config: substitute the {{{PLACEHOLDER}}} tokens in service.yaml.
sed -i "s#{{{KAIROSDB_URL}}}#${KAIROSDB_URL}#g" service.yaml
sed -i "s#{{{VICTOROPS_URL}}}#${VICTOROPS_URL}#g" service.yaml
sed -i "s#{{{SLACK_TOKEN}}}#${SLACK_TOKEN}#g" service.yaml
sed -i "s#{{{SMTP_SERVER}}}#${SMTP_SERVER}#g" service.yaml
sed -i "s#{{{CONSUL_URL}}}#${CONSUL_URL}#g" service.yaml
sed -i "s#{{{SENSU_URL}}}#${SENSU_URL}#g" service.yaml
sed -i "s,{{{UCHIWA_URL}}},${UCHIWA_URL},g" service.yaml

# Starting service
if [ -n "${TEST}" ]; then
    sed -i '/alert_reload_interval:/ s/[0-9]\+/30/g' service.yaml
    python3 /usr/src/app/aom_service.py &
    sleep 17
    echo "Making current server leader"
    curl localhost:8080/override?enable=true
    echo "Starting the service"
    curl localhost:8080/healthcheck
    # Fix: the original ran "exec python3 aom_test.py", which replaced this
    # shell, so the exit-code handling below could never run.
    if python3 /usr/src/app/aom_test.py; then
        cat /usr/src/app/logs/aom_service.log
        echo "Test succeeded. Exiting"
        exit 0
    else
        cat /usr/src/app/logs/aom_service.log
        echo "Test failed!"
        exit 1
    fi
else
    # Fix: "exec cmd &" only execs a throwaway subshell; plain & backgrounds
    # the reporter as intended.
    python3 /usr/src/app/reporter/incoming/main.py &
    exec python3 /usr/src/app/aom_service.py
fi

View File

@@ -0,0 +1,27 @@
#=======================#
# External service URLs and auth tokens (templated in by the entrypoint)
#=======================#
kairosdb_url: "{{{KAIROSDB_URL}}}"
victorops_url: "{{{VICTOROPS_URL}}}"
slack_url: "https://slack.com/api/chat.postMessage"
slack_token: "{{{SLACK_TOKEN}}}"
smtp_server: "{{{SMTP_SERVER}}}"
consul_url: "{{{CONSUL_URL}}}"
sensu_endpoint: "{{{SENSU_URL}}}"
uchiwa_url: "{{{UCHIWA_URL}}}"
#=======================#
# Logging Information
#=======================#
log_path: "logs/aom_service.log"
#=======================#
# alerts configurations
#=======================#
alert_folder: "alert_configs"
alert_routing_lookup: "alert_routing_lookup"
alert_reload_interval: 300
#=======================#
# request timeout value
#=======================#
timeout: 90

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,99 @@
import unittest
import service
class TestMockRedis(unittest.TestCase):
    """Covers service.get_redis_client's no-op fallback client and the
    shared TestRedis double defined below."""

    def test_base(self):
        from redis import Redis
        client = service.get_redis_client()
        # Outside the container there is no real redis to connect to, so the
        # service hands back its MockRedis stand-in.
        self.assertTrue(not isinstance(client, Redis))
        self.assertEqual(client.get("a"), None)
        self.assertEqual(client.set("a", "b"), None)
        self.assertEqual(client.call("KEYS", "b*"), [])

    def test_test(self):
        double = TestRedis()
        double.set("a", "b")
        self.assertEqual(double.get("a"), "b")
        self.assertEqual(double.call("KEYS", "a*"), ["a"])
# Shared backing store for every TestRedis instance; created lazily.
DB = None
class TestRedis(service.MockRedis):
    """Redis double whose state lives in the module-level DB dict, so every
    instance created during a test shares one store."""

    def __init__(self):
        global DB
        # Lazily create the shared store on first use.
        if DB is None:
            DB = {}

    def get(self, key):
        """Return the stored value or None, like redis GET."""
        return DB.get(key)

    def delete(self, key):
        """Remove the key if present, like redis DEL."""
        DB.pop(key, None)

    def set(self, key, value):
        """Store the value, like redis SET."""
        DB[key] = value

    def call(self, cmd, arg):
        """Support only the KEYS command with a prefix glob like "a*".

        Fix: the original compared `cmd is "KEYS"` - string identity, which
        is interpreter-dependent - instead of equality.
        """
        if cmd != "KEYS":
            return None
        prefix = arg.strip("*")
        return [key for key in DB.keys() if key.startswith(prefix)]
class TestSetFiring(unittest.TestCase):
    """Verifies service.set_firing replaces (not appends to) the stored
    firing list for an alert id."""

    def test_base(self):
        # Route the service's redis access to the in-memory double.
        def mock_get_redis_client():
            return TestRedis()
        service.get_redis_client = mock_get_redis_client
        two_datacenters = [
            [0, 0, 0, {"dc": "there"}],
            [0, 0, 0, {"dc": "here"}],
        ]
        service.set_firing("TestSetFiring", two_datacenters)
        self.assertEqual(len(service.list_firing("TestSetFiring")), 2)
        service.set_firing("TestSetFiring", [[0, 0, 0, {"dc": "here"}]])
        self.assertEqual(len(service.list_firing("TestSetFiring")), 1)
class MockResolveDep():
    """Minimal stand-in for a resolved-dependencies object: hands back the
    dependency id list it was constructed with."""

    def __init__(self, l):
        # Parameter name "l" kept for call-site compatibility.
        self.l = l

    def getDependencies(self):
        """Return the dependency id list supplied at construction."""
        return self.l
class TestIsSuppressed(unittest.TestCase) :
    """Exercises the service.is_suppressed / clear_suppressed cycle for an
    alert whose dependency fires in one or more datacenters.

    NOTE(review): with suppressed_occurrences_threshold=2, suppression
    appears to hold for one evaluation and stop on the second repeated
    firing - TODO confirm against service.is_suppressed before relying on
    these comments.
    """
    def test_base(self) :
        # Route the service's redis access to the in-memory TestRedis double.
        def mock_get_redis_client() :
            return TestRedis()
        service.get_redis_client = mock_get_redis_client
        alert_config = {
            'id': "TestIsSuppressed",
            'resolvedDependencies': MockResolveDep(["TestIsSuppressedD", "b", "c"]),
            'suppressed_occurrences_threshold': 2,
        }
        alert_tags = {"dc":"z", "x":"y"}
        # dependency fires one alert, suppress in effect
        service.set_firing("TestIsSuppressedD", [[0, 0, 0, alert_tags]])
        service.clear_suppressed(alert_config, alert_tags)
        self.assertTrue(service.is_suppressed(alert_config, alert_tags))
        # dependency still firing alert, suppress stops
        service.set_firing("TestIsSuppressedD", [[0, 0, 0, alert_tags]])
        service.clear_suppressed(alert_config, alert_tags)
        self.assertFalse(service.is_suppressed(alert_config, alert_tags))
        # dependency in different dc fires alert, suppress in DC1 stops, suppress in DC2 starts
        new_alert_tags = {"dc":"w"}
        service.set_firing("TestIsSuppressedD", [[0, 0, 0, new_alert_tags]])
        service.clear_suppressed(alert_config, new_alert_tags)
        self.assertFalse(service.is_suppressed(alert_config, alert_tags))
        self.assertTrue(service.is_suppressed(alert_config, new_alert_tags))
        # dependencies clear everywhere, suppress stops everywhere
        service.set_firing("TestIsSuppressedD", [])
        service.clear_suppressed(alert_config, [])
        self.assertFalse(service.is_suppressed(alert_config, alert_tags))
        self.assertFalse(service.is_suppressed(alert_config, new_alert_tags))
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()