cold
This commit is contained in:
15
sleeper_agents_aom_engine/.gitignore
vendored
Executable file
15
sleeper_agents_aom_engine/.gitignore
vendored
Executable file
@@ -0,0 +1,15 @@
|
||||
# Created by .ignore support plugin (hsz.mobi)
|
||||
### Vagrant template
|
||||
.vagrant/
|
||||
.idea/
|
||||
build/results
|
||||
logs/
|
||||
|
||||
*.pyc
|
||||
.dockerignore
|
||||
Dockerfile
|
||||
build/builder
|
||||
site-packages.tar.gz
|
||||
|
||||
alert_configs
|
||||
AoM_Configs
|
||||
3
sleeper_agents_aom_engine/.gitmodules
vendored
Executable file
3
sleeper_agents_aom_engine/.gitmodules
vendored
Executable file
@@ -0,0 +1,3 @@
|
||||
[submodule "AlertOnMetrics"]
|
||||
path = AoM_Configs
|
||||
url = ssh://git@gitlab-app.eng.qops.net:10022/sleeper-agents/AlertOnMetrics.git
|
||||
67
sleeper_agents_aom_engine/.jenkins/JenkinsFile
Executable file
67
sleeper_agents_aom_engine/.jenkins/JenkinsFile
Executable file
@@ -0,0 +1,67 @@
|
||||
#!/usr/bin/env groovy
// Master-branch pipeline: no build/test work (done during merge request),
// publishes and triggers a Rundeck deploy job when on origin/master.
pipeline {
    agent {label 'nomad-builder'}

    environment {
        DOCKER_HOST = '127.0.0.1:2375'
        WORKSPACE_PATH = "/var/lib/nomad/alloc/${NOMAD_ALLOC_ID}/${NOMAD_TASK_NAME}${WORKSPACE}"
    }
    stages {
        stage('Info') {
            steps {
                sh script: 'hostname'
                echo "WORKSPACE_PATH: $WORKSPACE_PATH"
            }
        }
        stage('Build') {
            steps {
                echo "No build required"
            }
        }
        stage('Test') {
            steps {
                echo "Test done during merge request"
                //sh script: 'cd build; ./test_changed.sh "${WORKSPACE_PATH}"'
            }
        }
        stage('Deploy') {
            steps {
                script {
                    if ("$GIT_BRANCH" == "origin/master"){
                        echo "Running publish script"
                        sh script: './publish.sh'
                        echo "Triggering Rundeck job"
                        script {
                            step([$class: 'RundeckNotifier', includeRundeckLogs: true, jobId: 'c5323400-0d97-4488-8cf2-1d736a5f7fb9', nodeFilters: '', options: '', rundeckInstance: 'team-rundeck -- techops', shouldFailTheBuild: true, shouldWaitForRundeckJob: true, tags: '', tailLog: false])
                        }
                    }
                    else {
                        echo "No deploy step required."
                    }
                }
            }
        }
    }
    post {
        success {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test passed, update commit with green checkbox
            }
            // Notify Eng Viz of successful build
            // slackSend color: 'good', message: "Passed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
        failure {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test failed, update commit status with red x
                error("Build failed, check ${BUILD_URL} for details.")
            }
            // On failure send an email to Eng Vis.
            // Fix: Groovy single-quoted strings do NOT interpolate, so the
            // original sent the literal text "${BUILD_URL}"/"${JOB_NAME}".
            // Double quotes make the placeholders expand; also fixed the
            // "or details" typo.
            mail body: "Please check ${BUILD_URL} for details.",
                subject: "Jenkins job ${JOB_NAME} build #${BUILD_NUMBER} failed",
                from: 'Jenkins',
                to: 'eng-visibility@qualtrics.com'
            // Finally send a warning message to Eng Vis slack channel
            slackSend color: 'warn', message: "Failed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
    }
}
|
||||
58
sleeper_agents_aom_engine/.jenkins/JenkinsFileMR
Executable file
58
sleeper_agents_aom_engine/.jenkins/JenkinsFileMR
Executable file
@@ -0,0 +1,58 @@
|
||||
#!/usr/bin/env groovy
// Merge-request pipeline: builds the AOM container, runs it in test mode
// against a telegraf alias, then tears the image down. No deploy step.
pipeline {
    agent {label 'nomad-builder'}

    environment {
        DOCKER_HOST = '127.0.0.1:2375'
        WORKSPACE_PATH = "/var/lib/nomad/alloc/${NOMAD_ALLOC_ID}/${NOMAD_TASK_NAME}${WORKSPACE}"
    }
    stages {
        stage('Info') {
            steps {
                sh script: 'hostname'
                echo "WORKSPACE_PATH: $WORKSPACE_PATH"
            }
        }
        stage('Build') {
            steps {
                echo "Building AOM container"
                sh script: 'docker build . -t aom_test_container'
            }
        }
        stage('Test') {
            steps {
                echo "Launching container on test mode. It will take a few minutes."
                sh script: 'docker run -e TEST=true -h $(hostname) --add-host=\"telegraf:$(nslookup jenkins.eng.qops.net|grep Server | awk \'{print $2}\')\" aom_test_container'
                echo "Removing docker image and container"
                sh script: 'docker rmi -f aom_test_container'
            }
        }
        stage('Deploy') {
            steps {
                echo "No deploy step required for Merge Request"
            }
        }
    }
    post {
        success {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test passed, update commit with green checkbox
            }
            // Notify Eng Viz of successful build
            // slackSend color: 'good', message: "Passed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
        failure {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test failed, update commit status with red x
                error("Build failed, check ${BUILD_URL} for details.")
            }
            // On failure send an email to Eng Vis.
            // Fix: use double-quoted GStrings so ${BUILD_URL}/${JOB_NAME}
            // actually interpolate (single quotes sent the literal text);
            // also fixed the "or details" typo.
            mail body: "Please check ${BUILD_URL} for details.",
                subject: "Jenkins job ${JOB_NAME} build #${BUILD_NUMBER} failed",
                from: 'Jenkins',
                to: 'eng-visibility@qualtrics.com'
            // Finally send a warning message to Eng Vis slack channel
            // slackSend color: 'warn', message: "Failed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
    }
}
|
||||
8
sleeper_agents_aom_engine/README.md
Executable file
8
sleeper_agents_aom_engine/README.md
Executable file
@@ -0,0 +1,8 @@
|
||||
# IMPORTANT NOTICE:
|
||||
|
||||
Alert configurations have been moved to [AlertOnMetrics](https://gitlab-app.eng.qops.net/engvis/AlertOnMetricsConfigs).
|
||||
|
||||
This will allow more flexibility to the project. Merge requests will
|
||||
be automatically validated, merged, and deployed if they pass the
|
||||
validation stage.
|
||||
240
sleeper_agents_aom_engine/aom_service.py
Executable file
240
sleeper_agents_aom_engine/aom_service.py
Executable file
@@ -0,0 +1,240 @@
|
||||
#!/usr/bin/python3
|
||||
""" Alert On Metrics Project"""
|
||||
|
||||
import logging
|
||||
import multiprocessing
|
||||
import json
|
||||
import base64
|
||||
import os
|
||||
import subprocess
|
||||
from time import time, sleep
|
||||
import requests
|
||||
import yaml
|
||||
import traceback
|
||||
from sanic import Sanic, response
|
||||
from library.args import get_service_args
|
||||
from serviceapp import service
|
||||
from library.config import glob_the_configs
|
||||
from library.logger import AlertLogging
|
||||
|
||||
LOG = AlertLogging('aom')
|
||||
LOG.start()
|
||||
LOG.start_log_file("logs/aom_service.log")
|
||||
LOG.start_debug()
|
||||
|
||||
APP = Sanic()
|
||||
SERVICE_JOB = multiprocessing.Value('i', 0)
|
||||
NUM_JOBS = multiprocessing.Value('i', 0)
|
||||
LEADERSHIP = multiprocessing.Value('i', 0)
|
||||
LEADER_STATUS = None
|
||||
LEADER_TIME = None
|
||||
CONSUL_URL = None
|
||||
LEADER_OVERRIDE = None
|
||||
HOSTNAME = None
|
||||
SERVICE_CONFIG = None
|
||||
|
||||
# move to library
|
||||
def dict_compare(d1, d2):
    """Compare two dictionaries by key.

    Returns a 3-tuple of key sets:
        added    -- keys present in d1 but not in d2
        removed  -- keys present in d2 but not in d1
        modified -- keys present in both whose values differ
    """
    keys_left = set(d1)
    keys_right = set(d2)
    shared = keys_left & keys_right
    changed = {key for key in shared if d1[key] != d2[key]}
    return keys_left - keys_right, keys_right - keys_left, changed
||||
|
||||
@APP.route("/")
async def index(_):
    """Report the total number of alert jobs currently tracked.

    NUM_JOBS is a multiprocessing.Value shared with the worker processes;
    reading .value requires no `global` declaration.
    """
    return response.json({"job_count": NUM_JOBS.value})
|
||||
|
||||
@APP.route('/healthcheck')
async def health(request):
    """
    Healthcheck endpoint so that consul and friends see this as a service.

    As a side effect, lazily starts the alerts background job the first
    time the check runs (SERVICE_JOB acts as a started-once flag shared
    across worker processes).

    Returns:
        json object of status: ok
    """
    LOG.debug("healthcheck")
    # TRY TO START SERVICE IF NOT RUNNING. The Process object is only
    # constructed when it will actually be started — the original built a
    # fresh multiprocessing.Process on every healthcheck request and then
    # discarded it.
    if SERVICE_JOB.value == 0:
        LOG.info("Starting alerts background job")
        SERVICE_JOB.value += 1
        service_process = multiprocessing.Process(
            target=start_service,
            args=(LOG, SERVICE_CONFIG['alert_reload_interval']),
            name="service", daemon=False)
        service_process.start()
    return response.json({"status": "ok"}, 200)
|
||||
|
||||
|
||||
# @APP.route("/override")
|
||||
# async def override(request):
|
||||
# """
|
||||
# Sets the LEADER_OVERRIDE global parameter to force an override
|
||||
# """
|
||||
# global LEADER_OVERRIDE
|
||||
# if request.args.get('enable') == 'true':
|
||||
# LEADER_OVERRIDE = True
|
||||
# elif request.args.get('enable') == 'false':
|
||||
# LEADER_OVERRIDE = False
|
||||
# return response.json({"override": LEADER_OVERRIDE})
|
||||
|
||||
|
||||
# def leader():
|
||||
# """
|
||||
# Needs to be implemented that goes out to consul and checks if node is leader,
|
||||
# or if there is no leader volunteers itself.
|
||||
# Returns:
|
||||
# bool of True or False.... once the logic gets worked out
|
||||
# """
|
||||
# global LEADER_STATUS, LEADER_TIME
|
||||
# # CHECK IF THERE IS AN ARGUMENT FOR OVERRIDING THE CHECK LEADER
|
||||
# if LEADER_OVERRIDE is True:
|
||||
# return True
|
||||
|
||||
# # CHECK IF LEADER_TIME IS SET AND THAT IT'S LESS THAN 30 SECONDS FROM LAST SET
|
||||
# if LEADER_TIME is None or time() - LEADER_TIME > 60:
|
||||
# LOG.info("Cache has expired or was not set")
|
||||
# box_hostname = os.environ['HOSTNAME'] if HOSTNAME is None else HOSTNAME
|
||||
# LOG.info("Getting Leader Election status")
|
||||
|
||||
# # RIGHT NOW IN THE CONFIG THIS IS HARD SET TO THE CONSUL1-APP.ENG.OPS.NET
|
||||
# try:
|
||||
# r = requests.get(CONSUL_URL, timeout=60)
|
||||
# assert r.status_code == 200, "Failed to get back a 200 from consul."
|
||||
# LOG.info("Verify that the Value is {}".format(box_hostname))
|
||||
|
||||
# # THE VALUE BACK IS A BASE64 ENCODED BYTE, THAT NEEDS TO BE DECODED,
|
||||
# # TURNED TO A STRING, THEN TO A DICT
|
||||
# value = json.loads(base64.b64decode(r.json()[0]['Value']).decode('utf-8'))
|
||||
|
||||
# # CACHE THE VALUE AND TIMESTAMP
|
||||
# if value['HostName'] == box_hostname:
|
||||
# LEADER_STATUS = True
|
||||
# LEADER_TIME = time()
|
||||
# return True
|
||||
# else:
|
||||
# return False
|
||||
# except TimeoutError:
|
||||
# LOG.error("Timed out connecting to Consul")
|
||||
# return LEADER_STATUS
|
||||
# else:
|
||||
# return LEADER_STATUS
|
||||
|
||||
|
||||
def start_service(log, reload_interval):
    """
    Run the alert scheduling loop.

    Every `reload_interval` seconds the alert configs are re-globbed and
    the pool of worker processes (one per alert config) is reconciled:
    removed/modified configs have their processes killed, added/modified
    configs get new processes started.

    Args:
        log: logger shared with the worker processes
        reload_interval: seconds to sleep between config reloads
    Returns:
        False when the loop exits (kept from the old leader-election flow)
    """
    jobs = []
    alert_list = []
    alert_hash = {}
    box_hostname = os.environ['HOSTNAME'] if HOSTNAME is None else HOSTNAME
    # Any TEST env var switches off production mode (used by the MR pipeline).
    production_mode = "TEST" not in os.environ
    log.info("Waiting 15s for Consul service to pass")
    #sleep(15)
    while True:
        try:
            alert_list = glob_the_configs(SERVICE_CONFIG['alert_folder'],
                                          SERVICE_CONFIG['alert_routing_lookup'],
                                          'http://consul.service.consul:8500',
                                          box_hostname, log)
        except Exception:
            log.error("Failed to load config files: {}".format(traceback.format_exc()))
        log.info("Found {} alerts".format(len(alert_list)))
        # Index configs by id, dropping duplicates.
        new_alert_hash = {}
        for alert_config in alert_list:
            if alert_config['id'] in new_alert_hash:
                log.info("Duplicate alert id found: {}. "
                         "Ignoring one of them.".format(alert_config['id']))
            else:
                new_alert_hash[alert_config['id']] = alert_config

        added, removed, modified = dict_compare(new_alert_hash, alert_hash)
        log.info("Added alerts {}".format(added))
        log.info("Removed alerts {}".format(removed))
        log.info("Modified alerts {}".format(modified))

        # PROCESSES TO KILL
        for al_config in removed.union(modified):
            position = None
            # Find if the config's process is currently running
            for i, job in enumerate(jobs):
                if job.name == al_config and job.is_alive():
                    position = i
                    break
            if position is None:
                # Fix: the original indexed jobs[position] unconditionally,
                # raising TypeError when no live process matched this config.
                log.info("No running process found for config: {}".format(al_config))
                continue
            # Terminate process and remove it from the list
            log.info("Stopping config: {}".format(jobs[position].name))
            # job.terminate() interferes with the server, so SIGKILL directly.
            subprocess.call(["/bin/kill", "-9", "{}".format(jobs[position].pid)])
            jobs[position].join()
            NUM_JOBS.value -= 1
            log.info("Process stopped succesfully")
            jobs.pop(position)

        # PROCESSES TO START — ordered by number of resolved dependencies so
        # that configs with fewer dependencies start first. (Fix: the original
        # computed this sorted list but then iterated the unsorted set.)
        def _dependency_count(alert_id):
            # resolvedDependencies is attached during config loading; treat a
            # missing value as "no dependencies" instead of crashing on None.
            resolved = new_alert_hash[alert_id].get('resolvedDependencies')
            return len(resolved.getDependencies()) if resolved else 0

        alert_configurations = sorted(added.union(modified), key=_dependency_count)
        for al_config in alert_configurations:
            if new_alert_hash[al_config].get('query_type') == 'prometheus':
                p = multiprocessing.Process(
                    target=service.check_prometheus_alert,
                    args=(new_alert_hash[al_config], SERVICE_CONFIG, log, production_mode),
                    name=al_config, daemon=True)
            else:
                p = multiprocessing.Process(
                    target=service.check_kairosdb_alert,
                    args=(new_alert_hash[al_config], SERVICE_CONFIG, log, production_mode),
                    name=al_config, daemon=True)
            jobs.append(p)
            log.info("Starting new config: {}".format(p.name))
            p.start()
            NUM_JOBS.value += 1

        # store current list
        alert_hash = new_alert_hash.copy()
        log.info("Total number of jobs: {}".format(NUM_JOBS.value))
        service.send_stat('total_jobs', NUM_JOBS.value, dict(), statprefix='aom')
        if added:
            service.send_stat('new_jobs', len(added), dict(), statprefix='aom')
        if modified:
            service.send_stat('modified_jobs', len(modified), dict(), statprefix='aom')
        if removed:
            service.send_stat('removed_jobs', len(removed), dict(), statprefix='aom')
        sleep(reload_interval)

    # NOTE(review): unreachable while the loop above runs forever; kept from
    # the old leader-election flow which could break out of the loop.
    log.info("No longer leader. Exiting alerts background job")
    for job in jobs:
        # job.terminate() causes the server to stop
        subprocess.call(["/bin/kill", "-9", "{}".format(job.pid)])
        NUM_JOBS.value -= 1
    SERVICE_JOB.value = 0
    return False
|
||||
|
||||
if __name__ == "__main__":
    # GET ARGS AND START LOGGING
    ARGS = get_service_args()
    logging.setLoggerClass(AlertLogging)
    LOG.info("Starting Service")
    # GET SERVICE CONFIG
    LEADER_OVERRIDE = ARGS['override']
    HOSTNAME = ARGS['hostname']
    # Use a context manager so the config file handle is closed
    # deterministically (the original leaked the open() handle).
    with open('service.yaml', 'r') as config_file:
        SERVICE_CONFIG = yaml.safe_load(config_file.read())
    # Command-line overrides for the config folders, mainly for testing.
    if ARGS['alert_configs'] is not None:
        SERVICE_CONFIG['alert_folder'] = ARGS['alert_configs']
    if ARGS['alert_routing_lookup'] is not None:
        SERVICE_CONFIG['alert_routing_lookup'] = ARGS['alert_routing_lookup']
    # SET CONSUL URL FOR LEADER CHECK
    CONSUL_URL = SERVICE_CONFIG['consul_url']
    # START THE MAIN SERVICE
    APP.run(host="0.0.0.0", port=ARGS['port'])
|
||||
121
sleeper_agents_aom_engine/aom_test.py
Executable file
121
sleeper_agents_aom_engine/aom_test.py
Executable file
@@ -0,0 +1,121 @@
|
||||
import json
|
||||
import time
|
||||
|
||||
import requests
|
||||
import yaml
|
||||
|
||||
# Load the tester's service configuration once at import time.
# Fixes: yaml.load without an explicit Loader is deprecated and unsafe
# (it can construct arbitrary Python objects) — safe_load is the correct
# call for plain config data; the file handle is now closed via `with`
# instead of leaking.
with open('service.yaml', 'r') as _service_yaml:
    service_config = yaml.safe_load(_service_yaml.read())
kairos_url = service_config['kairosdb_url'] + "api/v1/datapoints/"
kairos_query = kairos_url + "query"
metrics_list = []
status1 = "RECOVERY"
status2 = "WARNING"
status3 = "CRITICAL"

# Fragments of the KairosDB datapoint payload; the datapoint list is
# stitched in between them at run time.
json_string1 = """{"name": "aom_test_metric","datapoints": """
json_string2 = ""","tags": {"host": "aom_host","data_center": "AOM"},"ttl": 500}"""


# WRITE ALERT CONFIG FILE


alert_file = {'alerts': {'sensu': {'slack': 'aom_test_channel'}},
              'critical_lower_threshold': 100,
              'critical_upper_threshold': 5000,
              'id': 'test_metric',
              'interval': 30,
              'occurrences_threshold': 1,
              'query': {'cache_time': 0,
                        'end_relative': {'unit': 'seconds', 'value': '30'},
                        'metrics': [{'name': 'aom_test_metric', 'tags': {}}],
                        'start_relative': {'unit': 'seconds', 'value': '60'}},
              'tags': {},
              'url': 'AOM_TESTING',
              'warning_lower_threshold': 1000,
              'warning_upper_threshold': 2000}

# KairosDB query template; the alert status name (RECOVERY/WARNING/CRITICAL)
# is spliced between intro and outro to form the full metric name.
query_intro = """{
    "metrics": [
        {
            "tags": {
                "alert": [
                    "test_metric"
                ]
            },
            "name": "telegraf.aom_"""

query_outro = """_value",
            "aggregators": [
                {
                    "name": "sum",
                    "align_sampling": true,
                    "sampling": {
                        "value": "9",
                        "unit": "minutes"
                    },
                    "align_start_time": false
                }
            ]
        }
    ],
    "cache_time": 0,
    "start_relative": {
        "value": "8",
        "unit": "minutes"
    }
}"""
|
||||
|
||||
|
||||
def main():
    """Drive an end-to-end AOM test.

    Writes a test alert config, feeds known datapoints into KairosDB,
    waits for the service to evaluate them, then verifies the alert
    status counters. Returns True on success, False on a hard failure.
    """
    # noinspection PyBroadException
    try:
        with open('alert_configs/test.yaml', 'w') as yaml_file:
            yaml.dump(alert_file, yaml_file, default_flow_style=False)
    except Exception:
        print("Error writing alert config file")
        return False

    # Datapoints spaced 32s apart so each lands in its own check interval;
    # values are chosen to cross the warning/critical thresholds.
    stamp = int(time.time() * 1000)
    for sample in (1501, 202, 23, 1504, 2005, 5006, 1507):
        metrics_list.append([stamp, sample])
        stamp += 32000

    payload = json_string1 + str(metrics_list) + json_string2
    try:
        ret = requests.post(kairos_url, data=json.dumps(json.loads(payload)), timeout=200)
        assert ret.status_code == 204, "Wrong status code received from KairosDB"
    except AssertionError as e:
        print("Error: {}".format(str(e)))
    except Exception as e:
        print("Problem talking to KairosDB: {}".format(str(e)))
        return False
    print("Metrics sent to KairosDB. Check alerts in the #aom_test_channel in Slack")
    time.sleep(360)

    def _status_count(status_name):
        # Query the counter the service wrote for this alert status.
        reply = requests.post(
            kairos_query,
            data=json.dumps(json.loads(query_intro + status_name + query_outro)),
            timeout=200)
        return dict(reply.json())['queries'][0]['results'][0]['values'][0][1]

    try:
        recovery_count = _status_count(status1)
        print("Recovery {}".format(recovery_count))
        assert recovery_count == 2, "Wrong RECOVERY result"
        warning_count = _status_count(status2)
        print("Warning {}".format(warning_count))
        assert warning_count == 2, "Wrong WARNING result"
        critical_count = _status_count(status3)
        print("Critical {}".format(critical_count))
        assert critical_count == 4, "Wrong CRITICAL result"
    except AssertionError as e:
        print("Error: {}".format(str(e)))
    except Exception as e:
        print("Problem getting results from KairosDB: {}".format(str(e)))
        return False
    return True
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
0
sleeper_agents_aom_engine/library/__init__.py
Executable file
0
sleeper_agents_aom_engine/library/__init__.py
Executable file
163
sleeper_agents_aom_engine/library/args.py
Executable file
163
sleeper_agents_aom_engine/library/args.py
Executable file
@@ -0,0 +1,163 @@
|
||||
# Contains the arg parser options.
|
||||
"""Contains the arg parser options."""
|
||||
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
|
||||
def get_builder_args():
    """
    Gets the arguments passed in to the aom_builder main call

    :return: parser object
    """
    parser = argparse.ArgumentParser(
        description="Generates a valid yaml file for alerting on metrics. "
                    "If you are familiar with the yaml structure for an "
                    "alert you don't have to use this builder, it's just "
                    "convenient")
    parser.add_argument('-q', '--query',
                        help="The Kariosdb query string to use")
    parser.add_argument('-i', '--interval', type=int, default=60,
                        help="The interval that the check will This value is in seconds")
    parser.add_argument('-t', '--threshold', '--upperthreshold',
                        help="The upper threshold is the value that when reached will "
                             "cause an depending on the threshold logic. "
                             "Use in conjunction with lower threshold to define a "
                             "normal band.")
    parser.add_argument('-b', '--lowerthreshold',
                        help="The lower threshold is the value that when reached will cause an "
                             "alert depending on the threshold logic"
                             "Use in conjunction with upper threshold to define a normal band.")
    parser.add_argument('-m', '--measure', choices=['gt', 'lt', 'eq'],
                        help="The measure to use to compare the "
                             "threshold to the values of the alerts")
    parser.add_argument('-a', '--alert_config',
                        help='A valid Yaml representation of your alerting block')
    parser.add_argument('-l', '--log_level', type=int, default=0,
                        help="The log level for the aom_builder run. "
                             "[0=Error, 1=Info, 2=Debug]")
    parser.add_argument('-p', '--port', type=int, default=8080,
                        help="The port to run the webapp on")

    return args_to_dict(parser)
|
||||
|
||||
|
||||
def get_tester_service_args():
    """
    Gets arguments passed into aom_tester.py
    Returns: parser object
    """
    parser = argparse.ArgumentParser(
        description="Parameters to start the alerting on metrics dummy tester "
                    "service")
    parser.add_argument('-l', '--log_level', type=int, default=0,
                        help="The log level for the aom_service app"
                             "[0=Error, 1=Info, 2=Debug]")
    parser.add_argument('-a', '--alert_configs', default=None,
                        help="If provided will override the folder location read from the "
                             "config with the value passed in. Is helpful for testing and "
                             "troubleshooting alerts")
    parser.add_argument('--hostname',
                        help="If provided, will override the actual hostname check with this "
                             "value")
    parser.add_argument('-p', '--port', type=int, default=8080,
                        help="The port to run the webapp on")
    return args_to_dict(parser)
|
||||
|
||||
|
||||
def get_service_args():
    """
    Gets arguments passed into aom_service.py
    Returns: parser object
    """
    parser = argparse.ArgumentParser(
        description="Parameters to start the alerting on metrics service")
    parser.add_argument('-l', '--log_level', type=int, default=0,
                        help="The log level for the aom_service app"
                             "[0=Error, 1=Info, 2=Debug]")
    parser.add_argument('-a', '--alert_configs', default=None,
                        help="If provided will override the folder location read from the "
                             "config with the value passed in. Is helpful for testing and "
                             "troubleshooting alerts")
    parser.add_argument('--alert_routing_lookup', default=None,
                        help="If provided will override the folder used to fetch the alerts "
                             "lookup configuration.")
    parser.add_argument('-o', '--override', action='store_true',
                        help="Overrides the check leader election value")
    parser.add_argument('--hostname',
                        help="If provided, will override the actual hostname check with this "
                             "value")
    parser.add_argument('-p', '--port', type=int, default=8080,
                        help="The port to run the webapp on")
    return args_to_dict(parser)
|
||||
|
||||
|
||||
def args_to_dict(parsed_args):
    """
    Converts the argument parser object to a dict

    Args:
        parsed_args: Arg parser object
    Returns:
        Dictionary of arguments
    """
    try:
        arg_list = parsed_args.parse_args()
        # RETURN A DICT OF ARGUMENTS. vars() replaces the manual
        # attribute-copy loop; dict() makes a copy so mutating the result
        # cannot touch the Namespace's own __dict__.
        return dict(vars(arg_list))
    except argparse.ArgumentError:
        # NOTE(review): argparse normally reports bad arguments via
        # SystemExit (parser.error), not ArgumentError — this handler only
        # fires for manually raised ArgumentError. Kept for compatibility.
        parsed_args.print_help()
        sys.exit(1)
|
||||
277
sleeper_agents_aom_engine/library/config.py
Executable file
277
sleeper_agents_aom_engine/library/config.py
Executable file
@@ -0,0 +1,277 @@
|
||||
# config.py
|
||||
"""Functions for loading alert configuration files"""
|
||||
import glob
|
||||
import os
|
||||
import json
|
||||
import hashlib
|
||||
import yaml
|
||||
import requests
|
||||
import traceback
|
||||
|
||||
# import logging
|
||||
# logger = logging.getLogger(__name__)
|
||||
|
||||
DEPENDENCIES_KEY = 'dependencies'
|
||||
|
||||
class AlertWithDependencies:
    """Bookkeeping record for an alert and the alert ids it depends on.

    Used while ordering alert configs by dependency count before start-up.
    Attribute and method names are the public interface and are kept as-is.
    """

    def __init__(self, alertId, dependencies):
        self.alertId = alertId
        self.beenProcessed = False
        self.dependencies = []
        self.addAllDependencies(dependencies)

    def addAllDependencies(self, moreDependencies):
        """Append the given dependency ids; a None argument is a no-op."""
        if moreDependencies is not None:
            self.dependencies.extend(moreDependencies)

    def getDependencies(self):
        """Return the accumulated list of dependency ids."""
        return self.dependencies

    def getAlertId(self):
        """Return this alert's id."""
        return self.alertId

    def hasBeenProcessed(self):
        """Return True once visit() has been called."""
        return self.beenProcessed

    def visit(self):
        """Mark this alert as processed."""
        self.beenProcessed = True
|
||||
|
||||
def md5(fname):
    """Return the hex md5 digest of the file at *fname*.

    Reads in 4 KiB chunks so arbitrarily large files never need to fit
    in memory at once.
    """
    digest = hashlib.md5()
    with open(fname, "rb") as handle:
        for block in iter(lambda: handle.read(4096), b""):
            digest.update(block)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def get_healthy_nodes_and_index(consul_url, hostname, logger):
    """Find healthy AOM nodes in Consul.

    Args:
        consul_url: base URL of the Consul HTTP API
        hostname: this host's name, looked up among the healthy nodes
        logger: logger for error reporting
    Returns:
        (host_index, healthy_count) — index of *hostname* within the
        sorted healthy node list (-1 when unhealthy or unknown) and the
        number of healthy nodes found.
    """
    # Initialise up-front so the return statement is safe even when the
    # first request fails (previously healthy_nodes could be unbound).
    host_index = -1
    healthy_nodes = []
    try:
        # getting all registered nodes from consul
        r = requests.get(
            consul_url +
            '/v1/catalog/service/alert-on-metrics',
            timeout=60)
        assert r.status_code == 200, "Failed to get back a 200 from consul catalog"

        node_list = [elem.get('Node') for elem in json.loads(r.text)]

        # Retrieving healthy nodes
        for node in node_list:
            r2 = requests.get(
                consul_url +
                '/v1/health/node/' +
                node,
                timeout=60)
            # Fix: check the health response (r2), not the catalog one (r).
            assert r2.status_code == 200, "Failed to get back a 200 from consul health"
            healthcheck_list = json.loads(r2.text)
            for check in healthcheck_list:
                if (check.get('CheckID') == 'check_healthcheck_alert-on-metrics_alert-on-metrics' and
                        check.get('Status') == 'passing'):
                    healthy_nodes.append(node)

        try:
            healthy_nodes.sort()
            host_index = healthy_nodes.index(hostname)
        except ValueError:
            logger.error("Host is not healthy")
    except TimeoutError:
        # NOTE(review): requests raises requests.exceptions.Timeout on
        # timeout, not the builtin TimeoutError — confirm which exception
        # is actually expected here.
        logger.error("Timed out connecting to Consul")
    return host_index, len(healthy_nodes)
|
||||
|
||||
|
||||
def distribute_configs(filename, host_index, module, logger):
    """Decide whether this host owns the given alert config file.

    The md5 of the config file name, taken modulo the number of healthy
    nodes, deterministically shards configs across the fleet.

    Args:
        filename: path of the alert config file
        host_index: this host's position in the sorted healthy-node list
        module: total number of healthy nodes (the modulus)
        logger: logger for error reporting
    Returns:
        True when the file hashes to this host's index, else False.
    """
    if module == 0:
        logger.error("No healthy nodes for the service")
        return False
    if host_index == -1:
        logger.error("Host is unhealthy")
        return False
    return int(md5(filename), 16) % module == host_index
|
||||
|
||||
|
||||
def is_valid(alert_config, logger):
    """Checks if alert has all required fields.

    Returns True when the config passes every structural check; otherwise
    logs a warning with the first failure and returns False.
    """
    def _require(condition, message):
        # Explicit raise instead of `assert` so validation still runs under
        # `python -O` (asserts are stripped with optimization enabled, which
        # would have made every config pass as valid).
        if not condition:
            raise ValueError(message)

    try:
        _require(alert_config['alerts'], "No Alerts configured, this is a dead config")
        _require(alert_config['query'], "No Query, this is a dead config")
        #assert alert_config['interval'] >= 30, "Intervals less than 30 are invalid"
        _require(alert_config['id'], "Alert ID is empty, this is a dead config")
        if DEPENDENCIES_KEY in alert_config:
            _require(isinstance(alert_config[DEPENDENCIES_KEY], list),
                     "Dependencies is specified but isn't a list")
        if alert_config.get('query_type') == 'prometheus':
            _require(isinstance(alert_config['query'], str),
                     "Invalid Prometheus query")
        else:
            _require(isinstance(alert_config['query'], dict),
                     "Kairosdb Query string cannot be validated as proper JSON")
            defined_tags = set(alert_config['query']['metrics'][0]['tags'].keys()).union(
                {'', 'dc', 'fqdn'})
            # IF THERE IS AGGREGATION WE HAVE TO ADD THESE TAGS
            if 'group_by' in alert_config['query']['metrics'][0]:
                defined_tags.update(
                    set(alert_config['query']['metrics'][0]['group_by'][0]['tags']))
            # for undefined_tag in set(alert_config['tags']).difference(defined_tags):
            #     print("WARNING! {} tag is not defined on the query. Please make sure it does exist to "\
            #           "prevent empty results".format(undefined_tag))
        # OUR MINIMUM THRESHOLD NEED
        _require('critical_lower_threshold' in alert_config or
                 'critical_upper_threshold' in alert_config or
                 'warning_lower_threshold' in alert_config or
                 'warning_upper_threshold' in alert_config,
                 "Config must have at least one threshold set.")

        # JUST MAKE SURE YOU ARE NOT DOING SOMETHING STUPID WITH WARNING
        # COMING AFTER CRITICAL
        if 'warning_lower_threshold' in alert_config and 'critical_lower_threshold' in alert_config:
            _require(alert_config['critical_lower_threshold'] < alert_config['warning_lower_threshold'],
                     "Lower Critical must be less than Lower Warning")
        if 'warning_upper_threshold' in alert_config and 'critical_upper_threshold' in alert_config:
            _require(alert_config['critical_upper_threshold'] > alert_config['warning_upper_threshold'],
                     "Upper Critical must be greater than Upper Warning")

        if 'lookup' in alert_config['alerts']:
            _require('default' in alert_config['alerts']['lookup'],
                     'No default alert configured for the lookup configuration')
            _require('lookup_file' in alert_config['alerts']['lookup'] or
                     'lookups' in alert_config['alerts']['lookup'],
                     'No lookup configured either in the alert configuration or in a separated file')
            _require('tags' in alert_config['alerts']['lookup'],
                     'No tags configured for the lookup configuration')
            _require(all(isinstance(tag, str)
                         for tag in alert_config['alerts']['lookup']['tags']),
                     'Tags must be valid string')

        # if 'occurrences_threshold' in alert_config:
        #     assert alert_config['occurrences_threshold'] >= 1, \
        #         "Having an occurrences value less than 2 is assumed and pointless to specify"
    except Exception as e:
        # Broad on purpose: a structurally malformed config (KeyError,
        # TypeError, IndexError, ...) must mark the config invalid rather
        # than crash the config loader.
        logger.warning("Invalid config file: {}".format(str(e)))
        return False
    return True
|
||||
|
||||
|
||||
def is_valid_alert_routing_lookup(alert_routing_lookup, alert, logger):
    """Check that a routing-lookup table is properly configured.

    Each entry must name an alert and a tags mapping, every tag key must be
    declared in the alert's ``alerts.lookup.tags`` list, and all tag keys
    must be strings.  Failures are logged at WARNING level.

    Returns:
        bool: True when every entry validates, False otherwise.
    """
    try:
        assert alert_routing_lookup, "No lookup values configured, the configuration is empty."
        for entry in alert_routing_lookup:
            assert 'alert' in entry, "No alert defined for this configuration."
            assert 'tags' in entry, "No tags value defined for this configuration."
            for entry_tag in entry['tags']:
                assert entry_tag in alert['alerts']['lookup']['tags'], "The tag {} is not part of the configuration".format(
                    entry_tag)
            assert all(isinstance(entry_tag, str)
                       for entry_tag in entry['tags']), "Tags must be valid string"
    except AssertionError as err:
        logger.warning("Invalid alert routing config file: {}".format(str(err)))
        return False
    return True
|
||||
|
||||
|
||||
# noinspection PyBroadException
|
||||
def glob_the_configs(
        config_path,
        lookup_config_path,
        consul_url,
        hostname,
        logger):
    """
    Discover, validate and load every ``*.yaml`` alert config under
    config_path.  Only configs assigned to this host (per
    distribute_configs) are loaded.  Configs with a 'lookup' routing
    section get their lookup table resolved (inline, or from a file under
    lookup_config_path) into an 'alert_routing_lookup' dict keyed by
    tag-value tuples.  Dependency graphs are resolved after all configs
    are read.

    Args:
        config_path (string): relative path to the configs
        lookup_config_path (string): base directory holding lookup files
        consul_url (string): url to consul service
        hostname (string): this host's name, used to pick its config shard
        logger:
    Returns:
        List of configs
    """
    invalid_configs = 0
    alert_list = []
    # Where this host sits among the healthy nodes; used below to decide
    # which subset of config files this instance is responsible for.
    host_index, module = get_healthy_nodes_and_index(
        consul_url, hostname, logger)
    alertToAlertWithDependencies = {}
    for config_file in glob.glob(config_path + "/**/*.yaml", recursive=True):
        logger.debug("Found {} config".format(config_file))
        # LOAD CONFIG
        if distribute_configs(
                config_file,
                host_index,
                module,
                logger):
            try:
                alert = yaml.safe_load(open(config_file, 'rb').read())
                if is_valid(alert, logger):
                    if 'lookup' in alert['alerts']:
                        alert_routing_lookup = []
                        is_valid_lookup = True
                        if 'lookup_file' in alert['alerts']['lookup']:
                            # Lookup table lives in a separate YAML file.
                            lookup_path = "{}/{}".format(
                                lookup_config_path, alert['alerts']['lookup']['lookup_file'])
                            if os.path.isfile(lookup_path):
                                alert_routing_lookup = yaml.safe_load(
                                    open(lookup_path, 'rb').read())
                            else:
                                is_valid_lookup = False
                        else:
                            # Lookup table is inlined in the alert config.
                            alert_routing_lookup = alert['alerts']['lookup']['lookups']

                        is_valid_lookup = is_valid_lookup and is_valid_alert_routing_lookup(
                            alert_routing_lookup, alert, logger)

                        if is_valid_lookup:
                            # Index routing entries by the tuple of tag
                            # values (in the order declared under 'tags')
                            # for direct lookup at alert time.
                            alerts_per_tags = {}
                            for alert_configuration in alert_routing_lookup:
                                key = []
                                for tag in alert['alerts']['lookup']['tags']:
                                    key.append(
                                        alert_configuration['tags'].get(tag))
                                alerts_per_tags[tuple(
                                    key)] = alert_configuration['alert']
                            alert['alert_routing_lookup'] = alerts_per_tags
                        else:
                            invalid_configs += 1
                            continue
                    alertWithDependencies = AlertWithDependencies(alert['id'], alert[DEPENDENCIES_KEY] if DEPENDENCIES_KEY in alert else None)
                    alertToAlertWithDependencies[alert['id']] = alertWithDependencies
                    alert['resolvedDependencies'] = alertWithDependencies
                    alert_list.append(alert)
                else:
                    invalid_configs += 1
            except BaseException:
                # A single malformed file must not abort the whole scan.
                logger.error("Error parsing {} config: {}".format(config_file, traceback.format_exc()))
    # validate the dependencies and flesh out the dependency graphs
    logger.debug("Iterating over dependencies")
    for alertId, alertWithDependencies in alertToAlertWithDependencies.items():
        validateDependencies(alertId, alertWithDependencies, alertToAlertWithDependencies, logger)

    logger.info("Invalid configs: {}".format(invalid_configs))
    # NOTE(review): imported lazily, presumably to avoid a circular import
    # between library.config and serviceapp.service — confirm.
    from serviceapp import service
    service.send_stat(
        'invalid_configs',
        invalid_configs,
        dict(),
        statprefix='aom')
    logger.info("Loaded {} configs".format(len(alert_list)))
    return alert_list
|
||||
|
||||
def validateDependencies(alertId, alertWithDependencies, allAlerts, logger):
    """Recursively resolve the transitive dependency set of one alert.

    Args:
        alertId: id of the alert being resolved
        alertWithDependencies (AlertWithDependencies): its wrapper object
        allAlerts (dict): alert id -> AlertWithDependencies for all alerts
        logger:
    Returns:
        The alert's (possibly expanded) dependency collection, or None when
        the alert declares no dependencies.
    """
    if len(alertWithDependencies.getDependencies()) > 0:
        # visit()/hasBeenProcessed() mark nodes as seen, which both avoids
        # redundant work and stops infinite recursion on dependency cycles.
        if not alertWithDependencies.hasBeenProcessed():
            alertWithDependencies.visit()
            # Copy before iterating: addAllDependencies below mutates the
            # underlying collection while we walk it.
            dependencies = list(alertWithDependencies.getDependencies())
            for dependentId in dependencies:
                if dependentId not in allAlerts:
                    # Unknown id: log and keep going rather than failing.
                    logger.info("Invalid dependency of {}: {}".format(alertId, dependentId))
                else:
                    # Fold the dependency's own transitive set into ours.
                    # NOTE(review): the recursive call returns None for
                    # leaf alerts — addAllDependencies presumably tolerates
                    # None; confirm.
                    alertWithDependencies.addAllDependencies(validateDependencies(dependentId, allAlerts[dependentId], allAlerts, logger))
        logger.debug("returning alert {} with dependencies {}".format(alertId, alertWithDependencies.getDependencies()))
        return alertWithDependencies.getDependencies()
    else:
        return None
|
||||
122
sleeper_agents_aom_engine/library/logger.py
Executable file
122
sleeper_agents_aom_engine/library/logger.py
Executable file
@@ -0,0 +1,122 @@
|
||||
# logger.py
|
||||
""" Logging configuration """
|
||||
|
||||
|
||||
import logging
|
||||
import logging.handlers
|
||||
import os
|
||||
|
||||
logging.getLogger('requests').setLevel(logging.ERROR)
|
||||
logging.getLogger('urllib3').setLevel(logging.ERROR)
|
||||
logging.getLogger('werkzeug').setLevel(logging.ERROR)
|
||||
|
||||
|
||||
class SingleLevelFilter(logging.Filter):
    """Log filter that matches records of exactly one level.

    With reject=False only records whose level equals passlevel pass;
    with reject=True every record EXCEPT that level passes.
    """

    def __init__(self, passlevel, reject):
        """
        initilizer(constructor) of the singlelevelfilter
        @param passlevel (int) - the int value of the level of the log
        @param reject (bool) - if true will return if the record level is
        not equal to the passlevel
        @return SingleLevelFilter object
        @note Sets some object parameters
        """
        # Fix: initialize the logging.Filter base class so the standard
        # filter attributes ('name', 'nlen') exist as the logging framework
        # expects.  The original skipped this.
        super().__init__()
        self.passlevel = passlevel
        self.reject = reject

    def filter(self, record):
        """
        Returns True/False depending on parameters
        @param record (LogRecord) - the record being filtered
        @return bool - True/False depending on what self.reject is set to and
        what record.levelno and self.passlevel are set to
        @note This causes either only logging of the exact same level to get
        logged, or only logging other than the same level to get logged
        """
        if self.reject:
            return record.levelno != self.passlevel
        return record.levelno == self.passlevel
|
||||
|
||||
|
||||
class AlertLogging(logging.Logger):
    """
    Class Object to handle the logging of the alert on metrics service
    starts at Error level and can flip on (and add) an additional log file and
    Debug logger as needed.
    """

    def __init__(self, name):
        """
        Inits the formaters and logger.

        @param name (str) - logger name passed through to logging.Logger
        """
        self.name = name
        # Verbose format used by the file handler and the debug stream.
        self.debug_formatter = logging.Formatter(
            "%(asctime)s - [%(levelname)s] - [%(module)s:%(lineno)d] - "
            "%(message)s", "%m-%d %H:%M:%S")

        # Terser format used by the default INFO stream handler (start()).
        self.standard_formatter = logging.Formatter(
            "%(asctime)s - [%(levelname)s] - %(message)s", "%m-%d %H:%M:%S")
        # NOTE(review): return value discarded — presumably just ensures the
        # root logger exists before this one is built; confirm still needed.
        logging.getLogger()
        logging.Logger.__init__(self, name, logging.DEBUG)
        # NOTE(review): registering the class globally from the constructor
        # means every later logging.getLogger(...) creates AlertLogging
        # instances — confirm this side effect is intended.
        logging.setLoggerClass(AlertLogging)

    def start(self):
        """
        Attach a stream handler at INFO level using the standard format.

        Returns:
            AlertLogging: self, allowing AlertLogging(name).start() chaining
        """
        info_handler = logging.StreamHandler()
        info_handler.setLevel(logging.INFO)
        info_handler.setFormatter(self.standard_formatter)
        self.addHandler(info_handler)
        return self

    def start_log_file(self, file_path, mode='a'):
        """
        Creates a separate log file handler (records WARNING and above).
        Parent directories are created if missing.
        Args:
            file_path: path to the log file
            mode: the type of mode to open the file handler with
        Returns:
            None
        """
        self.log_path = file_path
        work_folder = os.path.dirname(file_path)
        if work_folder and not os.path.exists(work_folder):
            os.makedirs(work_folder)
        self.log_handler = logging.FileHandler(file_path, mode)
        self.log_handler.setLevel(logging.WARNING)
        self.log_handler.setFormatter(self.debug_formatter)
        self.addHandler(self.log_handler)

    def stop_log_file(self):
        """
        Closes Log file and sets the handler to None
        Returns:
            None
        """
        self.log_handler.close()
        self.removeHandler(self.log_handler)
        self.log_handler = None

    def start_debug(self):
        """
        Attach a DEBUG-only stream handler; SingleLevelFilter drops every
        record that is not exactly DEBUG.

        Returns:
            None
        """
        self.debug_handler = logging.StreamHandler()
        self.debug_handler.setLevel(logging.DEBUG)
        self.debug_handler.addFilter(SingleLevelFilter(logging.DEBUG, False))
        self.debug_handler.setFormatter(self.debug_formatter)
        self.addHandler(self.debug_handler)

    def stop_debug(self):
        """
        stop the debugger (detach the DEBUG-only handler)
        Returns:
            None
        """
        self.removeHandler(self.debug_handler)
        self.debug_handler = None
||||
83
sleeper_agents_aom_engine/library/prom_api.py
Executable file
83
sleeper_agents_aom_engine/library/prom_api.py
Executable file
@@ -0,0 +1,83 @@
|
||||
from datetime import datetime, timedelta
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
class PromAPI:
    """Thin client for the Prometheus HTTP API."""

    def __init__(self, endpoint='http://127.0.0.1:9090/'):
        """
        :param endpoint: base URL of the Prometheus server
        """
        self.endpoint = endpoint

    @staticmethod
    def _to_timestamp(input_):
        """
        Normalize a time specifier into an ISO-8601 string (without a
        timezone suffix); callers append the trailing 'Z'.

        Accepted forms:
          * datetime        -> its isoformat (assumed naive UTC — TODO confirm)
          * 'now' or 0      -> current UTC time
          * negative number -> now + that many seconds (i.e. in the past)
          * positive number -> treated as an absolute UNIX timestamp
          * numeric string  -> converted to float first

        Bug fix: the original returned raw floats for datetimes and
        positive numbers, which crashed later in query_range()/series()
        when 'Z' was concatenated (float + str TypeError).  It now always
        returns a string, and raises instead of silently returning None.

        :param input_: time specifier (datetime, str, int or float)
        :return: ISO-8601 timestamp string without timezone suffix
        :raises ValueError: when the input cannot be interpreted as a time
        """
        if type(input_) == datetime:
            return input_.isoformat('T')
        if input_ == 'now':
            return datetime.utcnow().isoformat('T')
        if type(input_) is str:
            input_ = float(input_)
        if type(input_) in [int, float]:
            if input_ > 0:
                # Positive values are absolute UNIX timestamps.
                return (datetime(1970, 1, 1) + timedelta(seconds=input_)).isoformat('T')
            if input_ == 0:  # return now
                return datetime.utcnow().isoformat('T')
            if input_ < 0:
                # Negative values are offsets relative to now.
                return (datetime.utcnow() + timedelta(seconds=input_)).isoformat('T')
        raise ValueError("cannot convert {!r} to a timestamp".format(input_))

    def query(self, query='prometheus_build_info'):
        """Run an instant query; return the decoded JSON response."""
        return self._get(
            uri='/api/v1/query',
            params=dict(
                query=query
            )
        )

    def query_range(self, query='prometheus_build_info', start=-60, end='now', duration=60):
        """Run a range query.

        :param query: PromQL expression
        :param start: range start (see _to_timestamp); falsy to omit
        :param end: range end; None to omit
        :param duration: resolution step in seconds; falsy to omit
        :return: decoded JSON response
        """
        params = {
            'query': query
        }
        if end is not None:
            params['end'] = self._to_timestamp(end) + 'Z'
        if start:
            params['start'] = self._to_timestamp(start) + 'Z'
        if duration:
            params['step'] = duration
        # (debug print of params removed — library code should not write
        # to stdout)
        return self._get(
            uri='/api/v1/query_range',
            params=params
        )

    def series(self, match='prometheus_build_info', start=-86400, end='now'):
        """List series matching *match* within the given time window."""
        params = {
            'match[]': match
        }
        if end is not None:
            params['end'] = self._to_timestamp(end) + 'Z'
        if start:
            params['start'] = self._to_timestamp(start) + 'Z'
        return self._get(
            uri='/api/v1/series',
            params=params
        )

    def _get(self, uri, params, method='GET'):
        """Issue a GET against *uri* (joined to the endpoint); return JSON."""
        url = urljoin(self.endpoint, uri)
        assert method == 'GET'
        result = requests.get(
            url=url,
            params=params
        )
        return result.json()
|
||||
47
sleeper_agents_aom_engine/library/test_config.py
Executable file
47
sleeper_agents_aom_engine/library/test_config.py
Executable file
@@ -0,0 +1,47 @@
|
||||
import unittest
|
||||
import config
|
||||
|
||||
class TestAlertWithDependencies(unittest.TestCase):
    """Exercises config.validateDependencies on a small diamond graph."""

    def test_base(self):
        # Graph: A -> C -> D and B -> C -> D.
        self.alertToAlertWithDependencies = {}
        self.alert_list = []
        for alert_id, depends in (("A", ["C"]), ("B", ["C"]), ("C", ["D"]), ("D", None)):
            self.make_alert(alert_id, depends)
        self.validate()
        # Transitive closure: A and B see {C, D}, C sees {D}, D nothing.
        for alert_id, expected in (("A", 2), ("B", 2), ("C", 1), ("D", 0)):
            self.checkDepLen(alert_id, expected)

    def make_alert(self, id, depends):
        """Register one alert dict plus its AlertWithDependencies wrapper."""
        alert = {'id': id, 'dependencies': depends}
        wrapper = config.AlertWithDependencies(
            alert['id'],
            alert[config.DEPENDENCIES_KEY] if config.DEPENDENCIES_KEY in alert else None)
        self.alertToAlertWithDependencies[alert['id']] = wrapper
        alert['resolvedDependencies'] = wrapper
        self.alert_list.append(alert)

    def validate(self):
        """Resolve every registered alert's dependency graph."""
        for alert_id, wrapper in self.alertToAlertWithDependencies.items():
            config.validateDependencies(alert_id, wrapper, self.alertToAlertWithDependencies, MockLogger())

    def checkDepLen(self, id, n):
        """Assert the resolved dependency count for alert *id*."""
        wrapper = self.alertToAlertWithDependencies[id]
        self.assertEqual(len(wrapper.getDependencies()), n)
|
||||
|
||||
class MockLogger:
    """No-op logger stand-in for the config-validation functions.

    Fix: added the missing warning() method — is_valid() and
    is_valid_alert_routing_lookup() call logger.warning(), which the
    original MockLogger did not provide and would have raised
    AttributeError.  The empty __init__ was dropped (default is identical).
    """

    def info(self, *args, **kwargs):
        pass

    def debug(self, *args, **kwargs):
        pass

    def warning(self, *args, **kwargs):
        pass

    def error(self, *args, **kwargs):
        pass


if __name__ == "__main__":
    unittest.main()
|
||||
42
sleeper_agents_aom_engine/publish.sh
Executable file
42
sleeper_agents_aom_engine/publish.sh
Executable file
@@ -0,0 +1,42 @@
|
||||
#!/bin/bash

# Build the alert-on-metrics-app Docker image and push it to the registry
# tagged with both the current git commit and "latest".

GIT_COMMIT=$(git rev-parse HEAD)

if [[ $GIT_COMMIT == "" ]]; then
    echo "--Missing required GIT_COMMIT var. Aborting..."
    exit 1
fi

#Setup useful vars
team="engvis"
app="alert-on-metrics-app"

registryV2="registry-app.eng.qops.net:5001"
pathV2="${registryV2}/${team}/${app}"
commitV2="${pathV2}:${GIT_COMMIT}"
latestV2="${pathV2}:latest"

# In case you use relative paths
DIR=$(cd $(dirname $BASH_SOURCE[0]) && pwd)
cd $DIR

echo "--Publishing $app $GIT_COMMIT"

echo "--Removing old image, so they don't accumulate"
# Allowed to fail (e.g. first build on a fresh host) — set -e comes after.
docker rmi $latestV2

#Now fail if anything doesn't work
set -e

# Optional per-app pre-build hook.
if [ -f $app/build.sh ]
then
    echo "--Running pre build steps"
    $app/build.sh
fi

docker build --pull=true --tag="$commitV2" --tag "$latestV2" .

echo "--Publishing app container"

docker push $commitV2
docker push $latestV2
|
||||
48
sleeper_agents_aom_engine/qvolution.sh
Executable file
48
sleeper_agents_aom_engine/qvolution.sh
Executable file
@@ -0,0 +1,48 @@
|
||||
# Local dev helper: load secrets (env var, macOS keychain, or interactive
# prompt), refresh the config submodule, then build and run the AoM
# container locally and tail its logs.

function _get_and_save_secret() {
    # True when the shell variable named by $1 is non-empty.
    function is_set() {
        local name="$1"
        eval "echo \$$name" | grep . > /dev/null
    }
    local name="$1"
    eval "$name=\${$name:-}"
    if ! is_set $name; then
        # Fall back to the macOS keychain.
        eval "$name=$(security find-generic-password -a $USER -s $name -w 2> /dev/null)"
        if ! is_set "$name"; then
            # Last resort: prompt once and cache the value in the keychain.
            eval "read -s -p 'Enter $name: ' $name" >&2
            eval "security add-generic-password -a $USER -s $name -w \$$name" >&2
            echo "" >&2
        fi
    fi
    eval "echo \$$name"
}
function get_and_save_secret() {
    # Only the final line is the secret; prompts above went to stderr.
    _get_and_save_secret "$@" | tail -n 1
}
SENSU_API_USER="$(get_and_save_secret SENSU_API_USER)"
SENSU_API_PASS="$(get_and_save_secret SENSU_API_PASS)"
SLACK_API_TOKEN="$(get_and_save_secret SLACK_API_TOKEN)"

# NOTE(review): this prints secrets to the terminal — consider removing.
echo SENSU_USER=$SENSU_API_USER >&2
echo SENSU_PASS=$SENSU_API_PASS >&2
echo SLACK_TOKEN=$SLACK_API_TOKEN >&2

# Pull the latest alert configs from the submodule into the build context.
git submodule update --remote
rm -rf alert_configs
cp -r AoM_Configs/alert_configs .
docker build -t aom:dev .

# Replace any previous container and start a fresh one detached.
docker rm -f aom
docker run \
    -e SLACK_API_TOKEN=${SLACK_API_TOKEN} \
    -e API_USER=$SENSU_API_USER \
    -e API_PASS=$SENSU_API_PASS \
    --rm \
    -d \
    -p 8080:8080 \
    --add-host telegraf:10.4.13.53 \
    --name aom \
    --add-host consul.service.consul:127.0.0.1 \
    -h 127.0.0.1 \
    aom:dev &
# Wait for the service to answer, then stream its logs.
until curl localhost:8080/healthcheck; do sleep 1; done
docker logs -f aom
|
||||
0
sleeper_agents_aom_engine/reporter/incoming/__init__.py
Executable file
0
sleeper_agents_aom_engine/reporter/incoming/__init__.py
Executable file
14
sleeper_agents_aom_engine/reporter/incoming/main.py
Executable file
14
sleeper_agents_aom_engine/reporter/incoming/main.py
Executable file
@@ -0,0 +1,14 @@
|
||||
import os
|
||||
import logging
|
||||
from receiver import SlackReceiver
|
||||
from sender import SlackSender
|
||||
|
||||
if __name__ == "__main__":
    # Root logger at DEBUG, streaming to stderr.
    log = logging.getLogger()
    log.setLevel(logging.DEBUG)
    log.addHandler(logging.StreamHandler())

    # Fails fast (KeyError) when SLACK_API_TOKEN is not set.
    slack_token = os.environ["SLACK_API_TOKEN"]
    sender = SlackSender(slack_token, log)
    # The receiver forwards each parsed message to sender.respond and
    # blocks inside start() for the lifetime of the process.
    receiver = SlackReceiver(slack_token, log, sender.respond)
    receiver.start()
|
||||
12
sleeper_agents_aom_engine/reporter/incoming/message.py
Executable file
12
sleeper_agents_aom_engine/reporter/incoming/message.py
Executable file
@@ -0,0 +1,12 @@
|
||||
class SlackMessage:
    """Attribute-style view over the 'data' mapping of a Slack RTM payload.

    Every key of payload['data'] becomes an attribute on the instance.
    Raises AssertionError unless 'text', 'user' and 'channel' are present.
    """

    def __init__(self, payload):
        """
        Args:
            payload: Slack event payload; payload['data'] holds the message.
        Raises:
            AssertionError: when a required field is missing.
        """
        data = payload['data']
        for key in data:
            try:
                # Fix: prefer mapping lookup.  The original tried
                # getattr(data, key) first, which on a dict payload can
                # bind dict *methods* (e.g. a field named "items") instead
                # of the field's value.
                setattr(self, key, data[key])
            except TypeError:
                # Fall back for attribute-style payload objects that are
                # iterable but not subscriptable.
                setattr(self, key, getattr(data, key))
        attrs = dir(self)
        assert "text" in attrs, "no text in message"
        assert "user" in attrs, "no user in message"
        assert "channel" in attrs, "no channel in message"
|
||||
52
sleeper_agents_aom_engine/reporter/incoming/receiver.py
Executable file
52
sleeper_agents_aom_engine/reporter/incoming/receiver.py
Executable file
@@ -0,0 +1,52 @@
|
||||
import slack
|
||||
import os
|
||||
import re
|
||||
import ssl as ssl_lib
|
||||
import certifi
|
||||
|
||||
from message import SlackMessage
|
||||
|
||||
class SlackReceiver() :
    """Listens on the Slack RTM stream and forwards relevant messages.

    parse() drops the bot's own traffic and anything that neither pings
    the bot nor arrives as a direct message; accepted payloads are wrapped
    in SlackMessage and handed to the callback as
    callback(channel, ID, interval, step).
    """

    def __init__(self, token, log, callback) :
        # callback(channel, ID, interval, step) is invoked per message.
        self.token = token
        self.ssl_context = ssl_lib.create_default_context(cafile=certifi.where())
        self.callback = callback
        self.log = log

    def start(self) :
        """Connect the RTM client and block, dispatching 'message' events."""
        self.rtm_client = slack.RTMClient(token=self.token, ssl=self.ssl_context)
        @slack.RTMClient.run_on(event="message")
        def receive(**payload) :
            # Note: self.receive here is the *method* below, not this
            # nested handler — the names collide but do not conflict.
            msg = self.parse(payload)
            if msg is not None:
                self.receive(msg)
        print("Starting")
        self.rtm_client.start()

    def parse(self, payload) :
        """Filter a raw RTM payload; return a SlackMessage or None."""
        self.log.debug("slack message received: {}".format(payload))
        # NOTE(review): 'BNYAX72BB' / 'UNS0QKMMY' are this bot's hard-coded
        # bot-id and user-id — confirm they match the deployed bot.
        if 'data' in payload and 'bot_id' in payload['data'] and payload['data']['bot_id'] == 'BNYAX72BB':
            # it's the bot's response, ignore it
            return None
        if 'data' in payload and 'user' in payload['data'] and payload['data']['user'] == 'UNS0QKMMY':
            # it's the bot uploading files, ignore it
            return None
        # Channels whose id starts with 'DP' are direct messages here —
        # TODO confirm against the workspace's channel-id scheme.
        if 'data' in payload and (
            ('text' in payload['data'] and '<@UNS0QKMMY>' not in payload['data']['text'])
            and ('channel' in payload['data'] and not payload['data']['channel'].startswith('DP'))
        ):
            # message in a channel and the bot wasn't pinged, or was not a direct message - ignore
            self.log.debug("received message, but I wasn't pinged or DM'ed")
            return None
        if 'data' in payload and 'text' in payload['data'] and '<@UNS0QKMMY>' in payload['data']['text']:
            # remove the ping text
            payload['data']['text'] = re.sub('\\<@UNS0QKMMY\\>', '', payload['data']['text'])
        return SlackMessage(payload)

    def receive(self, msg) :
        """Split the message into (ID, interval, step) and invoke callback."""
        text = msg.text.split()
        channel = msg.channel
        # Missing positional words become None; the callback handles that.
        ID = text[0] if len(text) > 0 else None
        interval = text[1] if len(text) > 1 else None
        step = text[2] if len(text) > 2 else None
        self.callback(channel, ID, interval, step)
|
||||
131
sleeper_agents_aom_engine/reporter/incoming/sender.py
Executable file
131
sleeper_agents_aom_engine/reporter/incoming/sender.py
Executable file
@@ -0,0 +1,131 @@
|
||||
import os
|
||||
import sys
|
||||
import requests
|
||||
import traceback
|
||||
import time
|
||||
import io
|
||||
import binascii
|
||||
import logging
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import datetime
|
||||
import re
|
||||
|
||||
rootDirectory = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
sys.path.append(rootDirectory)
|
||||
from library.config import glob_the_configs
|
||||
from library.prom_api import PromAPI
|
||||
|
||||
HOSTNAME = None  # NOTE(review): not referenced in this module — presumably a placeholder set elsewhere; confirm before removing
|
||||
|
||||
|
||||
class SlackSender:
    """Answers Slack requests by querying Prometheus and posting results.

    respond() looks up an alert config by id, runs its PromQL query over
    the requested window, renders a matplotlib graph and uploads it (or
    posts the raw response / an apology message on failure).
    """

    def __init__(self, token, log):
        # token: Slack bot token used by the chat/file upload endpoints.
        self.token = token
        self.alertList = []
        self.log = log
        try:
            # Same config loader the main service uses; consul/hostname are
            # fixed so this process loads every config shard locally.
            self.alertList = glob_the_configs(rootDirectory, \
                rootDirectory + "/AoM_Configs/alert_routing_lookup", \
                'http://consul.service.consul:8500', '127.0.0.1', log)
        except Exception:
            # Start anyway with an empty alert list; respond() will answer
            # "couldn't find a matching alert" for everything.
            log.error("Failed to load config files: {}".format(traceback.format_exc()))

    def respond(self, channel, alertId, interval, step):
        """Handle one request: graph alertId's query over the given window.

        Args:
            channel: Slack channel id to answer in
            alertId: id of the alert config to query
            interval: optional Go-style window ("30m"); defaults per config
            step: optional query resolution step; defaults per config
        """
        self.log.debug("incomming message: channel: {} alert ID: {} interval: {} step: {}".format(channel, alertId, interval, step))
        matchingAlert = next((alert for alert in self.alertList if alert['id'] == alertId), None)
        if not matchingAlert is None:
            # Start from the config's own query window/step.
            query_args = {
                'interval' : matchingAlert['interval'],
                'start_time' : matchingAlert['start_time'],
                'end_time' : matchingAlert['end_time'],
                'query' : matchingAlert['query'],
            }
            prom_api = PromAPI(endpoint=matchingAlert['prometheus_url'])
            if interval :
                try:
                    # Override the window: last <interval> seconds,
                    # clamped to at least one minute.
                    dur = parse_go_duration(interval)
                    if dur < 60 :
                        dur = 60
                    end = 0
                    start = -1 * dur
                    query_args['start_time'] = start
                    query_args['end_time'] = end
                except Exception:
                    # Unparseable interval: silently keep config defaults.
                    pass
            if step :
                query_args['interval'] = step
            self.log.debug("QUERY_ARGS {} FROM {} {}".format(query_args, interval, step))
            ret = prom_api.query_range(
                query=query_args['query'],
                start=query_args['start_time'],
                end=query_args['end_time'],
                duration=query_args['interval'])
            # Only graph when the first result series has at least one value.
            if 'status' in ret and ret['status'] == 'success' and 'data' in ret and 'result' in ret['data'] and len(ret['data']['result']) > 0 and 'values' in ret['data']['result'][0] and ret['data']['result'][0]['values'] is not None and len(ret['data']['result'][0]['values']) > 0:
                # Each row is [timestamp, value].
                resultsForGraph = {}
                for row in ret['data']['result'][0]['values']:
                    resultsForGraph[row[0]] = row[1]
                finalResults = {}
                for res in resultsForGraph:
                    finalResults[time.strftime('%H:%M:%S', time.localtime(res))] = float(resultsForGraph[res])
                plt.clf()
                plt.plot(list(finalResults.keys()), list(finalResults.values()))
                plt.suptitle(alertId + " (all times UTC)")
                # Thin the x-axis down to ~5 labelled ticks.
                if len(finalResults.keys()) > 5:
                    tickTuples = [(index, x) for index, x in enumerate(finalResults.keys()) if index % int(len(finalResults.keys()) / 5) == 0]
                    tickList = []
                    for pair in tickTuples:
                        tickList.append(pair[1])
                    plt.xticks(ticks = tickList, rotation='vertical')
                else:
                    plt.xticks(rotation='vertical')
                plt.ylim(bottom = 0)
                plt.subplots_adjust(bottom=0.2)
                # Render to an in-memory PNG and upload it.
                pngData = io.BytesIO()
                fig = plt.gcf()
                fig.savefig(pngData, format = 'png')
                self.sendGraph(channel, pngData)
            else:
                # No plottable data: post the raw JSON response instead.
                self.log.debug("didn't meet criteria")
                self.sendQueryResults(channel, ret)
        else:
            self.sendQueryResults(channel, "Sorry, I couldn't find a matching alert with ID {}".format(alertId))

    def sendQueryResults(self, channelId, queryResults):
        """Post a plain text/JSON message to a Slack channel."""
        response = requests.post('https://slack.com/api/chat.postMessage',
            headers = {
                'Authorization': "Bearer " + self.token,
                'Content-Type': 'application/json; charset=utf-8'
            },
            json = { 'text': queryResults, 'channel': channelId }
        )
        self.log.debug("slack response: {}".format(response.text))

    def sendGraph(self, channelId, rawData):
        """Upload an in-memory PNG (BytesIO) to a Slack channel."""
        request = requests.Request('POST', 'https://slack.com/api/files.upload',
            data = { 'token': self.token, 'filetype': 'png', 'channels': channelId },
            files = { 'file': ('graph.png', rawData.getvalue(), 'image/png')}
        ).prepare()
        self.log.debug("headers to send to Slack: {}".format('\r\n'.join('{}: {}'.format(k, v) for k, v in request.headers.items())))
        self.log.debug("body to send to Slack: {}".format(len(request.body)))
        response = requests.Session().send(request)
        self.log.debug("slack response: {}".format(response.text))

    def setAlertList(self, newAlertList):
        """Replace the in-memory alert list (e.g. after a config reload)."""
        # TODO can rework to be a dictionary for faster lookup if necessary
        self.alertList = newAlertList
|
||||
|
||||
def parse_go_duration(duration):
    """Convert a Go-style duration string ("30s", "5m", "2h") to seconds.

    Only single-letter lowercase units s/m/h are accepted; anything else
    raises Exception("invalid duration ...").
    """
    text = str(duration)
    error = Exception("invalid duration " + text)
    if not re.match("^[0-9][0-9]*[a-z]$", text):
        raise error
    multipliers = {"s": 1, "m": 60, "h": 60 * 60}
    suffix = text[-1:]
    if suffix not in multipliers:
        raise error
    return int(text[:-1]) * multipliers[suffix]
|
||||
14
sleeper_agents_aom_engine/reporter/incoming/test.sh
Executable file
14
sleeper_agents_aom_engine/reporter/incoming/test.sh
Executable file
@@ -0,0 +1,14 @@
|
||||
#! /bin/bash

# Fetch SLACK_API_TOKEN from the environment or the macOS keychain
# (prompting once if absent), refresh the keychain entry, then run the
# incoming reporter with it.

SLACK_API_TOKEN=${SLACK_API_TOKEN:-}
if [ -z "$SLACK_API_TOKEN" ]; then
    # Try the macOS keychain first.
    SLACK_API_TOKEN=$(security find-generic-password -a $USER -s SLACK_API_TOKEN -w 2> /dev/null)
    if [ -z "$SLACK_API_TOKEN" ]; then
        read -s -p "Enter SLACK_API_TOKEN" SLACK_API_TOKEN
        echo ""
    fi
fi
# Refresh the cached keychain entry with whatever value we ended up with.
security delete-generic-password -a $USER -s SLACK_API_TOKEN 2> /dev/null 1>&2
security add-generic-password -a $USER -s SLACK_API_TOKEN -w $SLACK_API_TOKEN 1>&2

SLACK_BOT_TOKEN=$SLACK_API_TOKEN python3 ./main.py
|
||||
16
sleeper_agents_aom_engine/reporter/incoming/test_message.py
Executable file
16
sleeper_agents_aom_engine/reporter/incoming/test_message.py
Executable file
@@ -0,0 +1,16 @@
|
||||
import unittest
|
||||
|
||||
from message import SlackMessage
|
||||
|
||||
class Test_SlackMessage(unittest.TestCase):
    """Tests for SlackMessage payload validation."""

    def test_basic(self):
        def recv(*args, **payload):
            return SlackMessage(payload)

        # Fix: use assertRaises instead of try/self.fail/except Exception.
        # The old pattern caught the AssertionError raised by self.fail
        # itself, so the test silently passed even when SlackMessage did
        # NOT reject the bad payload.
        with self.assertRaises(Exception):
            recv(data={"hello": "world"})


if __name__ == "__main__":
    unittest.main()
|
||||
14
sleeper_agents_aom_engine/reporter/incoming/test_receiver.py
Executable file
14
sleeper_agents_aom_engine/reporter/incoming/test_receiver.py
Executable file
@@ -0,0 +1,14 @@
|
||||
import unittest
|
||||
|
||||
from receiver import SlackReceiver
|
||||
from message import SlackMessage
|
||||
|
||||
class _NullLog:
    """Minimal logger stand-in: SlackReceiver only calls debug()."""

    def debug(self, *args, **kwargs):
        pass


class Test_SlackReceiver(unittest.TestCase):
    """Smoke test: parse a pinged message and dispatch it to the callback."""

    def test_basic(self):
        captured = []

        def callback(channel, ID, interval, step):
            captured.append((channel, ID, interval, step))

        # Fix: SlackReceiver takes (token, log, callback); the old test
        # passed only two arguments and raised TypeError before testing
        # anything.  parse() also takes the payload dict positionally (the
        # old test called it with keyword args), and only returns a
        # SlackMessage when the bot is pinged or the message is a DM — so
        # the text must contain the bot mention.
        r = SlackReceiver("token", _NullLog(), callback)
        msg = r.parse({"data": {"user": "u", "text": "<@UNS0QKMMY> text", "channel": "channel"}})
        self.assertTrue(isinstance(msg, SlackMessage))
        r.receive(msg)
        self.assertEqual(captured, [("channel", "text", None, None)])


if __name__ == "__main__":
    unittest.main()
|
||||
35
sleeper_agents_aom_engine/reporter/incoming/test_sender.py
Executable file
35
sleeper_agents_aom_engine/reporter/incoming/test_sender.py
Executable file
@@ -0,0 +1,35 @@
|
||||
import unittest
|
||||
import sender
|
||||
|
||||
class Test_Compute_Relative_Time(unittest.TestCase):
    """Table-driven tests for sender.parse_go_duration."""

    def test_basic(self):
        # Fix: removed the MockDateTime swap of sender.datetime — the
        # function under test, parse_go_duration, never touches datetime,
        # so that scaffolding was dead code that only obscured the test.
        self.case("1s", 1)
        self.case("5s", 5)
        self.case("0s", 0)
        self.case("0m", 0)
        self.case("1m", 60)
        self.case("9m", 9 * 60)
        self.case("0h", 0)
        self.case("1h", 60 * 60)
        self.case("9h", 9 * 60 * 60)

    def case(self, duration, expected):
        """Assert that *duration* parses to *expected* seconds."""
        seconds = sender.parse_go_duration(duration)
        self.assertEqual(seconds, expected)


if __name__ == "__main__":
    unittest.main()
|
||||
11
sleeper_agents_aom_engine/requirements.txt
Executable file
11
sleeper_agents_aom_engine/requirements.txt
Executable file
@@ -0,0 +1,11 @@
|
||||
PyYAML
|
||||
pip
|
||||
setuptools
|
||||
requests
|
||||
pyaml
|
||||
sanic
|
||||
statsd-tags
|
||||
redis
|
||||
certifi
|
||||
slackclient
|
||||
matplotlib
|
||||
62
sleeper_agents_aom_engine/run.sh
Executable file
62
sleeper_agents_aom_engine/run.sh
Executable file
@@ -0,0 +1,62 @@
|
||||
#!/bin/ash

# Container entrypoint: start local side-car processes, template
# service.yaml from the environment, then run the AoM service (or the
# integration test when TEST is set).

(
    # Supervise redis: restart it if it ever exits.
    while true; do
        redis-server
        sleep 10
    done
) &
/usr/src/app/echo-server &
/usr/src/app/echo-server -p 443 &
/usr/src/app/consul &

# Default values
KAIROSDB_URL=${KAIROSDB_URL:-http://kairosdb-metrics.service.eng.consul:8080/}
SMTP_SERVER=${SMTP_SERVER:-internal-smtp1-app.eng.qops.net:2525}
SENSU_URL=${SENSU_URL:-https://sensu-api.eng.qops.net:443/results}
# NOTE(review): the commented defaults below embed real-looking credentials
# and internal endpoints; secrets should not live in version control.
#SLACK_TOKEN=${SLACK_TOKEN:-xoxb-76976722775-WY6vtKAk0SQEb8qcbFkLMV81}
#VICTOROPS_URL=${VICTOROPS_URL:-https://alert.victorops.com/integrations/generic/20131114/alert/07f108fe-9183-45c3-a888-19e1432806c5/}
#CONSUL_URL=${CONSUL_URL:-http://consul1-app.eng.qops.net:8500/v1/kv/service/alert-on-metrics/leader-lock}
#AOM_GRAFANA_URL=${AOM_GRAFANA_URL:-https://grafana.eng.qops.net/d/000000113/alert-on-metrics?refresh=1m&orgId=1&var-dc=All&var-fqdn=All&from=now-6h&to=now&var-id=}
#UCHIWA_URL=${UCHIWA_URL:-https://uchiwa-app.eng.qops.net/#/client/EngOps/AOM}

# Localhost stand-ins used unless overridden by the environment.
SLACK_TOKEN=${SLACK_TOKEN:-na}
VICTOROPS_URL=${VICTOROPS_URL:-http://localhost:41912/}
CONSUL_URL=${CONSUL_URL:-http://localhost:41912/}
AOM_GRAFANA_URL=${AOM_GRAFANA_URL:-http://localhost:41912/}
UCHIWA_URL=${UCHIWA_URL:-http://localhost:41912/}

export AOM_GRAFANA_URL

# Update config
# '#' (or ',' for the URL containing '#') is used as the sed delimiter
# because the substituted values are URLs containing '/'.
sed -i "s#{{{KAIROSDB_URL}}}#${KAIROSDB_URL}#g" service.yaml
sed -i "s#{{{VICTOROPS_URL}}}#${VICTOROPS_URL}#g" service.yaml
sed -i "s#{{{SLACK_TOKEN}}}#${SLACK_TOKEN}#g" service.yaml
sed -i "s#{{{SMTP_SERVER}}}#${SMTP_SERVER}#g" service.yaml
sed -i "s#{{{CONSUL_URL}}}#${CONSUL_URL}#g" service.yaml
sed -i "s#{{{SENSU_URL}}}#${SENSU_URL}#g" service.yaml
sed -i "s,{{{UCHIWA_URL}}},${UCHIWA_URL},g" service.yaml
# Starting service

if [ -n "${TEST}" ]; then
    # Shorten the reload interval so the test picks up configs quickly.
    sed -i '/alert_reload_interval:/ s/[0-9]\+/30/g' service.yaml
    python3 /usr/src/app/aom_service.py &
    sleep 17
    echo "Making current server leader"
    curl localhost:8080/override?enable=true
    echo "Starting the service"
    curl localhost:8080/healthcheck
    exec python3 /usr/src/app/aom_test.py
    # NOTE(review): exec replaces this shell, so the pass/fail handling
    # below can never run — it appears to be dead code; confirm intent.
    if [ $? -ne 0 ]; then
        cat /usr/src/app/logs/aom_service.log
        echo "Test failed!"
        exit 1
    else
        cat /usr/src/app/logs/aom_service.log
        echo "Test succeeded. Exiting"
        exit 0
    fi
else
    # NOTE(review): 'exec' combined with '&' runs in a subshell, so this
    # line just backgrounds the reporter rather than replacing the shell —
    # confirm intent.
    exec python3 /usr/src/app/reporter/incoming/main.py &
    exec python3 /usr/src/app/aom_service.py
fi
|
||||
27
sleeper_agents_aom_engine/service.yaml
Executable file
27
sleeper_agents_aom_engine/service.yaml
Executable file
@@ -0,0 +1,27 @@
|
||||
#=======================#
# External service URLs and tokens. The {{{...}}} placeholders are
# substituted by the container entrypoint (sed) before startup.
#=======================#
kairosdb_url: "{{{KAIROSDB_URL}}}"
victorops_url: "{{{VICTOROPS_URL}}}"
slack_url: "https://slack.com/api/chat.postMessage"
slack_token: "{{{SLACK_TOKEN}}}"
smtp_server: "{{{SMTP_SERVER}}}"
consul_url: "{{{CONSUL_URL}}}"
sensu_endpoint: "{{{SENSU_URL}}}"
uchiwa_url: "{{{UCHIWA_URL}}}"

#=======================#
# Logging Information
#=======================#
log_path: "logs/aom_service.log"

#=======================#
# alerts configurations
#=======================#
alert_folder: "alert_configs"
alert_routing_lookup: "alert_routing_lookup"
# Presumably seconds; the test entrypoint rewrites this to 30 -- confirm
# units against the service's reload loop.
alert_reload_interval: 300

#=======================#
# request timeout value (units not stated here -- verify in service code)
#=======================#
timeout: 90
|
||||
0
sleeper_agents_aom_engine/serviceapp/__init__.py
Executable file
0
sleeper_agents_aom_engine/serviceapp/__init__.py
Executable file
1073
sleeper_agents_aom_engine/serviceapp/service.py
Executable file
1073
sleeper_agents_aom_engine/serviceapp/service.py
Executable file
File diff suppressed because it is too large
Load Diff
99
sleeper_agents_aom_engine/serviceapp/test_service.py
Executable file
99
sleeper_agents_aom_engine/serviceapp/test_service.py
Executable file
@@ -0,0 +1,99 @@
|
||||
import unittest
|
||||
|
||||
import service
|
||||
|
||||
class TestMockRedis(unittest.TestCase):
    """Sanity checks for the Redis client double used by the service."""

    def test_base(self):
        # The client returned here is not a real redis.Redis: reads yield
        # None, writes yield None, and KEYS matches nothing.
        from redis import Redis

        client = service.get_redis_client()
        self.assertNotIsInstance(client, Redis)
        self.assertIsNone(client.get("a"))
        self.assertIsNone(client.set("a", "b"))
        self.assertEqual(client.call("KEYS", "b*"), [])

    def test_test(self):
        # The TestRedis double defined below actually stores data.
        client = TestRedis()
        client.set("a", "b")
        self.assertEqual(client.get("a"), "b")
        self.assertEqual(client.call("KEYS", "a*"), ["a"])
|
||||
|
||||
# Module-level store shared by every TestRedis instance, so state survives
# across client objects within one test process.
DB = None


class TestRedis(service.MockRedis):
    """In-memory Redis test double backed by the shared module-level DB dict."""

    def __init__(self):
        global DB
        # Lazily create the shared store on first instantiation; later
        # instances reuse whatever is already there.
        if DB is None:
            DB = {}

    def get(self, key):
        """Return the stored value for *key*, or None when absent."""
        return DB.get(key)

    def delete(self, key):
        """Remove *key* if present; silently ignore missing keys."""
        DB.pop(key, None)

    def set(self, key, value):
        """Store *value* under *key* (returns None, like the real client)."""
        DB[key] = value

    def call(self, cmd, arg):
        """Minimal command dispatch: only ``KEYS <prefix*>`` is supported.

        Returns the matching keys for KEYS, or None for any other command.

        BUG FIX: the original tested ``not cmd is "KEYS"`` -- an identity
        comparison against a string literal that only worked by accident of
        CPython interning (and warns on 3.8+). Compare with ``!=`` instead.
        """
        if cmd != "KEYS":
            return None
        return [k for k in DB if k.startswith(arg.strip("*"))]
|
||||
|
||||
class TestSetFiring(unittest.TestCase):
    """Round-trip service.set_firing / service.list_firing through TestRedis."""

    def test_base(self):
        # Route the service's Redis access through the in-memory double.
        service.get_redis_client = lambda: TestRedis()

        firing_two_dcs = [
            [0, 0, 0, {"dc": "there"}],
            [0, 0, 0, {"dc": "here"}],
        ]
        service.set_firing("TestSetFiring", firing_two_dcs)
        self.assertEqual(len(service.list_firing("TestSetFiring")), 2)

        # A later set_firing replaces the previous firing set rather than
        # accumulating: two entries shrink to one.
        firing_one_dc = [[0, 0, 0, {"dc": "here"}]]
        service.set_firing("TestSetFiring", firing_one_dc)
        self.assertEqual(len(service.list_firing("TestSetFiring")), 1)
|
||||
|
||||
class MockResolveDep:
    """Stand-in dependency resolver that hands back a fixed list."""

    def __init__(self, l):
        # Keep the dependency list exactly as supplied.
        self.l = l

    def getDependencies(self):
        """Return the list given at construction time."""
        return self.l
|
||||
|
||||
class TestIsSuppressed(unittest.TestCase):
    """Exercise dependency-based alert suppression across tag sets (DCs)."""

    def test_base(self):
        # Route the service's Redis access through the in-memory double.
        service.get_redis_client = lambda: TestRedis()

        alert_config = {
            'id': "TestIsSuppressed",
            'resolvedDependencies': MockResolveDep(
                ["TestIsSuppressedD", "b", "c"]
            ),
            'suppressed_occurrences_threshold': 2,
        }
        alert_tags = {"dc": "z", "x": "y"}

        # dependency fires one alert, suppress in effect
        service.set_firing("TestIsSuppressedD", [[0, 0, 0, alert_tags]])
        service.clear_suppressed(alert_config, alert_tags)
        self.assertTrue(service.is_suppressed(alert_config, alert_tags))

        # dependency still firing: the occurrence threshold (2) is reached,
        # so suppression stops for this tag set
        service.set_firing("TestIsSuppressedD", [[0, 0, 0, alert_tags]])
        service.clear_suppressed(alert_config, alert_tags)
        self.assertFalse(service.is_suppressed(alert_config, alert_tags))

        # a firing in a different DC starts its own suppression window
        # without reviving the exhausted one
        new_alert_tags = {"dc": "w"}
        service.set_firing("TestIsSuppressedD", [[0, 0, 0, new_alert_tags]])
        service.clear_suppressed(alert_config, new_alert_tags)
        self.assertFalse(service.is_suppressed(alert_config, alert_tags))
        self.assertTrue(service.is_suppressed(alert_config, new_alert_tags))

        # once the dependency clears everywhere, nothing stays suppressed
        service.set_firing("TestIsSuppressedD", [])
        service.clear_suppressed(alert_config, [])
        self.assertFalse(service.is_suppressed(alert_config, alert_tags))
        self.assertFalse(service.is_suppressed(alert_config, new_alert_tags))
|
||||
|
||||
if __name__ == "__main__":
    # Run the whole suite when this module is executed directly.
    unittest.main()
|
||||
Reference in New Issue
Block a user