This commit is contained in:
bel
2021-09-12 22:16:11 -06:00
commit ceeb6f0385
129 changed files with 9221 additions and 0 deletions

15
sleeper_agents_aom_engine/.gitignore vendored Executable file
View File

@@ -0,0 +1,15 @@
# Created by .ignore support plugin (hsz.mobi)
### Vagrant template
.vagrant/
.idea/
build/results
logs/
*.pyc
.dockerignore
Dockerfile
build/builder
site-packages.tar.gz
alert_configs
AoM_Configs

3
sleeper_agents_aom_engine/.gitmodules vendored Executable file
View File

@@ -0,0 +1,3 @@
[submodule "AlertOnMetrics"]
path = AoM_Configs
url = ssh://git@gitlab-app.eng.qops.net:10022/sleeper-agents/AlertOnMetrics.git

View File

@@ -0,0 +1,67 @@
#!/usr/bin/env groovy
// Deploy pipeline: no build, tests ran during the merge request; publishes
// and triggers the Rundeck deploy job only on origin/master.
pipeline {
    agent {label 'nomad-builder'}
    environment {
        DOCKER_HOST = '127.0.0.1:2375'
        // Nomad places the Jenkins workspace under the task's alloc dir.
        WORKSPACE_PATH = "/var/lib/nomad/alloc/${NOMAD_ALLOC_ID}/${NOMAD_TASK_NAME}${WORKSPACE}"
    }
    stages {
        stage('Info') {
            steps {
                sh script: 'hostname'
                echo "WORKSPACE_PATH: $WORKSPACE_PATH"
            }
        }
        stage('Build') {
            steps {
                echo "No build required"
            }
        }
        stage('Test') {
            steps {
                echo "Test done during merge request"
                //sh script: 'cd build; ./test_changed.sh "${WORKSPACE_PATH}"'
            }
        }
        stage('Deploy') {
            steps {
                script {
                    if ("$GIT_BRANCH" == "origin/master"){
                        echo "Running publish script"
                        sh script: './publish.sh'
                        echo "Triggering Rundeck job"
                        script {
                            step([$class: 'RundeckNotifier', includeRundeckLogs: true, jobId: 'c5323400-0d97-4488-8cf2-1d736a5f7fb9', nodeFilters: '', options: '', rundeckInstance: 'team-rundeck -- techops', shouldFailTheBuild: true, shouldWaitForRundeckJob: true, tags: '', tailLog: false])
                        }
                    }
                    else {
                        echo "No deploy step required."
                    }
                }
            }
        }
    }
    post {
        success {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test passed, update commit with green checkbox
            }
            // Notify Eng Viz of successful build
            // slackSend color: 'good', message: "Passed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
        failure {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test failed, update commit status with red x
                error("Build failed, check ${BUILD_URL} for details.")
            }
            // On failure send an email to Eng Vis.
            // BUGFIX: Groovy only interpolates ${...} inside double-quoted
            // strings; the single-quoted originals mailed the literal text
            // '${BUILD_URL}' / '${JOB_NAME}'. Also "or details" -> "for details".
            mail body: "Please check ${BUILD_URL} for details.",
                subject: "Jenkins job ${JOB_NAME} build #${BUILD_NUMBER} failed",
                from: 'Jenkins',
                to: 'eng-visibility@qualtrics.com'
            // Finally send a warning message to Eng Vis slack channel
            slackSend color: 'warn', message: "Failed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
    }
}

View File

@@ -0,0 +1,58 @@
#!/usr/bin/env groovy
// Merge-request pipeline: builds the AOM container, runs it once in TEST
// mode, then throws the image away. No deploy happens from an MR.
pipeline {
    agent {label 'nomad-builder'}
    environment {
        DOCKER_HOST = '127.0.0.1:2375'
        // Nomad places the Jenkins workspace under the task's alloc dir.
        WORKSPACE_PATH = "/var/lib/nomad/alloc/${NOMAD_ALLOC_ID}/${NOMAD_TASK_NAME}${WORKSPACE}"
    }
    stages {
        stage('Info') {
            steps {
                sh script: 'hostname'
                echo "WORKSPACE_PATH: $WORKSPACE_PATH"
            }
        }
        stage('Build') {
            steps {
                echo "Building AOM container"
                sh script: 'docker build . -t aom_test_container'
            }
        }
        stage('Test') {
            steps {
                echo "Launching container on test mode. It will take a few minutes."
                sh script: 'docker run -e TEST=true -h $(hostname) --add-host=\"telegraf:$(nslookup jenkins.eng.qops.net|grep Server | awk \'{print $2}\')\" aom_test_container'
                echo "Removing docker image and container"
                sh script: 'docker rmi -f aom_test_container'
            }
        }
        stage('Deploy') {
            steps {
                echo "No deploy step required for Merge Request"
            }
        }
    }
    post {
        success {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test passed, update commit with green checkbox
            }
            // Notify Eng Viz of successful build
            // slackSend color: 'good', message: "Passed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
        failure {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test failed, update commit status with red x
                error("Build failed, check ${BUILD_URL} for details.")
            }
            // On failure send an email to Eng Vis.
            // BUGFIX: Groovy only interpolates ${...} inside double-quoted
            // strings; the single-quoted originals mailed the literal text
            // '${BUILD_URL}' / '${JOB_NAME}'. Also "or details" -> "for details".
            mail body: "Please check ${BUILD_URL} for details.",
                subject: "Jenkins job ${JOB_NAME} build #${BUILD_NUMBER} failed",
                from: 'Jenkins',
                to: 'eng-visibility@qualtrics.com'
            // Finally send a warning message to Eng Vis slack channel
            // slackSend color: 'warn', message: "Failed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
    }
}

View File

@@ -0,0 +1,8 @@
# IMPORTANT NOTICE:
Alert configurations have been moved to
[AlertOnMetrics](https://gitlab-app.eng.qops.net/engvis/AlertOnMetricsConfigs).
This will allow more flexibility for the project. Merge requests will
be automatically validated, merged, and deployed if they pass the
validation stage.

View File

@@ -0,0 +1,240 @@
#!/usr/bin/python3
""" Alert On Metrics Project"""
import logging
import multiprocessing
import json
import base64
import os
import subprocess
from time import time, sleep
import requests
import yaml
import traceback
from sanic import Sanic, response
from library.args import get_service_args
from serviceapp import service
from library.config import glob_the_configs
from library.logger import AlertLogging
# Module-wide logger: console INFO stream, WARNING+ log file, and a
# DEBUG-only console stream.
LOG = AlertLogging('aom')
LOG.start()
LOG.start_log_file("logs/aom_service.log")
LOG.start_debug()
APP = Sanic()
# Cross-process shared counters, visible to the background service job.
SERVICE_JOB = multiprocessing.Value('i', 0)  # non-zero while the background job is running
NUM_JOBS = multiprocessing.Value('i', 0)     # number of live per-alert processes
LEADERSHIP = multiprocessing.Value('i', 0)
# Leader-election state. Only LEADER_OVERRIDE/HOSTNAME/CONSUL_URL are set in
# __main__ today; the rest belongs to the commented-out leader() flow below.
LEADER_STATUS = None
LEADER_TIME = None
CONSUL_URL = None
LEADER_OVERRIDE = None
HOSTNAME = None
SERVICE_CONFIG = None
# move to library
def dict_compare(d1, d2):
    """Diff two dicts by key.

    Returns a tuple ``(added, removed, modified)`` where *added* holds keys
    present only in ``d1``, *removed* holds keys present only in ``d2``, and
    *modified* holds shared keys whose values differ.
    """
    left, right = set(d1), set(d2)
    changed = {key for key in left & right if d1[key] != d2[key]}
    return left - right, right - left, changed
@APP.route("/")
async def index(_):
"""
Return total number of jobs
"""
global NUM_JOBS
return response.json({"job_count": NUM_JOBS.value})
@APP.route('/healthcheck')
async def health(request):
    """
    Flask healthcheck so that consul and friends work, see this as a service
    Returns:
        json object of status: ok
    """
    LOG.debug("healthcheck")
    # NOTE: a fresh Process object is constructed on every healthcheck hit,
    # but it is only started when no background job is marked as running.
    service_process = multiprocessing.Process(target=start_service, \
        args=(LOG, SERVICE_CONFIG['alert_reload_interval']), \
        name="service", daemon=False)
    # TRY TO START SERVICE, IF LEADER AND NOT RUNNING
    if SERVICE_JOB.value == 0:
        LOG.info("Starting alerts background job")
        # Flag is bumped before start() so overlapping healthcheck requests
        # do not double-start the job.
        SERVICE_JOB.value += 1
        service_process.start()#start_service(log)
    return response.json({"status": "ok"}, 200)
# @APP.route("/override")
# async def override(request):
# """
# Sets the LEADER_OVERRIDE global parameter to force an override
# """
# global LEADER_OVERRIDE
# if request.args.get('enable') == 'true':
# LEADER_OVERRIDE = True
# elif request.args.get('enable') == 'false':
# LEADER_OVERRIDE = False
# return response.json({"override": LEADER_OVERRIDE})
# def leader():
# """
# Needs to be implemented that goes out to consul and checks if node is leader,
# or if there is no leader volunteers itself.
# Returns:
# bool of True or False.... once the logic gets worked out
# """
# global LEADER_STATUS, LEADER_TIME
# # CHECK IF THERE IS AN ARGUMENT FOR OVERRIDING THE CHECK LEADER
# if LEADER_OVERRIDE is True:
# return True
# # CHECK IF LEADER_TIME IS SET AND THAT IT'S LESS THAN 30 SECONDS FROM LAST SET
# if LEADER_TIME is None or time() - LEADER_TIME > 60:
# LOG.info("Cache has expired or was not set")
# box_hostname = os.environ['HOSTNAME'] if HOSTNAME is None else HOSTNAME
# LOG.info("Getting Leader Election status")
# # RIGHT NOW IN THE CONFIG THIS IS HARD SET TO THE CONSUL1-APP.ENG.OPS.NET
# try:
# r = requests.get(CONSUL_URL, timeout=60)
# assert r.status_code == 200, "Failed to get back a 200 from consul."
# LOG.info("Verify that the Value is {}".format(box_hostname))
# # THE VALUE BACK IS A BASE64 ENCODED BYTE, THAT NEEDS TO BE DECODED,
# # TURNED TO A STRING, THEN TO A DICT
# value = json.loads(base64.b64decode(r.json()[0]['Value']).decode('utf-8'))
# # CACHE THE VALUE AND TIMESTAMP
# if value['HostName'] == box_hostname:
# LEADER_STATUS = True
# LEADER_TIME = time()
# return True
# else:
# return False
# except TimeoutError:
# LOG.error("Timed out connecting to Consul")
# return LEADER_STATUS
# else:
# return LEADER_STATUS
def start_service(log, reload_interval):
    """
    Background job: reloads alert configs in a loop and keeps one worker
    process alive per alert config owned by this host.

    Args:
        log: logger shared with the web app
        reload_interval: seconds to sleep between config reloads
    Returns:
        False when the job exits (shutdown / lost-leadership path).
    """
    jobs = []
    alert_list = []
    alert_hash = {}
    box_hostname = os.environ['HOSTNAME'] if HOSTNAME is None else HOSTNAME
    # Presence of any TEST env var flips the workers into non-production mode.
    production_mode = not "TEST" in os.environ
    # WAIT FOR LEADER ELECTION TO PASS
    # while not leader():
    #     return False
    # # GLOB ALL THE CONFIG FILES TO BUILD POOL OF ALERTS
    log.info("Waiting 15s for Consul service to pass")
    #sleep(15)
    while True:
        try:
            alert_list = glob_the_configs(SERVICE_CONFIG['alert_folder'],
                                          SERVICE_CONFIG['alert_routing_lookup'],
                                          'http://consul.service.consul:8500',
                                          box_hostname, log)
        except Exception:
            log.error("Failed to load config files: {}".format(traceback.format_exc()))
        log.info("Found {} alerts".format(len(alert_list)))
        # De-duplicate by alert id; the first config seen wins.
        new_alert_hash = {}
        for alert_config in alert_list:
            if alert_config['id'] in new_alert_hash:
                log.info("Duplicate alert id found: {}. "
                         "Ignoring one of them.".format(alert_config['id']))
            else:
                new_alert_hash[alert_config['id']] = alert_config
        added, removed, modified = dict_compare(new_alert_hash, alert_hash)
        log.info("Added alerts {}".format(added))
        log.info("Removed alerts {}".format(removed))
        log.info("Modified alerts {}".format(modified))
        # PROCESSES TO KILL: anything removed, plus modified ones (which are
        # restarted below with the new config).
        for al_config in removed.union(modified):
            position = None
            for i, job in enumerate(jobs):
                if job.name == al_config and job.is_alive():
                    position = i
                    break
            if position is None:
                # BUGFIX: previously jobs[None] raised TypeError when no live
                # process matched the config; skip instead.
                log.info("No running process found for config: {}".format(al_config))
                continue
            log.info("Stopping config: {}".format(jobs[position].name))
            # kill -9 instead of job.terminate(): terminate() stops the server.
            subprocess.call(["/bin/kill", "-9", "{}".format(jobs[position].pid)])
            jobs[position].join()
            NUM_JOBS.value -= 1
            log.info("Process stopped succesfully")
            jobs.pop(position)
        # PROCESSES TO START, ordered by resolved-dependency count so alerts
        # with fewer dependencies start first.
        # BUGFIX: the sorted list was previously computed and then ignored —
        # the loop iterated the unsorted set.
        alert_configurations = sorted(
            added.union(modified),
            key=lambda x: len(new_alert_hash[x].get('resolvedDependencies').getDependencies()))
        for al_config in alert_configurations:
            if new_alert_hash[al_config].get('query_type') == 'prometheus':
                p = multiprocessing.Process(target=service.check_prometheus_alert,
                                            args=(new_alert_hash[al_config], SERVICE_CONFIG, log, production_mode),
                                            name=al_config, daemon=True)
            else:
                p = multiprocessing.Process(target=service.check_kairosdb_alert,
                                            args=(new_alert_hash[al_config], SERVICE_CONFIG, log, production_mode),
                                            name=al_config, daemon=True)
            jobs.append(p)
            log.info("Starting new config: {}".format(p.name))
            p.start()
            NUM_JOBS.value += 1
        # store current list
        alert_hash = new_alert_hash.copy()
        log.info("Total number of jobs: {}".format(NUM_JOBS.value))
        service.send_stat('total_jobs', NUM_JOBS.value, dict(), statprefix='aom')
        if added:
            service.send_stat('new_jobs', len(added), dict(), statprefix='aom')
        if modified:
            service.send_stat('modified_jobs', len(modified), dict(), statprefix='aom')
        if removed:
            service.send_stat('removed_jobs', len(removed), dict(), statprefix='aom')
        sleep(reload_interval)
    # Unreachable today (the loop never breaks); kept for the planned
    # leader-election flow.
    log.info("No longer leader. Exiting alerts background job")
    for job in jobs:
        # job.terminate() causes the server to stop
        subprocess.call(["/bin/kill", "-9", "{}".format(job.pid)])
        NUM_JOBS.value -= 1
    SERVICE_JOB.value = 0
    return False
if __name__ == "__main__":
# GET ARGS AND START LOGGING
ARGS = get_service_args()
logging.setLoggerClass(AlertLogging)
LOG.info("Starting Service")
# GET SERVICE CONFIG
LEADER_OVERRIDE = ARGS['override']
HOSTNAME = ARGS['hostname']
SERVICE_CONFIG = yaml.safe_load(open('service.yaml', 'r').read())
if ARGS['alert_configs'] is not None:
SERVICE_CONFIG['alert_folder'] = ARGS['alert_configs']
if ARGS['alert_routing_lookup'] is not None:
SERVICE_CONFIG['alert_routing_lookup'] = ARGS['alert_routing_lookup']
# SET CONSUL URL FOR LEADER CHECK
CONSUL_URL = SERVICE_CONFIG['consul_url']
# START THE MAIN SERVICE
APP.run(host="0.0.0.0", port=ARGS['port'])

View File

@@ -0,0 +1,121 @@
import json
import time
import requests
import yaml
# Load the service config to locate the KairosDB endpoint.
# BUGFIX: yaml.load() without an explicit Loader is unsafe and deprecated;
# use safe_load, and close the file handle via a context manager.
with open('service.yaml', 'r') as _service_file:
    service_config = yaml.safe_load(_service_file.read())
kairos_url = service_config['kairosdb_url'] + "api/v1/datapoints/"
kairos_query = kairos_url + "query"
# Datapoints posted to KairosDB; filled in by main().
metrics_list = []
# Alert states whose emitted counters are verified at the end of the run.
status1 = "RECOVERY"
status2 = "WARNING"
status3 = "CRITICAL"
# JSON envelope wrapped around the datapoint list posted to KairosDB.
json_string1 = """{"name": "aom_test_metric","datapoints": """
json_string2 = ""","tags": {"host": "aom_host","data_center": "AOM"},"ttl": 500}"""
# WRITE ALERT CONFIG FILE
alert_file = {'alerts': {'sensu': {'slack': 'aom_test_channel'}},
              'critical_lower_threshold': 100,
              'critical_upper_threshold': 5000,
              'id': 'test_metric',
              'interval': 30,
              'occurrences_threshold': 1,
              'query': {'cache_time': 0,
                        'end_relative': {'unit': 'seconds', 'value': '30'},
                        'metrics': [{'name': 'aom_test_metric', 'tags': {}}],
                        'start_relative': {'unit': 'seconds', 'value': '60'}},
              'tags': {},
              'url': 'AOM_TESTING',
              'warning_lower_threshold': 1000,
              'warning_upper_threshold': 2000}
# Query template: <query_intro><STATUS><query_outro> sums the
# telegraf.aom_<STATUS>_value counter emitted for the test alert.
query_intro = """{
    "metrics": [
        {
            "tags": {
                "alert": [
                    "test_metric"
                ]
            },
            "name": "telegraf.aom_"""
query_outro = """_value",
            "aggregators": [
                {
                    "name": "sum",
                    "align_sampling": true,
                    "sampling": {
                        "value": "9",
                        "unit": "minutes"
                    },
                    "align_start_time": false
                }
            ]
        }
    ],
    "cache_time": 0,
    "start_relative": {
        "value": "8",
        "unit": "minutes"
    }
}"""
def main():
    """End-to-end smoke test: write a test alert config, push known
    datapoints into KairosDB, wait for AOM to evaluate them, then check the
    RECOVERY/WARNING/CRITICAL counters it emitted.

    Returns:
        True on success, False on any failure.
    """
    # noinspection PyBroadException
    try:
        with open('alert_configs/test.yaml', 'w') as yaml_file:
            yaml.dump(alert_file, yaml_file, default_flow_style=False)
    except Exception:
        print("Error writing alert config file")
        return False
    # Seven datapoints ~32s apart, chosen to cross the warning (1000/2000)
    # and critical (100/5000) thresholds in a known sequence.
    now = int(time.time() * 1000)
    for value in (1501, 202, 23, 1504, 2005, 5006, 1507):
        metrics_list.append([now, value])
        now += 32000
    full_string = json_string1 + str(metrics_list) + json_string2
    try:
        ret = requests.post(kairos_url, data=json.dumps(json.loads(full_string)), timeout=200)
        assert ret.status_code == 204, "Wrong status code received from KairosDB"
    except AssertionError as e:
        print("Error: {}".format(str(e)))
        # BUGFIX: previously fell through and kept running after a failed write.
        return False
    except Exception as e:
        print("Problem talking to KairosDB: {}".format(str(e)))
        return False
    print("Metrics sent to KairosDB. Check alerts in the #aom_test_channel in Slack")
    # Give AOM time to evaluate the alert several times.
    time.sleep(360)
    try:
        ret = requests.post(kairos_query, data=json.dumps(json.loads(query_intro + status1 + query_outro)), timeout=200)
        print("Recovery {}".format(dict(ret.json())['queries'][0]['results'][0]['values'][0][1]))
        assert dict(ret.json())['queries'][0]['results'][0]['values'][0][1] == 2, "Wrong RECOVERY result"
        ret = requests.post(kairos_query, data=json.dumps(json.loads(query_intro + status2 + query_outro)), timeout=200)
        print("Warning {}".format(dict(ret.json())['queries'][0]['results'][0]['values'][0][1]))
        assert dict(ret.json())['queries'][0]['results'][0]['values'][0][1] == 2, "Wrong WARNING result"
        ret = requests.post(kairos_query, data=json.dumps(json.loads(query_intro + status3 + query_outro)), timeout=200)
        print("Critical {}".format(dict(ret.json())['queries'][0]['results'][0]['values'][0][1]))
        assert dict(ret.json())['queries'][0]['results'][0]['values'][0][1] == 4, "Wrong CRITICAL result"
    except AssertionError as e:
        print("Error: {}".format(str(e)))
        # BUGFIX: previously returned True even when the results were wrong.
        return False
    except Exception as e:
        print("Problem getting results from KairosDB: {}".format(str(e)))
        return False
    return True


if __name__ == '__main__':
    main()

View File

View File

@@ -0,0 +1,163 @@
# Contains the arg parser options.
"""Contains the arg parser options."""
import argparse
import sys
def get_builder_args():
    """
    Gets the arguments passed in to the aom_builder main call.

    :return: dict of parsed arguments (via args_to_dict)
    """
    parser = argparse.ArgumentParser(
        description="Generates a valid yaml file "
        "for alerting on metrics. If you are "
        "familiar with the yaml structure for an "
        "alert you don't have to use this builder,"
        " it's just convenient")
    # BUGFIX: several help strings below had dropped/garbled words
    # ("Kariosdb", "the check will This value", "cause an depending",
    # "logicUse" from a missing separator).
    parser.add_argument('-q', '--query', help="The Kairosdb query string to "
                        "use")
    parser.add_argument(
        '-i', '--interval', type=int, default=60, help="The "
        "interval that the check will run on. This value is in seconds")
    parser.add_argument('-t', '--threshold', '--upperthreshold', help="The "
                        "upper threshold is the value that when reached will "
                        "cause an alert depending on the threshold logic. "
                        "Use in conjunction with lower threshold to define a "
                        "normal band.")
    parser.add_argument(
        '-b',
        '--lowerthreshold',
        help="The lower threshold is the value that when reached will cause an "
        "alert depending on the threshold logic. "
        "Use in conjunction with upper threshold to define a normal band.")
    parser.add_argument(
        '-m',
        '--measure',
        choices=[
            'gt',
            'lt',
            'eq'],
        help="The measure to use to compare the "
        "threshold to the values of the alerts")
    parser.add_argument(
        '-a',
        '--alert_config',
        help='A valid Yaml representation of your alerting block')
    parser.add_argument(
        '-l',
        '--log_level',
        type=int,
        default=0,
        help="The log level for the aom_builder run. "
        "[0=Error, 1=Info, 2=Debug]")
    parser.add_argument(
        '-p',
        '--port',
        type=int,
        default=8080,
        help="The port to run the webapp on")
    return args_to_dict(parser)
def get_tester_service_args():
    """
    Builds the argument parser for aom_tester.py and returns the parsed
    arguments as a dict (via args_to_dict).
    """
    parser = argparse.ArgumentParser(
        description="Parameters to start the alerting on metrics dummy tester "
                    "service")
    parser.add_argument(
        '-l', '--log_level', type=int, default=0,
        help="The log level for the aom_service app"
             "[0=Error, 1=Info, 2=Debug]")
    parser.add_argument(
        '-a', '--alert_configs', default=None,
        help="If provided will override the folder location read from the "
             "config with the value passed in. Is helpful for testing and "
             "troubleshooting alerts")
    parser.add_argument(
        '--hostname',
        help="If provided, will override the actual hostname check with this "
             "value")
    parser.add_argument(
        '-p', '--port', type=int, default=8080,
        help="The port to run the webapp on")
    return args_to_dict(parser)
def get_service_args():
    """
    Builds the argument parser for aom_service.py and returns the parsed
    arguments as a dict (via args_to_dict).
    """
    parser = argparse.ArgumentParser(
        description="Parameters to start the alerting on metrics service")
    # (flags, keyword arguments) for every supported option, in help order.
    option_table = [
        (('-l', '--log_level'),
         dict(type=int, default=0,
              help="The log level for the aom_service app"
                   "[0=Error, 1=Info, 2=Debug]")),
        (('-a', '--alert_configs'),
         dict(default=None,
              help="If provided will override the folder location read from the "
                   "config with the value passed in. Is helpful for testing and "
                   "troubleshooting alerts")),
        (('--alert_routing_lookup',),
         dict(default=None,
              help="If provided will override the folder used to fetch the alerts "
                   "lookup configuration.")),
        (('-o', '--override'),
         dict(action='store_true',
              help="Overrides the check leader election value")),
        (('--hostname',),
         dict(help="If provided, will override the actual hostname check with this "
                   "value")),
        (('-p', '--port'),
         dict(type=int, default=8080,
              help="The port to run the webapp on")),
    ]
    for flags, kwargs in option_table:
        parser.add_argument(*flags, **kwargs)
    return args_to_dict(parser)
def args_to_dict(parsed_args):
    """
    Converts the argument parser object to a dict.

    Args:
        parsed_args: an argparse.ArgumentParser ready to parse sys.argv
    Returns:
        Dictionary mapping each argument name to its parsed value.
    """
    try:
        namespace = parsed_args.parse_args()
    except argparse.ArgumentError:
        # On a parser error show the usage text and bail out.
        parsed_args.print_help()
        sys.exit(1)
    # RETURN A DICT OF ARGUMENTS
    return {name: getattr(namespace, name) for name in vars(namespace)}

View File

@@ -0,0 +1,277 @@
# config.py
"""Functions for loading alert configuration files"""
import glob
import os
import json
import hashlib
import yaml
import requests
import traceback
# import logging
# logger = logging.getLogger(__name__)
DEPENDENCIES_KEY = 'dependencies'
class AlertWithDependencies:
    """Book-keeping node for one alert and its (transitively resolved)
    dependency ids, used while walking the dependency graph."""

    def __init__(self, alertId, dependencies):
        self.alertId = alertId
        # Visit marker used by the dependency-graph walk.
        self.beenProcessed = False
        self.dependencies = [] if dependencies is None else list(dependencies)

    def addAllDependencies(self, moreDependencies):
        """Append every id from *moreDependencies*; None is a no-op."""
        if moreDependencies is not None:
            self.dependencies += moreDependencies

    def getDependencies(self):
        return self.dependencies

    def getAlertId(self):
        return self.alertId

    def hasBeenProcessed(self):
        return self.beenProcessed

    def visit(self):
        """Mark this alert as processed by the graph walk."""
        self.beenProcessed = True
def md5(fname):
    """Return the hex MD5 digest of the file at *fname*, read in 4 KiB chunks."""
    digest = hashlib.md5()
    with open(fname, "rb") as handle:
        chunk = handle.read(4096)
        while chunk:
            digest.update(chunk)
            chunk = handle.read(4096)
    return digest.hexdigest()
def get_healthy_nodes_and_index(consul_url, hostname, logger):
    """Find AOM healthy nodes on consul.

    Queries the consul catalog for every node registered for the
    alert-on-metrics service, keeps those whose AOM healthcheck is passing,
    and returns (index of *hostname* in the sorted healthy list, healthy
    count). The index is -1 when this host is unhealthy or consul times out.
    """
    # BUGFIX: initialise before the try block — previously a consul timeout
    # before these assignments made the return statement raise NameError.
    host_index = -1
    healthy_nodes = []
    try:
        # getting all registered nodes from consul
        r = requests.get(
            consul_url +
            '/v1/catalog/service/alert-on-metrics',
            timeout=60)
        assert r.status_code == 200, "Failed to get back a 200 from consul catalog"
        node_list = [elem.get('Node') for elem in json.loads(r.text)]
        # Retrieving healthy nodes
        for node in node_list:
            r2 = requests.get(
                consul_url +
                '/v1/health/node/' +
                node,
                timeout=60)
            # BUGFIX: previously re-asserted on the catalog response (r)
            # instead of the health response (r2).
            assert r2.status_code == 200, "Failed to get back a 200 from consul health"
            for check in json.loads(r2.text):
                if (check.get('CheckID') == 'check_healthcheck_alert-on-metrics_alert-on-metrics' and
                        check.get('Status') == 'passing'):
                    healthy_nodes.append(node)
        try:
            healthy_nodes.sort()
            host_index = healthy_nodes.index(hostname)
        except ValueError:
            logger.error("Host is not healthy")
    except TimeoutError:
        # NOTE(review): requests raises requests.exceptions.Timeout, which is
        # not a TimeoutError subclass — confirm this handler ever fires.
        logger.error("Timed out connecting to Consul")
    return host_index, len(healthy_nodes)
def distribute_configs(filename, host_index, module, logger):
    """Uses md5 of alert config to split the files among healthy servers.

    The file's md5 (mod the number of healthy nodes) picks a stable owner;
    returns True when that owner is this host.
    """
    if module == 0:
        logger.error("No healthy nodes for the service")
        return False
    if host_index == -1:
        logger.error("Host is unhealthy")
        return False
    # Same file always hashes to the same healthy node.
    return int(md5(filename), 16) % module == host_index
def is_valid(alert_config, logger):
    """Checks if alert has all required fields"""
    # Validation is assertion-driven: the first failing assert aborts the
    # check and its message is logged below. Any other exception (e.g.
    # KeyError from a missing field) also marks the config invalid.
    try:
        assert alert_config['alerts'], "No Alerts configured, this is a dead config"
        assert alert_config['query'], "No Query, this is a dead config"
        #assert alert_config['interval'] >= 30, "Intervals less than 30 are invalid"
        assert alert_config['id'], "Alert ID is empty, this is a dead config"
        if DEPENDENCIES_KEY in alert_config:
            assert isinstance(alert_config[DEPENDENCIES_KEY], list), "Dependencies is specified but isn't a list"
        if alert_config.get('query_type') == 'prometheus':
            # Prometheus queries are plain query strings.
            assert isinstance(
                alert_config['query'], str), "Invalid Prometheus query"
        else:
            # KairosDB queries are JSON-like dicts.
            assert isinstance(
                alert_config['query'], dict), "Kairosdb Query string cannot be validated as proper JSON"
            defined_tags = set(alert_config['query']['metrics'][0]['tags'].keys()).union(
                {'', 'dc', 'fqdn'})
            # IF THERE IS AGGREGATION WE HAVE TO ADD THESE TAGS
            if 'group_by' in alert_config['query']['metrics'][0]:
                defined_tags.update(
                    set(alert_config['query']['metrics'][0]['group_by'][0]['tags']))
            # NOTE(review): defined_tags is only consumed by the commented-out
            # warning below; it is currently computed and discarded.
            # for undefined_tag in set(alert_config['tags']).difference(defined_tags):
            #     print("WARNING! {} tag is not defined on the query. Please make sure it does exist to "\
            #         "prevent empty results".format(undefined_tag))
        # OUR MINIMUM THRESHOLD NEED
        assert 'critical_lower_threshold' in alert_config or 'critical_upper_threshold' in alert_config or \
            'warning_lower_threshold' in alert_config or 'warning_upper_threshold' in alert_config, \
            "Config must have at least one threshold set."
        # JUST MAKE SURE YOU ARE NOT DOING SOMETHING STUPID WITH WARNING COMING
        # AFTER CRITICAL
        if 'warning_lower_threshold' in alert_config and 'critical_lower_threshold' in alert_config:
            assert alert_config['critical_lower_threshold'] < alert_config['warning_lower_threshold'], \
                "Lower Critical must be less than Lower Warning"
        if 'warning_upper_threshold' in alert_config and 'critical_upper_threshold' in alert_config:
            assert alert_config['critical_upper_threshold'] > alert_config['warning_upper_threshold'], \
                "Upper Critical must be greater than Upper Warning"
        # Routing-lookup blocks need a default route, a lookup source
        # (inline or file) and string tag names.
        if 'lookup' in alert_config['alerts']:
            assert 'default' in alert_config['alerts']['lookup'], 'No default alert configured for the lookup configuration'
            assert 'lookup_file' in alert_config['alerts']['lookup'] or 'lookups' in alert_config['alerts'][
                'lookup'], 'No lookup configured either in the alert configuration or in a separated file'
            assert 'tags' in alert_config['alerts']['lookup'], 'No tags configured for the lookup configuration'
            assert all(
                isinstance(
                    tag, str) for tag in alert_config['alerts']['lookup']['tags']), 'Tags must be valid string'
        # if 'occurrences_threshold' in alert_config:
        #     assert alert_config['occurrences_threshold'] >= 1, \
        #         "Having an occurrences value less than 2 is assumed and pointless to specify"
    except Exception as e:
        logger.warning("Invalid config file: {}".format(str(e)))
        return False
    return True
def is_valid_alert_routing_lookup(alert_routing_lookup, alert, logger):
    """Check if routing lookup is properly configured.

    Every entry must carry an 'alert' block and a 'tags' mapping whose keys
    are string tags declared in the alert's own lookup configuration.
    Returns True when all entries validate; logs a warning and returns False
    otherwise.
    """
    try:
        assert alert_routing_lookup, "No lookup values configured, the configuration is empty."
        declared_tags = None
        for entry in alert_routing_lookup:
            assert 'alert' in entry, "No alert defined for this configuration."
            assert 'tags' in entry, "No tags value defined for this configuration."
            if declared_tags is None:
                declared_tags = alert['alerts']['lookup']['tags']
            for tag in entry['tags']:
                assert tag in declared_tags, "The tag {} is not part of the configuration".format(
                    tag)
            assert all(isinstance(tag, str)
                       for tag in entry['tags']), "Tags must be valid string"
    except AssertionError as err:
        logger.warning("Invalid alert routing config file: {}".format(str(err)))
        return False
    return True
# noinspection PyBroadException
def glob_the_configs(
        config_path,
        lookup_config_path,
        consul_url,
        hostname,
        logger):
    """
    Load, validate and distribute every alert config under *config_path*.

    Args:
        config_path (string): relative path to the configs
        lookup_config_path (string): folder holding routing-lookup files
        consul_url (string): url to consul service
        hostname (string): this host's name, used for config distribution
        logger:
    Returns:
        List of configs owned by this host, each annotated with
        'alert_routing_lookup' (when a lookup is configured) and
        'resolvedDependencies' (an AlertWithDependencies node).
    """
    invalid_configs = 0
    alert_list = []
    # Work out which slice of the configs this host owns.
    host_index, module = get_healthy_nodes_and_index(
        consul_url, hostname, logger)
    alertToAlertWithDependencies = {}
    for config_file in glob.glob(config_path + "/**/*.yaml", recursive=True):
        logger.debug("Found {} config".format(config_file))
        # LOAD CONFIG
        if distribute_configs(
                config_file,
                host_index,
                module,
                logger):
            try:
                alert = yaml.safe_load(open(config_file, 'rb').read())
                if is_valid(alert, logger):
                    if 'lookup' in alert['alerts']:
                        alert_routing_lookup = []
                        is_valid_lookup = True
                        if 'lookup_file' in alert['alerts']['lookup']:
                            # Routing table lives in a separate file under
                            # lookup_config_path.
                            lookup_path = "{}/{}".format(
                                lookup_config_path, alert['alerts']['lookup']['lookup_file'])
                            if os.path.isfile(lookup_path):
                                alert_routing_lookup = yaml.safe_load(
                                    open(lookup_path, 'rb').read())
                            else:
                                is_valid_lookup = False
                        else:
                            # Routing table is inlined in the alert config.
                            alert_routing_lookup = alert['alerts']['lookup']['lookups']
                        is_valid_lookup = is_valid_lookup and is_valid_alert_routing_lookup(
                            alert_routing_lookup, alert, logger)
                        if is_valid_lookup:
                            # Index routing entries by their tag-value tuple
                            # (in declared tag order) for direct lookup later.
                            alerts_per_tags = {}
                            for alert_configuration in alert_routing_lookup:
                                key = []
                                for tag in alert['alerts']['lookup']['tags']:
                                    key.append(
                                        alert_configuration['tags'].get(tag))
                                alerts_per_tags[tuple(
                                    key)] = alert_configuration['alert']
                            alert['alert_routing_lookup'] = alerts_per_tags
                        else:
                            invalid_configs += 1
                            continue
                    alertWithDependencies = AlertWithDependencies(alert['id'], alert[DEPENDENCIES_KEY] if DEPENDENCIES_KEY in alert else None)
                    alertToAlertWithDependencies[alert['id']] = alertWithDependencies
                    alert['resolvedDependencies'] = alertWithDependencies
                    alert_list.append(alert)
                else:
                    invalid_configs += 1
            except BaseException:
                logger.error("Error parsing {} config: {}".format(config_file, traceback.format_exc()))
    # validate the dependencies and flesh out the dependency graphs
    logger.debug("Iterating over dependencies")
    for alertId, alertWithDependencies in alertToAlertWithDependencies.items():
        validateDependencies(alertId, alertWithDependencies, alertToAlertWithDependencies, logger)
    logger.info("Invalid configs: {}".format(invalid_configs))
    # NOTE(review): imported locally — presumably to avoid a circular import
    # with serviceapp.service; confirm before moving to the top of the file.
    from serviceapp import service
    service.send_stat(
        'invalid_configs',
        invalid_configs,
        dict(),
        statprefix='aom')
    logger.info("Loaded {} configs".format(len(alert_list)))
    return alert_list
def validateDependencies(alertId, alertWithDependencies, allAlerts, logger):
    """
    Recursively resolve *alertId*'s dependency ids in place.

    Unknown dependency ids are logged and skipped; known ones are resolved
    depth-first and their transitive dependencies are appended to this
    alert's list. Returns the (possibly extended) dependency list, or None
    when the alert has no dependencies.
    """
    if len(alertWithDependencies.getDependencies()) > 0:
        if not alertWithDependencies.hasBeenProcessed():
            # Mark before recursing so dependency cycles terminate.
            alertWithDependencies.visit()
            # Snapshot the list: addAllDependencies below grows the real one
            # while we are still iterating.
            dependencies = list(alertWithDependencies.getDependencies())
            for dependentId in dependencies:
                if dependentId not in allAlerts:
                    logger.info("Invalid dependency of {}: {}".format(alertId, dependentId))
                else:
                    alertWithDependencies.addAllDependencies(validateDependencies(dependentId, allAlerts[dependentId], allAlerts, logger))
        # Already-processed alerts return their cached list so siblings that
        # share a dependency still pick up its transitive closure.
        logger.debug("returning alert {} with dependencies {}".format(alertId, alertWithDependencies.getDependencies()))
        return alertWithDependencies.getDependencies()
    else:
        return None

View File

@@ -0,0 +1,122 @@
# logger.py
""" Logging configuration """
import logging
import logging.handlers
import os
# Quiet noisy third-party loggers: only ERROR and above get through.
logging.getLogger('requests').setLevel(logging.ERROR)
logging.getLogger('urllib3').setLevel(logging.ERROR)
logging.getLogger('werkzeug').setLevel(logging.ERROR)
class SingleLevelFilter(logging.Filter):
    """Logging filter keyed to exactly one level.

    With ``reject=False`` only records at *passlevel* pass the filter; with
    ``reject=True`` every record except those at *passlevel* passes.
    """

    def __init__(self, passlevel, reject):
        """Store the target level and whether to invert the match."""
        self.passlevel = passlevel
        self.reject = reject

    def filter(self, record):
        """Return True when *record* should be emitted by the handler."""
        matches = record.levelno == self.passlevel
        return not matches if self.reject else matches
class AlertLogging(logging.Logger):
    """
    Class Object to handle the logging of the alert on metrics service
    starts at Error level and can flip on (and add) an additional log file and
    Debug logger as needed.
    """
    def __init__(self, name):
        """
        Inits the formaters and logger
        Args:
            name: logger name, passed through to logging.Logger
        """
        self.name = name
        # Verbose format (module:line) used by the debug and file handlers.
        self.debug_formatter = logging.Formatter(
            "%(asctime)s - [%(levelname)s] - [%(module)s:%(lineno)d] - "
            "%(message)s", "%m-%d %H:%M:%S")
        self.standard_formatter = logging.Formatter(
            "%(asctime)s - [%(levelname)s] - %(message)s", "%m-%d %H:%M:%S")
        # NOTE(review): this getLogger() call discards its result —
        # presumably it only forces the root logger to exist; confirm.
        logging.getLogger()
        logging.Logger.__init__(self, name, logging.DEBUG)
        # Side effect: after constructing one AlertLogging, subsequent
        # logging.getLogger() calls also produce AlertLogging instances.
        logging.setLoggerClass(AlertLogging)
    def start(self):
        """
        Attach an INFO-level console handler with the standard format.
        Returns:
            self, so calls can be chained.
        """
        info_handler = logging.StreamHandler()
        info_handler.setLevel(logging.INFO)
        info_handler.setFormatter(self.standard_formatter)
        self.addHandler(info_handler)
        return self
    def start_log_file(self, file_path, mode='a'):
        """
        Creates a separate log file handler
        Args:
            file_path: path to the log file
            mode: the type of mode to open the file handler with
        Returns:
            None
        """
        self.log_path = file_path
        # Create the log directory on demand.
        work_folder = os.path.dirname(file_path)
        if work_folder and not os.path.exists(work_folder):
            os.makedirs(work_folder)
        self.log_handler = logging.FileHandler(file_path, mode)
        # File output only records WARNING and above.
        self.log_handler.setLevel(logging.WARNING)
        self.log_handler.setFormatter(self.debug_formatter)
        self.addHandler(self.log_handler)
    def stop_log_file(self):
        """
        Closes Log file and sets the handler to None
        Returns:
            None
        """
        self.log_handler.close()
        self.removeHandler(self.log_handler)
        self.log_handler = None
    def start_debug(self):
        """
        Attach a console handler that emits only DEBUG-level records
        (filtered via SingleLevelFilter).
        Returns:
            None
        """
        self.debug_handler = logging.StreamHandler()
        self.debug_handler.setLevel(logging.DEBUG)
        self.debug_handler.addFilter(SingleLevelFilter(logging.DEBUG, False))
        self.debug_handler.setFormatter(self.debug_formatter)
        self.addHandler(self.debug_handler)
    def stop_debug(self):
        """
        stop the debugger
        Returns:
            None
        """
        self.removeHandler(self.debug_handler)
        self.debug_handler = None

View File

@@ -0,0 +1,83 @@
from datetime import datetime, timedelta
from urllib.parse import urljoin
import requests
class PromAPI:
    """Minimal client for the Prometheus HTTP API (query / query_range / series)."""

    def __init__(self, endpoint='http://127.0.0.1:9090/'):
        """
        :param endpoint: base address of the Prometheus server
        """
        self.endpoint = endpoint

    @staticmethod
    def _to_timestamp(input_):
        """
        Convert *input_* to an ISO-8601 UTC timestamp string for Prometheus.

        Accepts a datetime, the literal 'now', or a number/numeric string of
        seconds: positive = absolute UNIX time, 0 = now, negative = seconds
        in the past relative to now.

        BUGFIX: datetime and positive-number inputs previously returned a
        float, so the callers' ``+ 'Z'`` concatenation raised TypeError;
        every branch now returns a string.
        """
        if type(input_) == datetime:
            return input_.isoformat('T')
        if input_ == 'now':
            return datetime.utcnow().isoformat('T')
        if type(input_) is str:
            input_ = float(input_)
        if type(input_) in [int, float]:
            if input_ > 0:
                # Absolute UNIX timestamp.
                return datetime.utcfromtimestamp(input_).isoformat('T')
            if input_ == 0:  # return now
                return datetime.utcnow().isoformat('T')
            if input_ < 0:
                # Relative offset into the past.
                return (datetime.utcnow() + timedelta(seconds=input_)).isoformat('T')

    def query(self, query='prometheus_build_info'):
        """Run an instant query and return the decoded JSON response."""
        return self._get(
            uri='/api/v1/query',
            params=dict(
                query=query
            )
        )

    def query_range(self, query='prometheus_build_info', start=-60, end='now', duration=60):
        """Run a range query; start/end accept anything _to_timestamp does,
        duration is the step in seconds."""
        params = {
            'query': query
        }
        if end is not None:
            params['end'] = self._to_timestamp(end) + 'Z'
        if start:
            params['start'] = self._to_timestamp(start) + 'Z'
        if duration:
            params['step'] = duration
        print(params)
        return self._get(
            uri='/api/v1/query_range',
            params=params
        )

    def series(self, match='prometheus_build_info', start=-86400, end='now'):
        """List the series matching *match* over the given time window."""
        params = {
            'match[]': match
        }
        if end is not None:
            params['end'] = self._to_timestamp(end) + 'Z'
        if start:
            params['start'] = self._to_timestamp(start) + 'Z'
        print(params)
        return self._get(
            uri='/api/v1/series',
            params=params
        )

    def _get(self, uri, params, method='GET'):
        """Issue the HTTP GET against the endpoint and decode the JSON body."""
        url = urljoin(self.endpoint, uri)
        assert method == 'GET'
        result = requests.get(
            url=url,
            params=params
        )
        return result.json()

View File

@@ -0,0 +1,47 @@
import unittest
import config
class TestAlertWithDependencies(unittest.TestCase):
    """Builds a small dependency graph (A->C, B->C, C->D, D) and checks that
    config.validateDependencies resolves the expected dependency counts."""

    def test_base(self):
        self.alertToAlertWithDependencies = {}
        self.alert_list = []
        for alert_id, deps in (("A", ["C"]), ("B", ["C"]), ("C", ["D"]), ("D", None)):
            self.make_alert(alert_id, deps)
        self.validate()
        for alert_id, expected in (("A", 2), ("B", 2), ("C", 1), ("D", 0)):
            self.checkDepLen(alert_id, expected)

    def make_alert(self, id, depends):
        """Register an alert dict plus its AlertWithDependencies wrapper."""
        alert = {
            'id': id,
            'dependencies': depends
        }
        wrapper = config.AlertWithDependencies(
            alert['id'], alert.get(config.DEPENDENCIES_KEY))
        self.alertToAlertWithDependencies[alert['id']] = wrapper
        alert['resolvedDependencies'] = wrapper
        self.alert_list.append(alert)

    def validate(self):
        """Run config.validateDependencies over every registered alert."""
        for alert_id, wrapper in self.alertToAlertWithDependencies.items():
            config.validateDependencies(
                alert_id, wrapper, self.alertToAlertWithDependencies, MockLogger())

    def checkDepLen(self, id, n):
        """Assert the resolved dependency count for alert `id`."""
        self.assertEqual(
            len(self.alertToAlertWithDependencies[id].getDependencies()), n)
class MockLogger():
    """No-op logger stand-in: accepts and discards info/debug/error calls."""

    def info(self, *args, **kwargs):
        pass

    def debug(self, *args, **kwargs):
        pass

    def error(self, *args, **kwargs):
        pass
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

View File

@@ -0,0 +1,42 @@
#!/bin/bash
# Build the alert-on-metrics Docker image and push it to the V2 registry,
# tagged with both the current git commit SHA and "latest".

GIT_COMMIT=$(git rev-parse HEAD)
if [[ -z "$GIT_COMMIT" ]]; then
    echo "--Missing required GIT_COMMIT var. Aborting..."
    exit 1
fi

# Setup useful vars
team="engvis"
app="alert-on-metrics-app"
registryV2="registry-app.eng.qops.net:5001"
pathV2="${registryV2}/${team}/${app}"
commitV2="${pathV2}:${GIT_COMMIT}"
latestV2="${pathV2}:latest"

# In case you use relative paths.
# Fix: the original used $BASH_SOURCE[0] unbraced, which expands to
# "$BASH_SOURCE" followed by the literal text "[0]".
DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
cd "$DIR"

echo "--Publishing $app $GIT_COMMIT"
echo "--Removing old image, so they don't accumulate"
# May fail harmlessly when no previous "latest" exists (runs before set -e).
docker rmi "$latestV2"

# Now fail if anything doesn't work
set -e

if [ -f "$app/build.sh" ]
then
    echo "--Running pre build steps"
    "$app/build.sh"
fi

docker build --pull=true --tag="$commitV2" --tag "$latestV2" .

echo "--Publishing app container"
docker push "$commitV2"
docker push "$latestV2"

View File

@@ -0,0 +1,48 @@
# Fetch-or-prompt helper for secrets stored in the macOS keychain.
# _get_and_save_secret NAME echoes the secret value, reading it (in order)
# from the environment, the keychain, or an interactive prompt (which also
# saves it back to the keychain).
function _get_and_save_secret() {
    function is_set() {
        local name="$1"
        eval "echo \$$name" | grep . > /dev/null
    }
    local name="$1"
    eval "$name=\${$name:-}"
    if ! is_set $name; then
        eval "$name=$(security find-generic-password -a $USER -s $name -w 2> /dev/null)"
        if ! is_set "$name"; then
            eval "read -s -p 'Enter $name: ' $name" >&2
            eval "security add-generic-password -a $USER -s $name -w \$$name" >&2
            echo "" >&2
        fi
    fi
    eval "echo \$$name"
}

# Wrapper that discards any prompt noise, keeping only the secret value line.
function get_and_save_secret() {
    _get_and_save_secret "$@" | tail -n 1
}

SENSU_API_USER="$(get_and_save_secret SENSU_API_USER)"
SENSU_API_PASS="$(get_and_save_secret SENSU_API_PASS)"
SLACK_API_TOKEN="$(get_and_save_secret SLACK_API_TOKEN)"

# Fix: the original echoed the secret values themselves to stderr; report
# only that they were loaded, never the values.
echo "SENSU_USER=(loaded)" >&2
echo "SENSU_PASS=(loaded)" >&2
echo "SLACK_TOKEN=(loaded)" >&2

# Refresh alert configs from the AoM_Configs submodule.
git submodule update --remote
rm -rf alert_configs
cp -r AoM_Configs/alert_configs .

# Build and (re)start the dev container, then tail its logs.
docker build -t aom:dev .
docker rm -f aom
docker run \
    -e SLACK_API_TOKEN=${SLACK_API_TOKEN} \
    -e API_USER=$SENSU_API_USER \
    -e API_PASS=$SENSU_API_PASS \
    --rm \
    -d \
    -p 8080:8080 \
    --add-host telegraf:10.4.13.53 \
    --name aom \
    --add-host consul.service.consul:127.0.0.1 \
    -h 127.0.0.1 \
    aom:dev &
until curl localhost:8080/healthcheck; do sleep 1; done
docker logs -f aom

View File

@@ -0,0 +1,14 @@
import os
import logging
from receiver import SlackReceiver
from sender import SlackSender
if __name__ == "__main__":
    # Wire up the Slack bot: a sender that answers commands (posting graphs
    # or text back to Slack) and a receiver that listens on the RTM stream
    # and forwards parsed commands to the sender.
    log = logging.getLogger()
    log.setLevel(logging.DEBUG)
    log.addHandler(logging.StreamHandler())
    # Fails fast with KeyError if the token is not configured in the env.
    slack_token = os.environ["SLACK_API_TOKEN"]
    sender = SlackSender(slack_token, log)
    receiver = SlackReceiver(slack_token, log, sender.respond)
    receiver.start()

View File

@@ -0,0 +1,12 @@
class SlackMessage():
    """Lightweight wrapper that lifts the fields of a Slack RTM payload's
    'data' mapping onto attributes of this object.

    Raises AssertionError (exception type kept for caller compatibility)
    when any of the required fields - text, user, channel - is missing.
    """

    def __init__(self, payload):
        data = payload['data']
        for key in data:
            # Fix: read the mapping entry first. The original tried
            # getattr(data, key) first, which silently bound dict methods
            # (e.g. for a key named "keys") instead of the payload value.
            try:
                setattr(self, key, data[key])
            except Exception:
                setattr(self, key, getattr(data, key))
        attrs = dir(self)
        assert "text" in attrs, "no text in message"
        assert "user" in attrs, "no user in message"
        assert "channel" in attrs, "no channel in message"

View File

@@ -0,0 +1,52 @@
import slack
import os
import re
import ssl as ssl_lib
import certifi
from message import SlackMessage
class SlackReceiver():
    """Listens on the Slack RTM stream, filters out irrelevant events, and
    forwards parsed commands (channel, id, interval, step) to a callback."""

    def __init__(self, token, log, callback):
        self.token = token
        self.log = log
        self.callback = callback
        self.ssl_context = ssl_lib.create_default_context(cafile=certifi.where())

    def start(self):
        """Connect the RTM client and block, dispatching message events."""
        self.rtm_client = slack.RTMClient(token=self.token, ssl=self.ssl_context)

        @slack.RTMClient.run_on(event="message")
        def _on_message(**payload):
            msg = self.parse(payload)
            if msg is not None:
                self.receive(msg)

        print("Starting")
        self.rtm_client.start()

    def parse(self, payload):
        """Turn a raw RTM payload into a SlackMessage, or None to ignore it."""
        self.log.debug("slack message received: {}".format(payload))
        if 'data' in payload:
            data = payload['data']
            if data.get('bot_id') == 'BNYAX72BB':
                # it's the bot's response, ignore it
                return None
            if data.get('user') == 'UNS0QKMMY':
                # it's the bot uploading files, ignore it
                return None
            unpinged = 'text' in data and '<@UNS0QKMMY>' not in data['text']
            not_direct = 'channel' in data and not data['channel'].startswith('DP')
            if unpinged and not_direct:
                # message in a channel and the bot wasn't pinged, or was not a direct message - ignore
                self.log.debug("received message, but I wasn't pinged or DM'ed")
                return None
            if 'text' in data and '<@UNS0QKMMY>' in data['text']:
                # remove the ping text
                data['text'] = re.sub('\\<@UNS0QKMMY\\>', '', data['text'])
        return SlackMessage(payload)

    def receive(self, msg):
        """Split the message text into (id, interval, step) and dispatch."""
        parts = msg.text.split()
        command_id = parts[0] if len(parts) > 0 else None
        interval = parts[1] if len(parts) > 1 else None
        step = parts[2] if len(parts) > 2 else None
        self.callback(msg.channel, command_id, interval, step)

View File

@@ -0,0 +1,131 @@
import os
import sys
import requests
import traceback
import time
import io
import binascii
import logging
import matplotlib.pyplot as plt
import numpy as np
import datetime
import re
rootDirectory = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(rootDirectory)
from library.config import glob_the_configs
from library.prom_api import PromAPI
# NOTE(review): module-level constant; appears unused within this module -
# TODO confirm before removing.
HOSTNAME = None
class SlackSender:
    """Answers Slack commands: looks up an alert config by id, runs its
    Prometheus query, and posts either a rendered graph or the raw query
    results back to the requesting channel."""
    def __init__(self, token, log):
        """
        Args:
            token: Slack API bearer token used for chat.postMessage and
                files.upload.
            log: logger shared with the rest of the bot.
        """
        self.token = token
        self.alertList = []
        self.log = log
        try:
            # Load every alert config known to the service; on failure the
            # bot keeps running with an empty alert list.
            self.alertList = glob_the_configs(rootDirectory, \
                rootDirectory + "/AoM_Configs/alert_routing_lookup", \
                'http://consul.service.consul:8500', '127.0.0.1', log)
        except Exception:
            log.error("Failed to load config files: {}".format(traceback.format_exc()))
    def respond(self, channel, alertId, interval, step):
        """
        Handle one parsed Slack command.

        Args:
            channel: Slack channel id to reply to.
            alertId: id of the alert config whose query should be run.
            interval: optional Go-style lookback (e.g. "5m"); overrides the
                config's start/end window when parseable.
            step: optional sample step, passed through as the query interval.
        """
        self.log.debug("incomming message: channel: {} alert ID: {} interval: {} step: {}".format(channel, alertId, interval, step))
        matchingAlert = next((alert for alert in self.alertList if alert['id'] == alertId), None)
        if not matchingAlert is None:
            query_args = {
                'interval' : matchingAlert['interval'],
                'start_time' : matchingAlert['start_time'],
                'end_time' : matchingAlert['end_time'],
                'query' : matchingAlert['query'],
            }
            prom_api = PromAPI(endpoint=matchingAlert['prometheus_url'])
            if interval :
                try:
                    # Override the config window with "last <interval>",
                    # clamped to at least one minute.
                    dur = parse_go_duration(interval)
                    if dur < 60 :
                        dur = 60
                    end = 0
                    start = -1 * dur
                    query_args['start_time'] = start
                    query_args['end_time'] = end
                except Exception:
                    # Unparseable interval: keep the config's window.
                    pass
            if step :
                query_args['interval'] = step
            self.log.debug("QUERY_ARGS {} FROM {} {}".format(query_args, interval, step))
            ret = prom_api.query_range(
                query=query_args['query'],
                start=query_args['start_time'],
                end=query_args['end_time'],
                duration=query_args['interval'])
            # Only graph when the response holds at least one series with at
            # least one sample; anything else is sent back as raw text.
            if 'status' in ret and ret['status'] == 'success' and 'data' in ret and 'result' in ret['data'] and len(ret['data']['result']) > 0 and 'values' in ret['data']['result'][0] and ret['data']['result'][0]['values'] is not None and len(ret['data']['result'][0]['values']) > 0:
                # Map sample timestamp -> value (first series only).
                resultsForGraph = {}
                for row in ret['data']['result'][0]['values']:
                    resultsForGraph[row[0]] = row[1]
                # Re-key by human-readable clock time for the x axis.
                finalResults = {}
                for res in resultsForGraph:
                    finalResults[time.strftime('%H:%M:%S', time.localtime(res))] = float(resultsForGraph[res])
                plt.clf()
                plt.plot(list(finalResults.keys()), list(finalResults.values()))
                plt.suptitle(alertId + " (all times UTC)")
                # Thin the x-axis labels down to roughly five ticks.
                if len(finalResults.keys()) > 5:
                    tickTuples = [(index, x) for index, x in enumerate(finalResults.keys()) if index % int(len(finalResults.keys()) / 5) == 0]
                    tickList = []
                    for pair in tickTuples:
                        tickList.append(pair[1])
                    plt.xticks(ticks = tickList, rotation='vertical')
                else:
                    plt.xticks(rotation='vertical')
                plt.ylim(bottom = 0)
                plt.subplots_adjust(bottom=0.2)
                # Render to an in-memory PNG and upload it to Slack.
                pngData = io.BytesIO()
                fig = plt.gcf()
                fig.savefig(pngData, format = 'png')
                self.sendGraph(channel, pngData)
            else:
                self.log.debug("didn't meet criteria")
                self.sendQueryResults(channel, ret)
        else:
            self.sendQueryResults(channel, "Sorry, I couldn't find a matching alert with ID {}".format(alertId))
    def sendQueryResults(self, channelId, queryResults):
        """Post a plain-text message to the given channel."""
        response = requests.post('https://slack.com/api/chat.postMessage',
            headers = {
                'Authorization': "Bearer " + self.token,
                'Content-Type': 'application/json; charset=utf-8'
            },
            json = { 'text': queryResults, 'channel': channelId }
        )
        self.log.debug("slack response: {}".format(response.text))
    def sendGraph(self, channelId, rawData):
        """Upload an in-memory PNG (BytesIO) to the given channel."""
        request = requests.Request('POST', 'https://slack.com/api/files.upload',
            data = { 'token': self.token, 'filetype': 'png', 'channels': channelId },
            files = { 'file': ('graph.png', rawData.getvalue(), 'image/png')}
        ).prepare()
        self.log.debug("headers to send to Slack: {}".format('\r\n'.join('{}: {}'.format(k, v) for k, v in request.headers.items())))
        self.log.debug("body to send to Slack: {}".format(len(request.body)))
        response = requests.Session().send(request)
        self.log.debug("slack response: {}".format(response.text))
    def setAlertList(self, newAlertList):
        """Replace the cached alert config list."""
        # TODO can rework to be a dictionary for faster lookup if necessary
        self.alertList = newAlertList
def parse_go_duration(duration):
    """Convert a Go-style duration string ("30s", "5m", "2h") to seconds.

    Only the s/m/h units are supported; anything else (including bare
    numbers) raises Exception("invalid duration ...").
    """
    duration = str(duration)
    bad = Exception("invalid duration "+duration)
    if not re.match("^[0-9]+[a-z]$", duration):
        raise bad
    scale = {"s": 1, "m": 60, "h": 60*60}.get(duration[-1:])
    if scale is None:
        raise bad
    return scale * int(duration[:-1])

View File

@@ -0,0 +1,14 @@
#! /bin/bash
# Resolve SLACK_API_TOKEN (env -> macOS keychain -> interactive prompt),
# refresh the keychain copy, and launch the Slack bot.

SLACK_API_TOKEN=${SLACK_API_TOKEN:-}
if [ -z "$SLACK_API_TOKEN" ]; then
    SLACK_API_TOKEN=$(security find-generic-password -a "$USER" -s SLACK_API_TOKEN -w 2> /dev/null)
    if [ -z "$SLACK_API_TOKEN" ]; then
        read -s -p "Enter SLACK_API_TOKEN" SLACK_API_TOKEN
        echo ""
    fi
fi

# Re-store the token. Fix: quote the value so tokens containing spaces or
# glob characters are not mangled by word splitting.
security delete-generic-password -a "$USER" -s SLACK_API_TOKEN 2> /dev/null 1>&2
security add-generic-password -a "$USER" -s SLACK_API_TOKEN -w "$SLACK_API_TOKEN" 1>&2

SLACK_BOT_TOKEN=$SLACK_API_TOKEN python3 ./main.py

View File

@@ -0,0 +1,16 @@
import unittest
from message import SlackMessage
class Test_SlackMessage(unittest.TestCase):
    """Validates that SlackMessage rejects payloads missing required fields."""

    def test_basic(self):
        # Fix: the original called self.fail() inside the try block, so the
        # AssertionError it raised was swallowed by "except Exception" and
        # the test could never fail. assertRaises scopes the expectation to
        # the construction only.
        with self.assertRaises(Exception):
            SlackMessage(dict(data={"hello": "world"}))
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

View File

@@ -0,0 +1,14 @@
import unittest
from receiver import SlackReceiver
from message import SlackMessage
class Test_SlackReceiver(unittest.TestCase):
    """End-to-end check of SlackReceiver.parse + receive on a direct message.

    Fixes over the original test: SlackReceiver takes (token, log, callback),
    not two arguments, and parse takes a payload dict positionally - the old
    test raised TypeError before asserting anything. A channel starting with
    'DP' (a DM) is needed for parse to accept a message without a ping.
    """

    def test_basic(self):
        class _NullLog:
            # Minimal logger double; the receiver only calls .debug().
            def debug(self, *args, **kwargs):
                pass

        calls = []
        receiver = SlackReceiver("token", _NullLog(),
                                 lambda *args: calls.append(args))
        msg = receiver.parse(
            {"data": {"user": "u", "text": "text", "channel": "DP123"}})
        self.assertTrue(isinstance(msg, SlackMessage))
        receiver.receive(msg)
        self.assertEqual(calls, [("DP123", "text", None, None)])
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

View File

@@ -0,0 +1,35 @@
import unittest
import sender
class Test_Compute_Relative_Time(unittest.TestCase):
    """Table-driven check of sender.parse_go_duration for s/m/h units."""

    def test_basic(self):
        class MockDateTime():
            # NOTE(review): parse_go_duration never consults datetime, so
            # this patching appears vestigial; preserved so behavior is
            # identical to the original test.
            t = 1000000

            def __init__(self):
                self.datetime = self

            def fromtimestamp(self, t):
                self.t = t

            def timestamp(self):
                return self.t

            def utcnow(self):
                return self

        mock_date_time = MockDateTime()
        saved = sender.datetime
        sender.datetime = mock_date_time
        cases = [
            ("1s", 1), ("5s", 5), ("0s", 0),
            ("0m", 0), ("1m", 60), ("9m", 9 * 60),
            ("0h", 0), ("1h", 60 * 60), ("9h", 9 * 60 * 60),
        ]
        for duration, expected in cases:
            self.case(duration, expected)
        sender.datetime = saved

    def case(self, duration, expected):
        """Assert one duration-string -> seconds conversion."""
        self.assertEqual(sender.parse_go_duration(duration), expected)
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()

View File

@@ -0,0 +1,11 @@
PyYAML
pip
setuptools
requests
pyaml
sanic
statsd-tags
redis
certifi
slackclient
matplotlib

View File

@@ -0,0 +1,62 @@
#!/bin/ash
# Container entrypoint: start supporting daemons, template service.yaml from
# the environment, then run the AoM service (or the test suite when TEST is
# set).

# Keep redis alive; restart it if it ever exits.
(
while true; do
    redis-server
    sleep 10
done
) &

/usr/src/app/echo-server &
/usr/src/app/echo-server -p 443 &
/usr/src/app/consul &

# Default values
KAIROSDB_URL=${KAIROSDB_URL:-http://kairosdb-metrics.service.eng.consul:8080/}
SMTP_SERVER=${SMTP_SERVER:-internal-smtp1-app.eng.qops.net:2525}
SENSU_URL=${SENSU_URL:-https://sensu-api.eng.qops.net:443/results}
# Fix: removed commented-out defaults that embedded a real Slack token and
# production endpoints; secrets must come from the environment only.
SLACK_TOKEN=${SLACK_TOKEN:-na}
VICTOROPS_URL=${VICTOROPS_URL:-http://localhost:41912/}
CONSUL_URL=${CONSUL_URL:-http://localhost:41912/}
AOM_GRAFANA_URL=${AOM_GRAFANA_URL:-http://localhost:41912/}
UCHIWA_URL=${UCHIWA_URL:-http://localhost:41912/}
export AOM_GRAFANA_URL

# Update config: substitute the {{{PLACEHOLDER}}} tokens in service.yaml.
sed -i "s#{{{KAIROSDB_URL}}}#${KAIROSDB_URL}#g" service.yaml
sed -i "s#{{{VICTOROPS_URL}}}#${VICTOROPS_URL}#g" service.yaml
sed -i "s#{{{SLACK_TOKEN}}}#${SLACK_TOKEN}#g" service.yaml
sed -i "s#{{{SMTP_SERVER}}}#${SMTP_SERVER}#g" service.yaml
sed -i "s#{{{CONSUL_URL}}}#${CONSUL_URL}#g" service.yaml
sed -i "s#{{{SENSU_URL}}}#${SENSU_URL}#g" service.yaml
sed -i "s,{{{UCHIWA_URL}}},${UCHIWA_URL},g" service.yaml

# Starting service
if [ -n "${TEST}" ]; then
    sed -i '/alert_reload_interval:/ s/[0-9]\+/30/g' service.yaml
    python3 /usr/src/app/aom_service.py &
    sleep 17
    echo "Making current server leader"
    curl localhost:8080/override?enable=true
    echo "Starting the service"
    curl localhost:8080/healthcheck
    # Fix: the original ran "exec python3 aom_test.py", which replaced this
    # shell, so the exit-code handling below could never run.
    if python3 /usr/src/app/aom_test.py; then
        cat /usr/src/app/logs/aom_service.log
        echo "Test succeeded. Exiting"
        exit 0
    else
        cat /usr/src/app/logs/aom_service.log
        echo "Test failed!"
        exit 1
    fi
else
    # Fix: "exec cmd &" only execs a throwaway subshell; plain & backgrounds
    # the reporter as intended.
    python3 /usr/src/app/reporter/incoming/main.py &
    exec python3 /usr/src/app/aom_service.py
fi

View File

@@ -0,0 +1,27 @@
#=======================#
# External service URLs and auth tokens (templated in by the entrypoint)
#=======================#
kairosdb_url: "{{{KAIROSDB_URL}}}"
victorops_url: "{{{VICTOROPS_URL}}}"
slack_url: "https://slack.com/api/chat.postMessage"
slack_token: "{{{SLACK_TOKEN}}}"
smtp_server: "{{{SMTP_SERVER}}}"
consul_url: "{{{CONSUL_URL}}}"
sensu_endpoint: "{{{SENSU_URL}}}"
uchiwa_url: "{{{UCHIWA_URL}}}"
#=======================#
# Logging Information
#=======================#
log_path: "logs/aom_service.log"
#=======================#
# alerts configurations
#=======================#
alert_folder: "alert_configs"
alert_routing_lookup: "alert_routing_lookup"
alert_reload_interval: 300
#=======================#
# request timeout value
#=======================#
timeout: 90

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,99 @@
import unittest
import service
class TestMockRedis(unittest.TestCase):
    """Covers service.get_redis_client's no-op fallback client and the
    shared TestRedis double defined below."""

    def test_base(self):
        from redis import Redis
        client = service.get_redis_client()
        # Outside the container there is no real redis to connect to, so the
        # service hands back its MockRedis stand-in.
        self.assertTrue(not isinstance(client, Redis))
        self.assertEqual(client.get("a"), None)
        self.assertEqual(client.set("a", "b"), None)
        self.assertEqual(client.call("KEYS", "b*"), [])

    def test_test(self):
        double = TestRedis()
        double.set("a", "b")
        self.assertEqual(double.get("a"), "b")
        self.assertEqual(double.call("KEYS", "a*"), ["a"])
# Shared backing store for every TestRedis instance; created lazily.
DB = None
class TestRedis(service.MockRedis):
    """Redis double whose state lives in the module-level DB dict, so every
    instance created during a test shares one store."""

    def __init__(self):
        global DB
        # Lazily create the shared store on first use.
        if DB is None:
            DB = {}

    def get(self, key):
        """Return the stored value or None, like redis GET."""
        return DB.get(key)

    def delete(self, key):
        """Remove the key if present, like redis DEL."""
        DB.pop(key, None)

    def set(self, key, value):
        """Store the value, like redis SET."""
        DB[key] = value

    def call(self, cmd, arg):
        """Support only the KEYS command with a prefix glob like "a*".

        Fix: the original compared `cmd is "KEYS"` - string identity, which
        is interpreter-dependent - instead of equality.
        """
        if cmd != "KEYS":
            return None
        prefix = arg.strip("*")
        return [key for key in DB.keys() if key.startswith(prefix)]
class TestSetFiring(unittest.TestCase):
    """Verifies service.set_firing replaces (not appends to) the stored
    firing list for an alert id."""

    def test_base(self):
        # Route the service's redis access to the in-memory double.
        def mock_get_redis_client():
            return TestRedis()
        service.get_redis_client = mock_get_redis_client
        two_datacenters = [
            [0, 0, 0, {"dc": "there"}],
            [0, 0, 0, {"dc": "here"}],
        ]
        service.set_firing("TestSetFiring", two_datacenters)
        self.assertEqual(len(service.list_firing("TestSetFiring")), 2)
        service.set_firing("TestSetFiring", [[0, 0, 0, {"dc": "here"}]])
        self.assertEqual(len(service.list_firing("TestSetFiring")), 1)
class MockResolveDep():
    """Minimal stand-in for a resolved-dependencies object: hands back the
    dependency id list it was constructed with."""

    def __init__(self, l):
        # Parameter name "l" kept for call-site compatibility.
        self.l = l

    def getDependencies(self):
        """Return the dependency id list supplied at construction."""
        return self.l
class TestIsSuppressed(unittest.TestCase) :
    """Exercises the service.is_suppressed / clear_suppressed cycle for an
    alert whose dependency fires in one or more datacenters.

    NOTE(review): with suppressed_occurrences_threshold=2, suppression
    appears to hold for one evaluation and stop on the second repeated
    firing - TODO confirm against service.is_suppressed before relying on
    these comments.
    """
    def test_base(self) :
        # Route the service's redis access to the in-memory TestRedis double.
        def mock_get_redis_client() :
            return TestRedis()
        service.get_redis_client = mock_get_redis_client
        alert_config = {
            'id': "TestIsSuppressed",
            'resolvedDependencies': MockResolveDep(["TestIsSuppressedD", "b", "c"]),
            'suppressed_occurrences_threshold': 2,
        }
        alert_tags = {"dc":"z", "x":"y"}
        # dependency fires one alert, suppress in effect
        service.set_firing("TestIsSuppressedD", [[0, 0, 0, alert_tags]])
        service.clear_suppressed(alert_config, alert_tags)
        self.assertTrue(service.is_suppressed(alert_config, alert_tags))
        # dependency still firing alert, suppress stops
        service.set_firing("TestIsSuppressedD", [[0, 0, 0, alert_tags]])
        service.clear_suppressed(alert_config, alert_tags)
        self.assertFalse(service.is_suppressed(alert_config, alert_tags))
        # dependency in different dc fires alert, suppress in DC1 stops, suppress in DC2 starts
        new_alert_tags = {"dc":"w"}
        service.set_firing("TestIsSuppressedD", [[0, 0, 0, new_alert_tags]])
        service.clear_suppressed(alert_config, new_alert_tags)
        self.assertFalse(service.is_suppressed(alert_config, alert_tags))
        self.assertTrue(service.is_suppressed(alert_config, new_alert_tags))
        # dependencies clear everywhere, suppress stops everywhere
        service.set_firing("TestIsSuppressedD", [])
        service.clear_suppressed(alert_config, [])
        self.assertFalse(service.is_suppressed(alert_config, alert_tags))
        self.assertFalse(service.is_suppressed(alert_config, new_alert_tags))
# Allow running this test module directly.
if __name__ == "__main__" :
    unittest.main()