This commit is contained in:
bel
2021-09-12 22:16:11 -06:00
commit ceeb6f0385
129 changed files with 9221 additions and 0 deletions

23
Agents Executable file
View File

@@ -0,0 +1,23 @@
- Suppressor - New AoM field to declare dependencies' alerts - if any of the dependencies' alerts are firing, then do not check
x AoM is not AoM - it's just configurations
- Each AoM is a process dedicated to looping over 1 alert
- Kills & recreates processes on config change (?)
- Oh my god monolithic functions
- Insert suppress at comment "send all alerts found to the alert handlers..."
- No unittests
- Seems no multiplicity
- serviceapp/service.py
- Floyd-Warshall create fully connected graph on boot/MR as CSV
- Reporter - Slack bot to get graph/latest check by name
- Lookup AoM configs in Gitlab - fetch all on interval with PAT
- configs stored in docker image
- seem to be reloadable on MRs
- Execute query and return
- See nexpose for prometheus, kairos API
- matplotlib.pyplot
- last N values
- warning threshold
- critical threshold
- Visualizer - New AoM field to declare service name and dependent services' names - visible map of services as alerts firing and links between
- Hit uchiwa API for what's firing? How to handle silenced?
- Does AoM have an API for what's firing?

12
AoM_Service/.gitignore vendored Executable file
View File

@@ -0,0 +1,12 @@
# Created by .ignore support plugin (hsz.mobi)
### Vagrant template
.vagrant/
.idea/
build/results
logs/
*.pyc
.dockerignore
Dockerfile
build/builder
site-packages.tar.gz

View File

@@ -0,0 +1,67 @@
#!/usr/bin/env groovy
pipeline {
agent {label 'nomad-builder'}
environment {
DOCKER_HOST = '127.0.0.1:2375'
WORKSPACE_PATH = "/var/lib/nomad/alloc/${NOMAD_ALLOC_ID}/${NOMAD_TASK_NAME}${WORKSPACE}"
}
stages {
stage('Info') {
steps {
sh script: 'hostname'
echo "WORKSPACE_PATH: $WORKSPACE_PATH"
}
}
stage('Build') {
steps {
echo "No build required"
}
}
stage('Test') {
steps {
echo "Test done during merge request"
//sh script: 'cd build; ./test_changed.sh "${WORKSPACE_PATH}"'
}
}
stage('Deploy') {
steps {
script {
if ("$GIT_BRANCH" == "origin/master"){
echo "Running publish script"
sh script: './publish.sh'
echo "Triggering Rundeck job"
script {
step([$class: 'RundeckNotifier', includeRundeckLogs: true, jobId: 'c5323400-0d97-4488-8cf2-1d736a5f7fb9', nodeFilters: '', options: '', rundeckInstance: 'team-rundeck -- techops', shouldFailTheBuild: true, shouldWaitForRundeckJob: true, tags: '', tailLog: false])
}
}
else {
echo "No deploy step required."
}
}
}
}
}
post {
success {
gitlabCommitStatus(name: "$JOB_NAME") {
// Test passed, update commit with green checkbox
}
// Notify Eng Viz of successful build
// slackSend color: 'good', message: "Passed Build: $BUILD_URL", channel: '#eng-invisibility'
}
failure {
gitlabCommitStatus(name: "$JOB_NAME") {
// Test failed, update commit status with red x
error("Build failed, check ${BUILD_URL} for details.")
}
// On failure send an email to Eng Vis
mail body: "Please check ${BUILD_URL} for details.",
subject: "Jenkins job ${JOB_NAME} build #${BUILD_NUMBER} failed",
from: 'Jenkins',
to: 'eng-visibility@qualtrics.com'
// Finally send a warning message to Eng Vis slack channel
slackSend color: 'warning', message: "Failed Build: $BUILD_URL", channel: '#eng-invisibility'
}
}
}

View File

@@ -0,0 +1,58 @@
#!/usr/bin/env groovy
pipeline {
agent {label 'nomad-builder'}
environment {
DOCKER_HOST = '127.0.0.1:2375'
WORKSPACE_PATH = "/var/lib/nomad/alloc/${NOMAD_ALLOC_ID}/${NOMAD_TASK_NAME}${WORKSPACE}"
}
stages {
stage('Info') {
steps {
sh script: 'hostname'
echo "WORKSPACE_PATH: $WORKSPACE_PATH"
}
}
stage('Build') {
steps {
echo "Building AOM container"
sh script: 'docker build . -t aom_test_container'
}
}
stage('Test') {
steps {
echo "Launching container on test mode. It will take a few minutes."
sh script: 'docker run -e TEST=true -h $(hostname) --add-host=\"telegraf:$(nslookup jenkins.eng.qops.net|grep Server | awk \'{print $2}\')\" aom_test_container'
echo "Removing docker image and container"
sh script: 'docker rmi -f aom_test_container'
}
}
stage('Deploy') {
steps {
echo "No deploy step required for Merge Request"
}
}
}
post {
success {
gitlabCommitStatus(name: "$JOB_NAME") {
// Test passed, update commit with green checkbox
}
// Notify Eng Viz of successful build
// slackSend color: 'good', message: "Passed Build: $BUILD_URL", channel: '#eng-invisibility'
}
failure {
gitlabCommitStatus(name: "$JOB_NAME") {
// Test failed, update commit status with red x
error("Build failed, check ${BUILD_URL} for details.")
}
// On failure send an email to Eng Vis
mail body: "Please check ${BUILD_URL} for details.",
subject: "Jenkins job ${JOB_NAME} build #${BUILD_NUMBER} failed",
from: 'Jenkins',
to: 'eng-visibility@qualtrics.com'
// Finally send a warning message to Eng Vis slack channel
// slackSend color: 'warn', message: 'Failed Build: $BUILD_URL', channel: '#eng-invisibility'
}
}
}

12
AoM_Service/AoM_Configs/.gitignore vendored Executable file
View File

@@ -0,0 +1,12 @@
# ignore alert configs starting with underscore -- we can create them while testing the webapp
# and not have to worry about them getting into the repo
alert_configs/_*.yaml
*.swp
.idea/
.vagrant/
__pycache__
logs/
venv/
.vscode/

View File

@@ -0,0 +1,68 @@
#!/usr/bin/env groovy
pipeline {
agent {label 'nomad-builder'}
environment {
DOCKER_HOST = 'tcp://127.0.0.1:2375'
WORKSPACE_PATH = "/var/lib/nomad/alloc/${NOMAD_ALLOC_ID}/${NOMAD_TASK_NAME}${WORKSPACE}"
}
stages {
stage('Info') {
steps {
sh script: 'hostname'
echo "WORKSPACE_PATH: $WORKSPACE_PATH"
}
}
stage('Build') {
steps {
echo "No build required"
}
}
stage('Test') {
steps {
echo "Test done already on merge request"
//sh script: 'cd build; ./test_changed.sh "${WORKSPACE_PATH}"'
// sh script: 'cd build; ./test_changed.sh'
}
}
stage('Deploy') {
steps {
script {
if ("$GIT_BRANCH" == "origin/master"){
echo "Running publish script"
sh script: './publish.sh'
echo "Triggering Rundeck job"
script {
step([$class: 'RundeckNotifier', includeRundeckLogs: true, jobId: 'c1f0dd4e-89a0-411b-afbb-455421a2ba34', nodeFilters: '', options: '', rundeckInstance: 'team-rundeck -- techops', shouldFailTheBuild: true, shouldWaitForRundeckJob: true, tags: '', tailLog: false])
}
}
else {
echo "No deploy step required."
}
}
}
}
}
post {
success {
gitlabCommitStatus(name: "$JOB_NAME") {
// Test passed, update commit with green checkbox
}
// Notify Eng Viz of successful build
// slackSend color: 'good', message: "Passed Build: $BUILD_URL", channel: '#eng-invisibility'
}
failure {
gitlabCommitStatus(name: "$JOB_NAME") {
// Test failed, update commit status with red x
error("Build failed, check ${BUILD_URL} for details.")
}
// On failure send an email to Eng Vis
mail body: "Please check ${BUILD_URL} for details.",
subject: "Jenkins job ${JOB_NAME} build #${BUILD_NUMBER} failed",
from: 'Jenkins',
to: 'eng-visibility@qualtrics.com'
// Finally send a warning message to Eng Vis slack channel
slackSend color: 'warning', message: "Failed Build: $BUILD_URL", channel: '#eng-invisibility'
}
}
}

View File

@@ -0,0 +1,56 @@
#!/usr/bin/env groovy
pipeline {
agent {label 'nomad-builder'}
environment {
DOCKER_HOST = 'tcp://127.0.0.1:2375'
WORKSPACE_PATH = "/var/lib/nomad/alloc/${NOMAD_ALLOC_ID}/${NOMAD_TASK_NAME}${WORKSPACE}"
}
stages {
stage('Info') {
steps {
sh script: 'hostname'
echo "WORKSPACE_PATH: $WORKSPACE_PATH"
}
}
stage('Build') {
steps {
echo "No build required"
}
}
stage('Test') {
steps {
echo "Running test"
sh script: './test_changed.sh'
sh script: 'python validate_yaml.py'
}
}
stage('Deploy') {
steps {
echo "No deploy step required for Merge Request"
}
}
}
post {
success {
gitlabCommitStatus(name: "$JOB_NAME") {
// Test passed, update commit with green checkbox
}
// Notify Eng Viz of successful build
// slackSend color: 'good', message: "Passed Build: $BUILD_URL", channel: '#eng-invisibility'
}
failure {
gitlabCommitStatus(name: "$JOB_NAME") {
// Test failed, update commit status with red x
error("Build failed, check ${BUILD_URL} for details.")
}
// On failure send an email to Eng Vis
mail body: "Please check ${BUILD_URL} for details.",
subject: "Jenkins job ${JOB_NAME} build #${BUILD_NUMBER} failed",
from: 'Jenkins',
to: 'eng-visibility@qualtrics.com'
// Finally send a warning message to Eng Vis slack channel
// slackSend color: 'warn', message: 'Failed Build: $BUILD_URL', channel: '#eng-invisibility'
}
}
}

View File

@@ -0,0 +1,16 @@
FROM registry-app.eng.qops.net:5001/imported/alpine:3.9
MAINTAINER Engineering Visibility <eng-visibility@qualtrics.com>
COPY webapp_requirements.txt /
COPY run_webapp.sh /
RUN apk add --no-cache python3 curl
RUN apk add --no-cache --virtual .build-deps build-base python3-dev \
&& pip3 install --no-cache-dir --upgrade pip \
&& pip3 install --no-cache-dir --upgrade setuptools \
&& pip3 install --no-cache-dir --upgrade -r /webapp_requirements.txt \
&& apk del .build-deps \
&& rm -rf /var/cache/apk/*
CMD ["/run_webapp.sh"]

236
AoM_Service/AoM_Configs/README.md Executable file
View File

@@ -0,0 +1,236 @@
# README
This is the new repository for the Alert On Metrics project configurations.
The Alert On Metrics (AOM) project allows you to set up alerts that trigger based on tracking a metric value collected via [Metrics as a Service](https://odo.corp.qualtrics.com/wiki/index.php/Metrics_As_A_Service). You "track" your metric via a [KairosDB query](http://kairosdb-metrics.service.eng.consul:8080/) or a [Prometheus query](http://big-trickster.service.eng.consul:9090/graph), so you are not limited to raw metrics: you can sample with the aggregators available in KairosDB to create new metric views, or use PromQL if you are using Prometheus. Typically people use min, max or count. All "tracked" metrics are rewritten to the metrics data store as a new metric *telegraf.aom_stats_value*, tagged by Alert-On-Metrics to show their origin.
You can trigger an alert based on any combination of the following:
- An upper critical threshold based on the value of a metric increasing
- An upper warning threshold based on the value of a metric increasing
- A lower critical threshold based on the value of a metric decreasing
- A lower warning threshold based on the value of a metric decreasing
- Combine any lower and upper threshold to create a 'band' (see the sketch below)
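For reference, here is a minimal sketch of how these threshold fields combine into a 'band' inside an alert configuration. The field names match the ones used throughout this repository; the values are illustrative only.
```yaml
# Illustrative values only: alert when the metric leaves the 10-100 band
critical_upper_threshold: 100.0
warning_upper_threshold: 80.0
warning_lower_threshold: 20.0
critical_lower_threshold: 10.0
```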
---
## Sensu and alert subdue. NEW!
Some changes have been introduced in the latest AOM versions. Alerts
can now be sent through Sensu (email is not supported yet). Using Sensu also
allows you to create check dependencies (`vo` is now `victorops` for Sensu).
```
alerts:
sensu:
victorops:
'blackhole'
slack:
'#aom_test_channel'
dependencies:
- name_of_check1
- name_of_check2
```
The filters option has also been enabled. It works the same way as in
Hiera. If you only want to receive critical alerts through one channel,
you can set `<channel>_subdue` to **true**.
Example:
```
filters:
slack_subdue: true
victorops_subdue: false
```
You can make use of anything the Sensu API supports: anything you add
to your configuration under `sensu` will be sent directly to the Sensu API.
---
## Availability metric.
If you want to track how long your check is in the CRITICAL state over a
given period of time, you can enable this feature by setting this
option to true:
```
availability: true
```
This will start sending metrics constantly and recording the check
output. You can then visualize this metric with the following
[dashboard](https://grafana.eng.qops.net/d/5OsrZSdiz/aom-availability?orgId=1)
(or you can create your own).
To get a more accurate result, don't set the refresh interval lower
than 60 seconds.
---
## Routing per tag value. NEW!
This feature allows you to configure a different alert routing using the values of tags in your metric. For instance, let's say you want to have a different alert policy for beta, gamma and prod:
* *beta*: I want to alert my `#my-project-dev` channel
* *gamma*: I want to alert my `#my-project-gamma` channel
* *prod*: I want to alert my `#my-project` channel and page the on-call on VictorOps
We can use the `dc` tag available in the metric query, define specific configuration for beta and gamma, and use a default one for all other values (prod in this case). Everything is configured inside the `alerts` object in the yaml configuration. Instead of directly adding the alert configuration, add a `lookup` key. Inside, you have to provide three values:
* `default`: the alert policy to apply by default if we can't find a configuration for a specific combination of tags. The format is the exact same as classic alerts (sensu, vo, slack, etc.).
* `tags`: the tags that will be used to lookup the alert routing configuration. You can use more than one tag.
* `lookups`: an array, where each element specifies a combination of tag values and the routing to apply in this case.
Here is the configuration of our example:
```yaml
alerts:
lookup:
default:
sensu:
slack: my-project
victorops: my-on-call-key
tags:
- dc
lookups:
-
alert:
sensu:
slack: my-project-dev
tags:
dc: b1-prv
-
alert:
sensu:
slack: my-project-gamma
tags:
dc: g1-iad
```
You can move the `lookups` part into a separate file, so it can be reused across different AOM configurations. To do that, instead of a `lookups` key, provide a `lookup_file` with the filename, including the extension:
```yaml
alerts:
lookup:
default: ...
lookup_file: my_lookup_file.yaml
tags: ...
```
Save this file under the `alert_routing_lookup` folder. The syntax for the alert routing is the same as before; it is just in a different file:
```yaml
---
-
alert:
sensu:
slack: my-project-dev
tags:
dc: b1-prv
-
alert:
sensu:
slack: my-project-gamma
tags:
dc: g1-iad
```
---
## How do I register a new alert with AOM?
Alert configurations for AOM are just a KairosDB or Prometheus query
specified in yaml format and wrapped in some controlling
configuration that determines how frequently the query is executed,
the thresholds, the occurrences, and where to route the alerts. We have built a
small UI, packaged with the AOM gitlab project, that will help
you generate a suitable yaml configuration. You can rehearse your
queries on the [KairosDB UI](http://kairosdb-metrics.service.eng.consul:8080/) or at any
Prometheus endpoint, and take a look at other examples in the alert_configs/ folder for help.
Follow the instructions below to launch the yaml generator UI on your
local desktop and use it to generate a merge request (Docker is
required).
1. Clone the project
2. cd into the project's directory
3. Run the script ./generate_config.sh
4. Once up, navigate in a browser to **localhost:80/**
5. Fill out the form and click generate
6. Hit **Ctrl+C** when you have the alert configuration
7. Submit the merge request in a new branch
---
This process starts a local webserver that provides a convenient interface for generating the yaml you need.
Most of the fields have helpful info tips on what each value is and how it's used.
---
## Visualization tool [BETA]
The project also ships a simple Python script that shows what your
metrics will look like and helps you set the thresholds. This tool
requires Python 3 and some additional Python 3 modules:
1. yaml
2. json
3. requests
4. numpy
5. matplotlib
These modules should be easy to install using 'pip' or 'homebrew'.
Usage:
```python3 show_config.py [X] alertname_without_yaml_extension```
Where X is an optional parameter that defines the interval length you
want to display. It's a multiplier factor, set to 10 by default, that
will increase the start_relative (so you will see more datapoints).
The script should open a window showing the metrics along with the defined
thresholds. If the query doesn't return any values, it will exit.
---
## How does my new alert get to production?
Once you submit a merge request, a Jenkins job will quickly validate your alert
files, checking only that they contain all required fields and proper syntax.
Setting appropriate thresholds and alerting channels (VictorOps, email,
Slack) is the user's responsibility.
If Jenkins returns a PASS result for the test, the new alert files will be
merged into the master branch and a deploy job will be triggered (also
from Jenkins). The AOM service actively watches the
alert_configs folder and will pick up any changes (by default every
300 seconds).
## Helpful Tidbits
__IMPORTANT:__ The alert `id` field must be unique; it can be useful to run
`grep` within the alert_configs directory (for example,
`grep -r "^id: my_alert_id" alert_configs/`, where `my_alert_id` is your chosen id) to make sure the id is not already defined.
Use the [UI](http://kairosdb-metrics.service.eng.consul:8080/) on the kairosdb box to help you generate / determine the proper query.
Remember, you want to get the query down to just one or two entries per *group-by* so that the service can quickly iterate over it.
Once the request has been merged you can check if your query is getting processed by [hitting the url](http://alert-on-metrics.service.eng.consul:8080/healthcheck?verbose=true)
You can also check out the [grafana dashboard](http://grafana-metrics.service.eng.consul:3000/dashboard/db/alert-on-metrics) that has the results of this service's queries and verify your alert metric is showing up regularly.
From KairosDB's docs: *You must specify either start_absolute or start_relative but not
both. Similarly, you may specify either end_absolute or end_relative
but not both. If either end time is not specified the current date and
time is assumed.* We suggest using *end_relative* (greater than
1 minute), as this makes for steadier graphs (if you draw a graph up to
*Now*, some of the latest metrics could be missing, so the end of the
graph will be lower than it should be).
We do not recommend using *align_sampling* and *align_start_time*
(both false by default, so they can be skipped) as they might change the alignment of metrics
and change graphs over time (*if more than one is set, unexpected results will occur*).
If you have any doubt about KairosDB's query metrics you can take a look at their documentation [here](https://kairosdb.github.io/docs/build/html/restapi/QueryMetrics.html).
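As a rough illustration of *start_relative*/*end_relative* (the metric name and tag value below are hypothetical; see the KairosDB documentation linked above for the authoritative query format), the time-window portion of a KairosDB `query` block could look something like this:
```yaml
query:
  start_relative:
    value: '10'
    unit: minutes
  end_relative:                    # end 1 minute ago rather than at "now" for steadier graphs
    value: '1'
    unit: minutes
  metrics:
    - name: my.example.metric      # hypothetical metric name
      tags:
        dc: ['b1-prv']             # illustrative tag value
      aggregators:
        - name: max
          sampling:
            value: 1
            unit: minutes
```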
---
## The Gotchas
1. Alerts only fire when KairosDB returns a result. If your KairosDB metric query returns no results for X (currently 10) attempts, any active alerts will clear with a message explaining that AOM could not get any further results from KairosDB, so the user must manually verify RECOVERY. Earlier versions of AOM had no flap protection like this built in. Long term we will move alerting to Sensu, which has more advanced built-in flap protection. You can reduce flapping of results by building your Kairos query well. Please talk to Engineering Visibility for help with this.
2. Metrics are only collected every 60 seconds, so an interval below that will automatically get bumped up to 60 seconds by the web-based config generation. Match the interval to how often the metric is collected and measured.
3. The Email field only requires a list of names, not the @qualtrics part, as it will only send to Qualtrics addresses using the internal-smtp1-app.eng.qops.net box (see the sketch after this list).
4. Email and Slack alerts fire once during an event. This way, if an outage were occurring, you wouldn't get flooded with emails and Slack alerts the entire time.
5. Email and Slack alerts can be helpful to share with the team so they are aware of what is happening.
6. Email and Slack alerts can be helpful when trying to work out your alerts before routing them to VictorOps.
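As a hedged sketch of gotcha 3 (the usernames below are hypothetical), only the local part of each address goes into the email list:
```yaml
alerts:
  email:
    - jdoe      # delivered to jdoe@qualtrics.com
    - asmith    # delivered to asmith@qualtrics.com
```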

View File

@@ -0,0 +1,20 @@
---
id: sleeper_agents_milleniumfalcon_engine_failing
service: core
alerts:
slack:
- '#breel_testing_alerts'
vo:
- gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
suppressed_occurrences_threshold: 24
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_engine_failing) by (dc)
tags:
- dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
service_dependencies: ['fuel']

View File

@@ -0,0 +1,18 @@
---
id: sleeper_agents_milleniumfalcon_fuellevel_low
service: fuel
alerts:
slack:
- '#breel_testing_alerts'
vo:
- gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_fuellevel_low) by (dc)
tags:
- dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1

View File

@@ -0,0 +1,20 @@
---
id: sleeper_agents_milleniumfalcon_lightspeed_unavailable
service: captain
alerts:
slack:
- '#breel_testing_alerts'
vo:
- gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
suppressed_occurrences_threshold: 48
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_lightspeed_unavailable) by (dc)
tags:
- dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
service_dependencies: ['core']

View File

@@ -0,0 +1,20 @@
---
id: sleeper_agents_milleniumfalcon_shields_unavailable
service: core
alerts:
slack:
- '#breel_testing_alerts'
vo:
- gobs-mm
critical_upper_threshold: 1.0
interval: 5
suppressed_occurrences_threshold: 54
start_time: '-60'
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_shields_unavailable) by (dc)
tags:
- dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
service_dependencies: ['fuel']

View File

@@ -0,0 +1,8 @@
---
-
alert:
slack:
- "public-api-deploy-tst"
tags:
canaryTest: transaction_import_distribution_1
targetdc: fra1

View File

@@ -0,0 +1,365 @@
---
-
alert:
sensu:
slack: es-qe-alerts
tags:
brandId: aexpfeedback
-
alert:
sensu:
slack: emea-alerts
victorops: profserv-19
tags:
brandId: airbuswea
-
alert:
sensu:
slack: es-alaskaair
tags:
brandId: alaskaair
-
alert:
sensu:
slack: xmp-seattle-3
victorops: xmp-seattle-3
tags:
brandId: amdocs
-
alert:
sensu:
slack: xmp-sea-automations
tags:
brandId: americanairlines
-
alert:
sensu:
slack: xmp-sea-automations
tags:
brandId: anz
-
alert:
sensu:
slack: xmp-seattle-3
victorops: xmp-seattle-3
tags:
brandId: arris
-
alert:
sensu:
slack: emea-alerts
victorops: profserv-19
tags:
brandId: baincx
-
alert:
sensu:
slack: xmp-sea-automations
tags:
brandId: bmocx
-
alert:
sensu:
slack: es-qe-alerts
victorops: es-bmw-marriott
tags:
brandId: bmwgroupne
-
alert:
sensu:
slack: es-qe-alerts
victorops: es-bmw-marriott
tags:
brandId: bmwgroupnest3
-
alert:
sensu:
slack: es-qe-alerts
victorops: es-bmw-marriott
tags:
brandId: bmwjapan
-
alert:
sensu:
slack: es-qe-alerts
victorops: es-bmw-marriott
tags:
brandId: bmwjapanst3
-
alert:
sensu:
slack: es-qe-alerts
victorops: es-bmw-marriott
tags:
brandId: bmwna
-
alert:
sensu:
slack: es-qe-alerts
victorops: es-bmw-marriott
tags:
brandId: bmwnast3
-
alert:
sensu:
slack: es-qe-alerts
victorops: es-bmw-marriott
tags:
brandId: bmwvertriebsgmbh
-
alert:
sensu:
slack: es-qe-alerts
victorops: es-bmw-marriott
tags:
brandId: bmwvertriebsgmbhst3
-
alert:
sensu:
slack: caterpillar
victorops: profserv-14
tags:
brandId: catcustomerinsights
-
alert:
sensu:
slack: century-link
victorops: xmp-seattle-4
tags:
brandId: centurylink
-
alert:
sensu:
slack: xmp-seattle-4
victorops: xmp-seattle-4
tags:
brandId: ciscoengineering
-
alert:
sensu:
slack: es-alerts
victorops: profserv
tags:
brandId: clientdashboards
-
alert:
sensu:
slack: es-alerts
victorops: profserv
tags:
brandId: cms
-
alert:
sensu:
slack: TODO
tags:
brandId: cocacolaperform
-
alert:
sensu:
slack: dish
tags:
brandId: dishvoc
-
alert:
sensu:
slack: es-alerts
tags:
brandId: dowcorning
-
alert:
sensu:
slack: es-alerts
victorops: profserv
tags:
brandId: drtoddhall
-
alert:
sensu:
slack: es-gs-compare
victorops: xmp-seattle-3
tags:
brandId: goldmansachs
-
alert:
sensu:
slack: xmp-sea-automations
tags:
brandId: harvard
-
alert:
sensu:
slack: es-alerts
victorops: profserv
tags:
brandId: ibm
-
alert:
sensu:
slack: xmp-seattle-3
victorops: xmp-seattle-3
tags:
brandId: jcibuildings
-
alert:
sensu:
slack: xmp-seattle-3
victorops: xmp-seattle-3
tags:
brandId: johnsoncontrols2
-
alert:
sensu:
slack: es-alerts
victorops: profserv
tags:
brandId: kubota
-
alert:
sensu:
slack: liberty-mutual
tags:
brandId: libertymutualvoc
-
alert:
sensu:
slack: es-qe-alerts
victorops: es-bmw-marriott
tags:
brandId: marriottvacationclub
-
alert:
sensu:
slack: es-alerts
tags:
brandId: mastercard
-
alert:
sensu:
slack: xmp-seattle-4
victorops: xmp-seattle-4
tags:
brandId: nielsenapac
-
alert:
sensu:
slack: TODO
tags:
brandId: optumrx
-
alert:
sensu:
slack: xmp-seattle-4
victorops: xmp-seattle-4
tags:
brandId: nielsenscarborough
-
alert:
sensu:
slack: xmp-seattle-3
victorops: xmp-seattle-3
tags:
brandId: rogers
-
alert:
sensu:
slack: es-alerts
tags:
brandId: samsungeurope
-
alert:
sensu:
slack: emea-alerts
victorops: profserv-19
tags:
brandId: telenorreporting
-
alert:
sensu:
slack: es-alerts
victorops: profserv
tags:
brandId: thermoking
-
alert:
sensu:
slack: philips-es
tags:
brandId: tnsnipophilips
-
alert:
sensu:
slack: travelers_coord
victorops: profserv-14
tags:
brandId: travelers
-
alert:
sensu:
slack: xmp-sea-automations
tags:
brandId: uhcdr
-
alert:
sensu:
slack: xmp-sea-automations
tags:
brandId: uhcmr
-
alert:
sensu:
slack: xmp-sea-automations
tags:
brandId: uhcgm
-
alert:
sensu:
slack: TODO
tags:
brandId: uhg
-
alert:
sensu:
slack: xmp-sea-automations
tags:
brandId: uhg1
-
alert:
sensu:
slack: es-alerts
victorops: profserv
tags:
brandId: underarmour
-
alert:
sensu:
slack: es-alerts
victorops: profserv
tags:
brandId: unum
-
alert:
sensu:
slack: TODO
tags:
brandId: usaast3
-
alert:
sensu:
slack: xmp-sea-automations
tags:
brandId: usbank
-
alert:
sensu:
slack: es-alerts
victorops: profserv
tags:
brandId: uscd
-
alert:
sensu:
slack: xmp-seattle-3
victorops: xmp-seattle-3
tags:
brandId: walkersandbox

View File

@@ -0,0 +1,30 @@
#! /usr/bin/python3
# aom_builder.py
# The point of the builder is to generate a valid yaml config that can be read into the main app,
# by asking clarifying questions about what to check and how to alert on it.
# This comes down to 4 questions:
# When to query
# What to query for
# What's an alert
# Who to alert
from webapp import app
from library.logger import AlertLogging
from library.args import get_builder_args
log = AlertLogging('aom')
log.start()
log.start_log_file("logs/aom_builder.log")
if __name__ == "__main__":
# GET ARGS AND START LOGGING
args = get_builder_args()
# logger.init("logs/aom_builder.log", args['log_level'])
# aom_logger = logging.getLogger(__name__)
log.info("Logger Initialized")
# ENABLE SESSIONS TO KEEP YAML FILE STATE BETWEEN PAGES
log.info("Starting webapp")
app.run(host='localhost', port=args['port'], debug=True)

View File

@@ -0,0 +1,16 @@
#!/bin/bash
trap ctrl_c INT
function ctrl_c() {
docker stop aom_web
docker ps -a | awk '{ print $1,$2 }' | grep aom_web | awk '{print $1 }' | xargs -I {} docker rm {}
}
docker build -f Dockerfile.webapp -t aom_web . && \
docker run -d -v$(pwd):/web -p80:5000 --name aom_web aom_web && \
docker logs -f aom_web

View File

View File

@@ -0,0 +1,84 @@
# Contains the arg parser options.
import argparse
import sys
def get_builder_args():
"""
Gets the arguments passed in to the aom_builder main call
:return: parser object
"""
parser = argparse.ArgumentParser(description="Generates a valid yaml file for alerting on metrics. "
"If you are familiar with the yaml structure for an alert"
"you don't have to use this builder, it's just convenient")
parser.add_argument('-q', '--query', help="The KairosDB query string to use")
parser.add_argument('-i', '--interval', type=int, default=60, help="The interval that the check will run. "
"This value is in seconds")
parser.add_argument('-t', '--threshold', '--upperthreshold', help="The upper threshold is the value that when reached will cause an alert "
"depending on the threshold logic. "
"Use in conjunction with lower threshold to define a normal band.")
parser.add_argument('-b', '--lowerthreshold', help="The lower threshold is the value that when reached will cause an alert "
"depending on the threshold logic"
"Use in conjunction with upper threshold to define a normal band.")
parser.add_argument('-m', '--measure', choices=['gt', 'lt', 'eq'], help="The measure to use to compare the "
"threshold to the values of the alerts")
parser.add_argument('-a', '--alert_config', help='A valid Yaml representation of your alerting block')
parser.add_argument('-l', '--log_level', type=int, default=0, help="The log level for the aom_builder run. "
"[0=Error, 1=Info, 2=Debug]")
parser.add_argument('-p', '--port', type=int, default=8080, help="The port to run the webapp on")
return args_to_dict(parser)
def get_tester_service_args():
"""
Gets arguments passed into aom_tester.py
Returns: parser object
"""
parser = argparse.ArgumentParser(description="Parameters to start the alerting on metrics dummy tester service")
parser.add_argument('-l', '--log_level', type=int, default=0, help="The log level for the aom_service app "
"[0=Error, 1=Info, 2=Debug]")
parser.add_argument('-a', '--alert_configs', default=None,
help="If provided will override the folder location read from the config with the value passed "
"in. Is helpful for testing and troubleshooting alerts")
parser.add_argument('--hostname', help="If provided, will override the actual hostname check with this value")
parser.add_argument('-p', '--port', type=int, default=8080, help="The port to run the webapp on")
return args_to_dict(parser)
def get_service_args():
"""
Gets arguments passed into aom_service.py
Returns: parser object
"""
parser = argparse.ArgumentParser(description="Parameters to start the alerting on metrics service")
parser.add_argument('-l', '--log_level', type=int, default=0, help="The log level for the aom_service app "
"[0=Error, 1=Info, 2=Debug]")
parser.add_argument('-a', '--alert_configs', default=None,
help="If provided will override the folder location read from the config with the value passed "
"in. Is helpful for testing and troubleshooting alerts")
parser.add_argument('-o', '--override', action='store_true', help="Overrides the check leader election value")
parser.add_argument('--hostname', help="If provided, will override the actual hostname check with this value")
parser.add_argument('-p', '--port', type=int, default=8080, help="The port to run the webapp on")
return args_to_dict(parser)
def args_to_dict(parsed_args):
"""
Converts the argument parser object to a dict
Args:
parsed_args: Arg parser object
Returns:
Dictionary of arguments
"""
try:
arg_list = parsed_args.parse_args()
# RETURN A DICT OF ARGUMENTS
arg_dict = dict()
for val in vars(arg_list):
arg_dict[val] = getattr(arg_list, val)
return arg_dict
except argparse.ArgumentError:
parsed_args.print_help()
sys.exit(1)

View File

@@ -0,0 +1,22 @@
# config.py
import logging
import glob
import yaml
logger = logging.getLogger(__name__)
def glob_the_configs(config_path):
"""
Args:
config_path (string): relative path to the configs
Returns:
List of configs
"""
alert_list = []
for config_file in glob.glob(config_path + "/*.yaml"):
logger.debug("Found {} config".format(config_file))
# LOAD CONFIG
alert_list.append(yaml.load(open(config_file, 'rb').read()))
logger.info("Loaded {} configs".format(len(alert_list)))
return alert_list

View File

@@ -0,0 +1,118 @@
# logger.py
import logging
import logging.handlers
import os
logging.getLogger('requests').setLevel(logging.ERROR)
logging.getLogger('urllib3').setLevel(logging.ERROR)
logging.getLogger('werkzeug').setLevel(logging.ERROR)
class SingleLevelFilter(logging.Filter):
def __init__(self, passlevel, reject):
"""
Initializer (constructor) of the SingleLevelFilter
@param passlevel (int) - the int value of the level of the log
@param reject (bool) - if true will return if the record level is not equal to the passlevel
@return SingleLevelFilter object
@note Sets some object parameters
"""
self.passlevel = passlevel
self.reject = reject
def filter(self, record):
"""
Returns True/False depending on parameters
@param record (Log int) - the record that the filter belongs to
@return bool - True/False depending on what self.reject is set to and what record.levelno and self.passlevel are set to
@note This causes either only logging of the exact same level to get logged, or only logging other than the same level to get logged
"""
if self.reject:
return (record.levelno != self.passlevel)
else:
return (record.levelno == self.passlevel)
class AlertLogging(logging.Logger):
"""
Class Object to handle the logging of the alert on metrics service
starts at Error level and can flip on (and add) an additional log file and
Debug logger as needed.
"""
def __init__(self, name):
"""
Inits the formatters and logger
"""
self.name = name
self.debug_formatter = logging.Formatter(
"%(asctime)s - [%(levelname)s] - [%(module)s:%(lineno)d] - %(message)s", "%m-%d %H:%M:%S")
self.standard_formatter = logging.Formatter("%(asctime)s - [%(levelname)s] - %(message)s",
"%m-%d %H:%M:%S")
logging.getLogger()
logging.Logger.__init__(self, name, logging.DEBUG)
logging.setLoggerClass(AlertLogging)
def start(self):
"""
Returns:
"""
info_handler = logging.StreamHandler()
info_handler.setLevel(logging.INFO)
info_handler.setFormatter(self.standard_formatter)
self.addHandler(info_handler)
return self
def start_log_file(self, file_path, mode='a'):
"""
Creates a separate log file handler
Args:
file_path: path to the log file
mode: the type of mode to open the file handler with
Returns:
"""
self.log_path = file_path
work_folder = os.path.dirname(file_path)
if len(work_folder) > 0 and not os.path.exists(work_folder):
os.makedirs(work_folder)
self.log_handler = logging.FileHandler(file_path, mode)
self.log_handler.setLevel(logging.DEBUG)
self.log_handler.setFormatter(self.debug_formatter)
self.addHandler(self.log_handler)
def stop_log_file(self):
"""
Closes Log file and sets the handler to None
Returns:
"""
self.log_handler.close()
self.removeHandler(self.log_handler)
self.log_handler = None
def start_debug(self):
"""
Returns:
"""
self.debug_handler = logging.StreamHandler()
self.debug_handler.setLevel(logging.DEBUG)
self.debug_handler.addFilter(SingleLevelFilter(logging.DEBUG, False))
self.debug_handler.setFormatter(self.debug_formatter)
self.addHandler(self.debug_handler)
def stop_debug(self):
"""
stop the debugger
Returns:
"""
self.removeHandler(self.debug_handler)
self.debug_handler = None

View File

@@ -0,0 +1,42 @@
#!/bin/bash
GIT_COMMIT=$(git rev-parse HEAD)
if [[ $GIT_COMMIT == "" ]]; then
echo "--Missing required GIT_COMMIT var. Aborting..."
exit 1
fi
#Setup useful vars
team="engvis"
app="alert-on-metrics-configs"
registryV2="registry-app.eng.qops.net:5001"
pathV2="${registryV2}/${team}/${app}"
commitV2="${pathV2}:${GIT_COMMIT}"
latestV2="${pathV2}:latest"
# In case you use relative paths
DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
cd $DIR
echo "--Publishing $app $GIT_COMMIT"
echo "--Removing old image, so they don't accumulate"
docker rmi $latestV2
#Now fail if anything doesn't work
set -e
if [ -f $app/build.sh ]
then
echo "--Running pre build steps"
$app/build.sh
fi
docker build --pull=true --tag="$commitV2" --tag "$latestV2" .
echo "--Publishing app container"
docker push $commitV2
docker push $latestV2

6
AoM_Service/AoM_Configs/run.sh Executable file
View File

@@ -0,0 +1,6 @@
#!/bin/sh
rsync -a --delete /alert_configs/ /mountpoint/configs/git/
rsync -a --delete /alert_routing_lookup/ /mountpoint/alert_routing_lookup/
ls -l /mountpoint/configs/git/

View File

@@ -0,0 +1,5 @@
#!/bin/ash
export FLASK_APP=/web/aom_webapp.py
export FLASK_DEBUG=1
cd /web; flask run --host=0.0.0.0

View File

@@ -0,0 +1,25 @@
#=======================#
# All them URLS and tokens
#=======================#
kairosdb_url: "http://kairosdb-metrics.service.eng.consul:8080/"
victorops_url: "https://alert.victorops.com/integrations/generic/20131114/alert/07f108fe-9183-45c3-a888-19e1432806c5/"
slack_url: "https://slack.com/api/chat.postMessage"
slack_token: "xoxb-76976722775-WY6vtKAk0SQEb8qcbFkLMV81"
smtp_server: "internal-smtp1-app.eng.qops.net:2525"
consul_url: "http://consul1-app.eng.qops.net:8500/v1/kv/service/alert-on-metrics/leader-lock"
sensu_endpoint: "https://sensu-api.eng.qops.net:443/results"
#=======================#
# Logging Information
#=======================#
log_path: "logs/aom_service.log"
#=======================#
# alerts folder
#=======================#
alert_folder: "alert_configs"
#=======================#
# request timeout value
#=======================#
timeout: 90

View File

@@ -0,0 +1,104 @@
import glob
import yaml
import json
import os
import sys
import time
import re
import requests
import numpy
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime
import random
import warnings
warnings.filterwarnings("ignore")
#from pdb import set_trace as bp
timeout = 180
# if no argument print help and exit
if len(sys.argv) == 1:
print("You need to specify an alert config file.")
exit(1)
#else
config_file = 'alert_configs/'+sys.argv[1]+'.yaml'
# test file exists or exit
alert_config = yaml.load(open(config_file, 'rb').read())
# We will show 10 intervals by default
if len(sys.argv) == 3:
interval = int(sys.argv[2])
else:
interval = 10
alert_config['query']['start_relative']['value'] = str(int(alert_config['query']['start_relative']['value'])*interval)
kairosdb_url = "http://kairosdb-metrics.service.eng.consul:8080/"
query_url = os.path.join(kairosdb_url + "api/v1/datapoints/query")
#ret = requests.post(query_url, data=json.dumps(query), timeout)
ret = requests.post(query_url, data=json.dumps(alert_config['query']), timeout=timeout)
results = ret.json()['queries'][0]['results']
# Transforming to human readable data
# for result in results[0]['values']:
# result[0] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(result[0]/1000))
# result[0] = datetime.datetime.strptime(result[0],'%Y-%m-%d %H:%M:%S')
for result in results:
for value in result['values']:
# bp()
# transform date from epoch to human readable format
value[0] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(value[0]/1000))
# transform date string to datetime object
value[0] = datetime.datetime.strptime(value[0],'%Y-%m-%d %H:%M:%S')
series = numpy.array(result['values'])
label_str = str(result['group_by'][0].get('group', ''))
line_color = tuple(numpy.random.random(size=3))
plt.plot_date(series[:,0],series[:,1], marker='.', color=line_color, linestyle='-', label=label_str)
#series = numpy.array(results[0]['values'])
#converted_dates = map(datetime.datetime.strptime, datelist, len(datelist)*['%Y-%m-%d %H:%M:%S'])
#x_axis = (converted_dates)
formatter = mdates.DateFormatter('%H:%M:%S')
# ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m"))
# series = series.astype(numpy.unicode, copy=False)
ax = plt.subplot()
#ax.set_xlabel('TIME')
#ax.set_ylabel('VALUE')
#bc = plt.axes()
#bc.xaxis.set_major_formatter(formatter)
#plt.plot_date(series[:,0],series[:,1], marker='o', color='b', linestyle='-')
#plt.plot_date(converted_dates,series[:,1], marker='o', color='b', linestyle='-')
#ax.set_xticks(series[:,0])
#ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m"))
#ax.xaxis.set_minor_formatter(mdates.DateFormatter("%Y-%m"))
# ax = plt.subplot.gcf().axes[0]
#ax.set_title(sys.argv[1])
ax.xaxis.set_major_formatter(formatter)
#plt.xaxis.set_major_formatter(formatter)
plt.title(sys.argv[1])
plt.legend()
# pyplot.gcf().autofmt_xdate(rotation=25)
#ax.xaxis_date()
# ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m"))
# ax.xaxis.set_minor_formatter(mdates.DateFormatter("%Y-%m"))
# ax.plot(series[:,0],series[:,1], marker='o', color='b', linestyle='-')
myRe = re.compile('^(?!occurrences).*_threshold$')
# Adding thresholds to the graph
for key in alert_config:
if myRe.match(key):
plt.axhline(y=float(alert_config[key]), color='r', linestyle='--', label=str(key))
plt.text(series[0][0],float(alert_config[key]),key)
#plt.gcf().autofmt_xdate()
#ax = .add_axes([0,0,1,1])
plt.gcf().autofmt_xdate(rotation=25)
#plt.axhline(y=500000, color='o', linestyle='-')
plt.show()
#results[0]['values']

View File

@@ -0,0 +1,30 @@
#!/bin/bash
set -x
if [ -z $GIT_COMMIT ]; then
echo "Expected env var 'GIT_COMMIT' to be set. Exiting..."
exit 1
fi
echo "Check that only alert confings are being pushed"
echo "$PWD"
for file in $(git diff-tree -r --name-only ${GIT_COMMIT}^1 ${GIT_COMMIT}); do
new_id=$(grep ^id\: $file)
if [ ! -z "$new_id" ]; then
total_id=$(grep "$new_id" alert_configs/*.yaml | wc -l)
if [ $total_id -gt 1 ] ; then
echo "Duplicated id found! Please update the id of the alert configuration"
exit 1
fi
fi
dir=$(dirname ${file})
# a change under alert_configs/ triggers a test of the new or changed alert configs
if [ "$dir" == "alert_configs" ] || [ "$dir" == "alert_routing_lookup" ] ; then
echo "Good to merge"
exit 0
else
echo "Only automatic merges allowed for alert config files"
exit 1
fi
done

View File

@@ -0,0 +1,60 @@
import yaml
import glob
if __name__ == "__main__":
alert_list = []
bad_alert_list = []
print("Collecting all yaml configs")
# COLLECT CONFIG FILES
for config_file in glob.glob("./alert_configs/*.yaml"):
print("Found {} config".format(config_file))
alert_list.append(config_file)
print("Collecting all yaml configs")
# PARSE CONFIG FILES AND VALIDATE THEIR VALUES
for alert in alert_list:
print("Validating file {}".format(alert))
try:
config = yaml.load(open(alert, 'rb').read())
assert len(config['alerts']) > 0, "No Alerts configured, this is a dead config"
assert len(config['query']) > 0, "No Query, this is a dead config"
assert config['interval'] >= 30, "Intervals less than 30 are invalid"
assert len(config['id']) > 0, "Alert ID is empty, this is a dead config"
if config.get('query_type') == 'prometheus':
assert type(config['query']) is str, "Invalid Prometheus query"
assert "$" not in config['query'], "Prometheus query should not contain variables"
else:
assert type(config['query']) is dict, "Kairosdb Query string cannot be validated as proper JSON"
defined_tags = set(config['query']['metrics'][0]['tags'].keys()).union({'','dc','fqdn'})
# IF THERE IS AGGREGATION WE HAVE TO ADD THESE TAGS
if 'group_by' in config['query']['metrics'][0]:
defined_tags.update(set(config['query']['metrics'][0]['group_by'][0]['tags']))
# for undefined_tag in set(config['tags']).difference(defined_tags):
# print("WARNING! {} tag is not defined on the query. Please make sure it does exist to "\
# "prevent empty results".format(undefined_tag))
# OUR MINIMUM THRESHOLD NEED
assert 'critical_lower_threshold' in config or 'critical_upper_threshold' in config or \
'warning_lower_threshold' in config or 'warning_upper_threshold' in config, \
"Config must have at least one threshold set."
# JUST MAKE SURE YOU ARE NOT DOING SOMETHING STUPID WITH WARNING COMING AFTER CRITICAL
if 'warning_lower_threshold' in config and 'critical_lower_threshold' in config:
assert config['critical_lower_threshold'] < config['warning_lower_threshold'], \
"Lower Critical must be less than Lower Warning"
if 'warning_upper_threshold' in config and 'critical_upper_threshold' in config:
assert config['critical_upper_threshold'] > config['warning_upper_threshold'], \
"Upper Critical must be greater than Upper Warning"
if 'occurrences_threshold' in config:
assert config['occurrences_threshold'] >= 1, \
"Having an occurrences value less than 2 is assumed and pointless to specify"
except Exception as e:
print("Invalid config file: {}\n{}".format(alert, str(e)))
bad_alert_list.append("{}\n{}".format(alert, str(e)))
# WRITE OUT BAD CONFIGS TO THE RESULTS FILE
# with open("./results/test_results.log", "w+") as f:
# for alert in bad_alert_list:
# f.write("Config is bad: {}".format(alert.replace('\n', ' ')))
for alert in bad_alert_list:
print("Config is bad: {}".format(alert.replace('\n', ' ')))
if bad_alert_list:
exit(1)

View File

@@ -0,0 +1,7 @@
from flask import Flask, render_template, request, session
app = Flask(__name__)
app.config['SESSION_TYPE'] = 'filesystem'
app.config['SECRET_KEY'] = 'super secret key'
import webapp.views

View File

@@ -0,0 +1,139 @@
import yaml
import os
import json
import traceback
import sys
from library.logger import AlertLogging
logger = AlertLogging('aom')
logger.start()
def render_config(config):
"""
Reads in the config dict and renders to file. config usually from web interface
Args:
config: The config to use to generate the yaml file
Returns:
boolean string of 0 if successful and the yaml as string, or 1 and the error
"""
try:
# GET THE NAME OF THE FILE FROM THE CONFIG
file_name = ''.join([config['alert_name'], '.yaml'])
logger.debug("Filename: {}".format(file_name))
# THIS SHOULD BE A PARAMETER PASSED IN
file_path = os.path.join('alert_configs', file_name)
logger.debug("Full path: {}".format(file_path))
# SANITIZE THE CONFIG TO A NEW OBJECT
yaml_config = {'alerts': {},
'id': config['alert_name'],
'interval': 30 if int(config['interval']) < 30 else int(config['interval'])}
# SET THE INTERVAL TO lowest value of 30 seconds
# SPLIT THE ALERTS INTO A LIST
if 'vo' in config:
yaml_config['alerts']['vo'] = [x for x in config['vo_list'].split(',') if x]
if 'email' in config:
yaml_config['alerts']['email'] = [x for x in config['email_list'].split(',') if x]
if 'slack' in config:
yaml_config['alerts']['slack'] = [x for x in config['slack_list'].split(',') if x]
# GET THRESHOLDS AS FLOATS
if 'critical_threshold' in config:
if config['critical_threshold'] != "":
yaml_config['critical_upper_threshold'] = float(config['critical_threshold'])
if 'critical_upper_threshold' in config:
if config['critical_upper_threshold'] != "":
yaml_config['critical_upper_threshold'] = float(config['critical_upper_threshold'])
if 'warning_threshold' in config:
yaml_config['warning_upper_threshold'] = float(config['warning_threshold'])
if 'warning_upper_threshold' in config:
yaml_config['warning_upper_threshold'] = float(config['warning_upper_threshold'])
if 'critical_lower_threshold' in config:
if config['critical_lower_threshold'] != "":
yaml_config['critical_lower_threshold'] = float(config['critical_lower_threshold'])
if 'warning_lower_threshold' in config:
yaml_config['warning_lower_threshold'] = float(config['warning_lower_threshold'])
if 'occurrences_threshold' in config:
yaml_config['occurrences_threshold'] = int(config['occurrences_threshold'])
# PARSE THE QUERY OUT INTO A DICT OBJECT
if config['prometheus_query']:
yaml_config['query_type'] = 'prometheus'
yaml_config['prometheus_url'] = config['prometheus_url']
yaml_config['query'] = config['prometheus_query']
yaml_config['start_time'] = config['start_time']
yaml_config['end_time'] = config['end_time']
else:
yaml_config['query_type'] = 'kairosdb'
yaml_config['query'] = json.loads(config['kairosdb_query'])
# GET THE TAGS, COMMA SEPARATED
tags = config['tags'].split(',')
yaml_config['tags'] = [x for x in tags if x]
# GET THE URL
yaml_config['url'] = config['url']
# WRITE TO FILE
yaml_str = yaml.dump(yaml_config, default_flow_style=False, explicit_start=True)
with open(file_path, 'w') as f:
f.write(yaml_str)
return 0, yaml_str
except json.decoder.JSONDecodeError:
return 1, "Query string is not valid json: {}".format(traceback.format_stack())
except Exception as e:
logger.error("Unable to render yaml config file to disk")
_, _, ex_traceback = sys.exc_info()
return 1, render_traceback(e, ex_traceback)
def render_yaml(alert_id):
"""
Reads in a yaml file into the config that the web expects.
Args:
alert_id: then name of the config
Returns:
Dictionary
"""
file_name = ''.join([alert_id, '.yaml'])
file_path = os.path.join('alert_configs', file_name)
config = yaml.load(open(file_path, 'r').read())
yaml_config = dict()
yaml_config['alert_name'] = config['id']
yaml_config['interval'] = config['interval']
if 'critical_threshold' in config:
yaml_config['critical_upper_threshold'] = config['critical_threshold']
if 'critical_upper_threshold' in config:
yaml_config['critical_upper_threshold'] = config['critical_upper_threshold']
if 'critical_lower_threshold' in config:
yaml_config['critical_lower_threshold'] = config['critical_lower_threshold']
if 'warning_threshold' in config:
yaml_config['warning_upper_threshold'] = config['warning_threshold']
if 'warning_upper_threshold' in config:
yaml_config['warning_upper_threshold'] = config['warning_upper_threshold']
if 'warning_lower_threshold' in config:
yaml_config['warning_lower_threshold'] = config['warning_lower_threshold']
if 'occurrences_threshold' in config:
yaml_config['occurrences_threshold'] = config['occurrences_threshold']
yaml_config['url'] = config['url']
if 'email' in config['alerts']:
yaml_config['email'] = 'on'
yaml_config['email_list'] = ','.join(config['alerts']['email'])
if 'vo' in config['alerts']:
yaml_config['vo'] = 'on'
yaml_config['vo_list'] = ','.join(config['alerts']['vo'])
if 'slack' in config['alerts']:
yaml_config['slack'] = 'on'
yaml_config['slack_list'] = ','.join(config['alerts']['slack'])
if 'tags' in config:
yaml_config['tags'] = ','.join(config['tags'])
if config.get('query_type') == 'prometheus':
yaml_config['prometheus_query'] = config['query']
yaml_config['prometheus_url'] = config['prometheus_url']
yaml_config['start_time'] = config['start_time']
yaml_config['end_time'] = config['end_time']
else:
yaml_config['kairosdb_query'] = json.dumps(config['query'], sort_keys=True, indent=4, separators=(',', ': '))
return yaml_config
def render_traceback(ex, ex_traceback):
tb_lines = traceback.format_exception(ex.__class__, ex, ex_traceback)
logger.exception("Exception")
return '\n'.join(tb_lines)

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,29 @@
body { font-family: sans-serif; background: #eee; }
a, h1, h2 { color: #377BA8; }
h1, h2 { font-family: 'Georgia', serif; margin: 0; }
h1 { border-bottom: 2px solid #eee; }
h2 { font-size: 1.2em; }
.page { margin: 2em auto; width: 45em; border: 5px solid #ccc;
padding: 0.8em; background: white; }
.entries { list-style: none; margin: 0; padding: 0; }
.entries li { margin: 0.8em 1.2em; }
.entries li h2 { margin-left: -1em; }
.add-entry { font-size: 0.9em; border-bottom: 1px solid #ccc; }
.add-entry dl { font-weight: bold; }
.metanav { text-align: right; font-size: 0.8em; padding: 0.3em;
margin-bottom: 1em; background: #fafafa; }
.flash { background: #CEE5F5; padding: 0.5em;
border: 1px solid #AACBE2; }
.error { background: #F0D6D6; padding: 0.5em; }
/*.button { border-top: 2px solid #a3ceda;
border-left: 2px solid #a3ceda;
border-right: 2px solid #4f6267;
border-bottom: 2px solid #4F6267;
padding: 1px 20px !important;
font-size: 14px !important;
background-color: #CEE5F5;
font-weight: bold;
color: #2d525d; }
*/
.container { width: 500px; clear: both;}

View File

@@ -0,0 +1,28 @@
{% extends "header.html" %}
{% block body %}
<h2>Form Elements</h2><br />
<table>
{% for key, value in query.items() %}
<tr>
<th> {{ key }} </th>
<td> {{ value }} </td>
</tr>
{% endfor %}
</table><br/>
<p>
{{ query.alert_name }}
</p>
<h2>Rendered Config File</h2><br />
<p>{{ file_path }}</p>
<p>
{% for line in file_contents %}
<div>{{ line|safe }}</div>
{% endfor %}
</p>
<br />
<form action="{{ url_for('re_build', alert_id=query.alert_name) }}" id="re_build" method="post">
<p>
<input type="submit" id="submit" class="btn btn-primary" value="Return to Form?">
</p>
</form>
{% endblock %}

View File

@@ -0,0 +1,6 @@
{% extends "header.html" %}
{% block body %}
<h1>Error Rendering config:</h1>
<p>{{ message }}</p>
<p><a href="{{ url_for('index') }}">Return to Creation Page?</a></p>
{% endblock %}

View File

@@ -0,0 +1,67 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-COMPATIBLE" content="IE=edge">
<meta name="viewport" content="width=device-width, intial-scale=1">
<title>Alerting On Metrics Yaml Builder</title>
<link rel=stylesheet type=text/css href="{{ url_for('static', filename='bootstrap.min.css') }}">
<link rel="stylesheet" type=text/css href="{{ url_for('static', filename='style.css') }}">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js"></script>
<script src="{{ url_for('static', filename='bootstrap.min.js') }}"></script>
<script type="text/javascript">
function dynInput(cbox) {
console.log(cbox)
if (cbox.checked) {
var input = document.createElement("input");
input.type = "text";
input.id = cbox.name + "_list";
input.name = cbox.name + "_list";
document.getElementById("insertinputs_" + cbox.name).appendChild(input);
} else {
document.getElementById(cbox.name + "_list").remove();
}
}
function dynEnable(cbox) {
console.log(cbox);
var theId = "#" + cbox.name + "_list";
console.log(theId);
if (cbox.checked){
$(theId)[0].disabled = false;
} else {
$(theId)[0].disabled = true;
}
}
function dynThreshold(cbox) {
var theId = "#" + cbox.name + "_threshold";
if (cbox.checked){
$(theId)[0].disabled = false;
} else {
$(theId)[0].disabled = true;
}
}
function forceLower(strInput){
strInput.value=strInput.value.toLowerCase().replace(" ","_");
}
function forceComma(strInput){
strInput.value=strInput.value.replace(" ",",");
}
function forcePositive(strInput){
if (parseInt(strInput.value) <= 1) {
strInput.value = 2
}
}
</script>
</head>
<body>
<div class=page>
{% block body %}{% endblock %}

View File

@@ -0,0 +1,966 @@
{% extends "header.html" %}
{% block body %}
<form action="{{url_for('index')}}" id="builder" method="post" class="form-horizontal">
<div class="row">
<div class="col-sm-12">
<h3 class="text-center">Alert Meta</h3>
</div>
</div>
<!-- Alert Name -->
<div class="form-group">
<div class="col-sm-4">
<label for="alert_name" class="control-label">Alert Name:</label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#alertidModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="alertidModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="myModalLabel">Alert Name</h4>
</div>
<div class="modal-body">
<p>The alert name acts as both the name of the .yaml file and the id for the alert. The
alert name becomes part of what shows up in the title / subject when an alert is
triggered</p>
<p>Picking an alert name that already exists will overwrite the .yaml configuration file so
be aware of what you choose</p>
<p>The Alert name is also how this alert will show up in VictorOps, Slack and Email
(depending on what options you choose for the alerting).</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<input type="text" id="alert_name" class="form-control" name="alert_name" value="{{ alert_name }}"
onkeyup="return forceLower(this);">
</div>
</div>
<!-- Check Interval -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="interval">Check Interval: </label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#intervalModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="intervalModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="checkInterval">Check Interval</h4>
</div>
<div class="modal-body">
<p>The check interval is how often the check will run the query (in seconds) and measure the
results</p>
<p>Anything less than 30 seconds will automatically be bumped up
to 30 seconds. This is due to the fact that metrics are collected every 30 seconds, so
checking more often than this would just result in the same values returned from the
query
as nothing would have changed yet</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<input type="number" id="interval" class="form-control" name="interval" value="{{ interval }}">
</div>
</div>
<!-- Upper Critical Threshold -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="criticalUpperThreshold">Upper Critical Threshold: </label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#criticalUpperThresholdModal">
info
</button>
<!-- Modal -->
<div class="modal fade" id="criticalUpperThresholdModal" tabindex="-1" role="dialog"
aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="criticalUpperThresholdTitle">Critical Threshold</h4>
</div>
<div class="modal-body">
                                <p>A floating point or integer value. When any result returned by the query exceeds this
                                    number, a critical alert will trigger.</p>
                                <p>Only critical alerts will also trigger email and Slack alerts (if set).</p>
                                <p>Your query should be simplified down to just one or two values per grouping (a start
                                    and end metric). The alerting system looks at every value per grouping and sends an
                                    alert if any of them crosses the threshold.</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-sm-7">
<input type="number" class="form-control" id="criticalUpperThreshold" name="critical_upper_threshold"
value="{{ critical_upper_threshold }}"
step="0.01"
onkeypress="validate(event)">
</div>
</div>
<!-- Lower Critical Threshold -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="criticalLowerThreshold">Lower Critical Threshold: </label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#criticalLowerThresholdModal">
info
</button>
<!-- Modal -->
<div class="modal fade" id="criticalLowerThresholdModal" tabindex="-1" role="dialog"
aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="criticalLowerThresholdTitle">Lower Critical Threshold</h4>
</div>
<div class="modal-body">
                                <p>A floating point or integer value. When any result returned by the query drops below
                                    this number, a critical alert will trigger.</p>
                                <p>Only critical alerts will also trigger email and Slack alerts (if set).</p>
                                <p>Your query should be simplified down to just one or two values per grouping (a start
                                    and end metric). The alerting system looks at every value per grouping and sends an
                                    alert if any of them falls below the threshold.</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-sm-7">
<input type="number" class="form-control" id="lower_criticalThreshold" name="critical_lower_threshold"
value="{{ critical_lower_threshold }}"
step="0.01"
onkeypress="validate(event)">
</div>
</div>
<!-- Upper Warning Threshold -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="warningUpperThreshold">Upper Warning Threshold: </label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#warningUpperThresholdModal">
info
</button>
<!-- Modal -->
<div class="modal fade" id="warningUpperThresholdModal" tabindex="-1" role="dialog"
aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="warningUpperThresholdTitle">Upper Warning Threshold</h4>
</div>
<div class="modal-body">
                                <p>A floating point or integer value. When any result returned by the query exceeds this
                                    number, a warning alert will trigger.</p>
                                <p>Warnings will not trigger email or Slack alerts (even if those are set).</p>
                                <p>Your query should be simplified down to just one or two values per grouping (a start
                                    and end metric). The alerting system looks at every value per grouping and sends an
                                    alert if any of them crosses the threshold.</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<div class="input-group">
<span class="input-group-addon">
{% if warning_upper_threshold %}
{% set warning_upper_checked='checked' %}
{% else %}
{% set warning_upper_disabled='disabled' %}
{% endif %}
<input type="checkbox" name="warning_upper" id="warning_upper" aria-label="..." onclick="dynThreshold(this);" {{
warning_upper_checked }}>
</span>
<input type="number" name="warning_upper_threshold" class="form-control" id="warning_upper_threshold"
value="{{ warning_upper_threshold }}"
aria-label="..." step="0.01" {{ warning_upper_disabled }}>
</div>
</div>
</div>
<!-- Lower Warning Threshold -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="warningLowerThreshold">Lower Warning Threshold: </label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#warningLowerThresholdModal">
info
</button>
<!-- Modal -->
<div class="modal fade" id="warningLowerThresholdModal" tabindex="-1" role="dialog"
aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="warningLowerThresholdTitle">Lower Warning Threshold</h4>
</div>
<div class="modal-body">
                                <p>A floating point or integer value. When any result returned by the query drops below
                                    this number, a warning alert will trigger.</p>
                                <p>Warnings will not trigger email or Slack alerts (even if those are set).</p>
                                <p>Your query should be simplified down to just one or two values per grouping (a start
                                    and end metric). The alerting system looks at every value per grouping and sends an
                                    alert if any of them falls below the threshold.</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<div class="input-group">
<span class="input-group-addon">
{% if warning_lower_threshold %}
{% set warning_lower_checked='checked' %}
{% else %}
{% set warning_lower_disabled='disabled' %}
{% endif %}
<input type="checkbox" name="warning_lower" id="warning_lower" aria-label="..." onclick="dynThreshold(this);" {{
warning_lower_checked }}>
</span>
<input type="number" name="warning_lower_threshold" class="form-control" id="warning_lower_threshold"
value="{{ warning_lower_threshold }}"
aria-label="..." step="0.01" {{ warning_lower_disabled }}>
</div>
</div>
</div>
<!-- Occurrences -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="occurrences_threshold">Frequency: </label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#occurrencesModal">
info
</button>
<!-- Modal -->
<div class="modal fade" id="occurrencesModal" tabindex="-1" role="dialog"
aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="occurrencesTitle">Frequency</h4>
</div>
<div class="modal-body">
                                <p>The occurrences value, when set, determines how many times the check has to exceed
                                    the threshold before an alert is triggered.</p>
                                <p>This is particularly useful for metrics that can be spiky and resolve quickly; using
                                    occurrences means you are only alerted when a spike is no longer a spike but a rate
                                    sustained over a period of time.</p>
                                <p>The comparison happens once every interval, so if your interval is 5 minutes with an
                                    occurrences value of 3, the threshold would have to be exceeded for 15 minutes
                                    before any alerts are sent out.</p>
                                <p>The occurrences value is optional. If it is not enabled, a single query result
                                    exceeding the threshold is enough to trigger alerts, so setting it to 1 and leaving
                                    it disabled behave the same way.</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<div class="input-group">
<span class="input-group-addon">
{% if occurrences_threshold and occurrences_threshold is number and occurrences_threshold > 1 %}
{% set occurrences_checked='checked' %}
{% else %}
{% set occurrences_disabled='disabled' %}
{% endif %}
<input type="checkbox" name="occurrences" id="occurrences" aria-label="..."
onclick="dynThreshold(this);" {{
occurrences_checked }}>
</span>
<input type="number" name="occurrences_threshold" class="form-control" id="occurrences_threshold"
value="{{ occurrences_threshold }}"
aria-label="..." step="1" min="2" {{ occurrences_disabled }} onkeyup="return forcePositive(this);">
</div>
</div>
</div>
<!-- Tags -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="tags">Tags:</label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#tagsModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="tagsModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="tagsTitle">Tags</h4>
</div>
<div class="modal-body">
                                <p>A comma separated list of tags to include in the alert subject.</p>
                                <p>In the event of an alert, the tags are used to look up distinguishing information and
                                    include it as part of the alert.</p>
                                <p>For example, including the dc tag means that when an alert occurs, the alerting
                                    system will look up the dc value from the returned query results and include it in
                                    the alert subject.</p>
                                <p>These are the same tag values used to build KairosDB queries.</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<input type="text" name="tags" id="tags" class="form-control" value="{{ tags }}" ,
onkeyup="return forceComma(this);">
</div>
</div>
<div class="row">
<div class="col-sm-12">
<h3 class="text-center">Notifications</h3>
</div>
</div>
<!-- VictorOps Alerts -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="vo">VictorOps Alert:</label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#voModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="voModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="voTitle">Victor Ops Alert List</h4>
</div>
<div class="modal-body">
                            <p>A comma separated list of VictorOps routing keys.</p>
                            <p>In the event of an alert, the routing keys listed here will receive a VictorOps alert.</p>
                            <p>If the checkbox isn't selected, the values listed here are ignored when the .yaml config
                                is generated.</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<div class="input-group">
<span class="input-group-addon">
{% if vo=="on" %}
{% set vo_checked='checked' %}
{% else %}
{% set vo_disabled='disabled' %}
{% endif %}
<input type="checkbox" name="vo" id="vo" aria-label="..." onclick="dynEnable(this);" {{ vo_checked
}}>
</span>
<input type="text" class="form-control" name="vo_list" id="vo_list" aria-label="..."
value="{{ vo_list }}" onkeyup="return forceComma(this);" {{ vo_disabled }}>
</div>
</div>
</div>
<!-- Email Alerts -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="email">Email Alert:</label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#emailModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="emailModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="emailTitle">Email Alert List</h4>
</div>
<div class="modal-body">
                            <p>A comma separated list of email names to send alerts to.</p>
                            <p>In the event of an alert, the names listed here will receive an email alert.</p>
                            <p>The alerting system appends @qualtrics.com to the names listed here, so there is no need
                                to include the @domain; it's assumed all alerting emails go to a Qualtrics address.
                                The SMTP server can only send to @qualtrics addresses anyway.</p>
                            <p>For example, to email both netops and devops on an alert, enter <b>devops,netops</b> in
                                the text box.</p>
                            <p>If the checkbox isn't selected, the values listed here are ignored when the .yaml config
                                is generated.</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<div class="input-group">
<span class="input-group-addon">
{% if email=="on" %}
{% set email_checked='checked' %}
{% else %}
{% set email_disabled='disabled' %}
{% endif %}
<input type="checkbox" name="email" id="email" aria-label="..." onclick="dynEnable(this);" {{
email_checked }}>
</span>
<input type="text" name="email_list" class="form-control" id="email_list"
value="{{ email_list }}"
aria-label="..." onkeyup="return forceComma(this);" {{ email_disabled }}>
</div>
</div>
</div>
<!-- Slack Alert List -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="slack">Slack Alert:</label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#slackModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="slackModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="slackTitle">Slack Alert List</h4>
</div>
<div class="modal-body">
                            <p>A comma separated list of Slack names to send alerts to.</p>
                            <p>In the event of an alert, the names listed here will receive a Slack alert from a
                                slackbot.</p>
                            <p>You must include @ for direct message alerts and # for channel alerts.</p>
                            <p>For example, if the DevOps team wanted to get an alert in Slack, the value in the text
                                box would be <b>#devops</b>. To also include a direct message, the value would be
                                <b>#devops,@codyc</b>.</p>
                            <p>Don't troll people by bombing their Slack with your metric alerts; it's unkind.</p>
                            <p>If the checkbox isn't selected, the values listed here are ignored when the .yaml config
                                is generated.</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<div class="input-group">
<span class="input-group-addon">
{% if slack=="on" %}
{% set slack_checked='checked' %}
{% else %}
{% set slack_disabled='disabled' %}
{% endif %}
<input type="checkbox" name="slack" id="slack" aria-label="..." onclick="dynEnable(this);" {{
slack_checked }}>
</span>
<span id="insertinputs_slack"></span>
<input type="text" name="slack_list" class="form-control" id="slack_list"
value="{{ slack_list }}"
aria-label="..." onkeyup="return forceComma(this);" {{ slack_disabled }}>
</div>
</div>
</div>
<div class="row">
<div class="col-sm-12">
<h3 class="text-center">Dashboard</h3>
</div>
</div>
<!-- Dashboard URL -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="query">Dashboard URL:</label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#dashboardModal">
info
</button>
<!-- Modal -->
<div class="modal fade" id="dashboardModal" tabindex="-1" role="dialog"
aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="dashboardTitle">Dashboard URL</h4>
</div>
<div class="modal-body">
                            <p>Most queries are based on a dashboard that already exists in Grafana.</p>
                            <p>By including the URL to that dashboard, the on-call engineer receiving the alert can
                                click the link in the alert and get a better picture of what this alert is and how it
                                relates to the datacenter.</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-sm-7">
<input type="text" name="url" id="url" class="form-control" value="{{ url }}">
</div>
</div>
<div class="row">
<div class="col-sm-12">
<h3 class="text-center">Kairosdb Query</h3>
</div>
</div>
<!-- KairosDB Query -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="query">KariosDB Query:</label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#queryModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="queryModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="queryTitle">KariosDB Query</h4>
</div>
<div class="modal-body">
                            <p>Paste in the KairosDB query that you have already worked out.</p>
                            <p>You can build your query in the <a
                                    href="http://kairosdb-metrics.service.eng.consul:8080/" target="_blank">KairosDB UI
                                in eng</a>.</p>
                            <p>When building your query you will want to get the return values down to just 1 or 2
                                results per grouping. This can be done by sending the query through the MAX or MIN
                                aggregator (depending on your logic needs) as the last aggregator in the query.</p>
                            <p>You will also want to include a time offset; typically the start is 5 minutes back
                                (as in from 5 minutes ago to now), and setting the MAX aggregator to the same window is
                                usual.</p>
                            <p>Once your query is returning the results you expect, click the <b>Show Query</b> button
                                in the KairosDB UI and copy the result into this field.</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-sm-7">
<textarea name="kairosdb_query" id="kairosdb_query" class="form-control" rows="12" cols="50">{{ kairosdb_query }}</textarea>
</div>
</div>
<div class="row">
<div class="col-sm-12">
<h3 class="text-center">Prometheus Query</h3>
</div>
</div>
<!-- Prometheus URL -->
<div class="form-group">
<div class="col-sm-4">
<label for="prometheus_url" class="control-label">Prometheus URL:</label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#prometheusurlModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="prometheusurlModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="myModalLabel">Prometheus URL</h4>
</div>
<div class="modal-body">
                            <p>The URL of the Prometheus server to query.</p>
<p>Shared, production Prometheus URLs are currently:
<ul>
<li>http://big-trickster.service.eng.consul:9090</li>
</ul>
</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<input type="text" id="prometheus_url" class="form-control" name="prometheus_url" value="{{ prometheus_url }}"
onkeyup="return forceLower(this);">
</div>
</div>
<!-- Prometheus Query -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="query">Prometheus Query:</label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#prometheusQueryModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="prometheusQueryModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="queryTitle">Prometheus Query</h4>
</div>
<div class="modal-body">
                            <p>Paste in the Prometheus query that you have already worked out.</p>
                            <p>You can build your query in the UI of your Prometheus endpoint. Eng Vis plans to add a
                                smart router in the future so all instances are exposed via a single smart proxy, but
                                for now you'll need to know the name.</p>
                            <p><a href="http://big-trickster.service.eng.consul:9090/graph" target="_blank">Prometheus
                                Host Metrics UI in eng</a></p>
                            <p><a href="http://big-trickster.service.eng.consul:9090/graph" target="_blank">Prometheus
                                StatsD and other Metrics UI in eng</a></p>
                            <p>When creating a query, keep in mind that a single returned value is the most useful, so
                                something like "topk(1, yourmetrics)" is a good choice. If your query has multiple
                                return values, AOM will use the last value.</p>
                            <p>For example, with a step/duration of 60 and a timespan of 300 between start and end you
                                will get back 5 values and the last one will be used.</p>
<p><a href="https://prometheus.io/docs/prometheus/latest/querying/functions/" target="_blank">Prometheus Functions</a></p>
<p>
<a href="https://prometheus.io/docs/prometheus/latest/querying/operators/" target="_blank">Prometheus Operators</a>
</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-sm-7">
<textarea name="prometheus_query" id="prometheus_query" class="form-control" rows="12" cols="50">{{ prometheus_query }}</textarea>
</div>
</div>
<!-- Start Time -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="start_time">Start Time: </label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#startTimeModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="startTimeModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="startTime">Start Time</h4>
</div>
<div class="modal-body">
                            <p>This should be a relative time in seconds, like '-600' for 10 minutes; it defaults to '-300'.</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<input type="text" id="start_time" class="form-control" name="start_time" value="{{ start_time }}">
</div>
</div>
<!-- End Time -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="end_time">End Time: </label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#endTimeModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="endTimeModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="endTime">End Time</h4>
</div>
<div class="modal-body">
<p>This can be 'now' (default) or some relative offset like '-30' in seconds</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<input type="text" id="end_time" class="form-control" name="end_time" value="{{ end_time }}">
</div>
</div>
<div class="row">
<div class="col-sm-12">
<h3 class="text-center">Actions</h3>
</div>
</div>
<!-- Load Config File -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="loadFile">Load Config From File:</label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#loadModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="loadModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="loadTitle">Load Config from file</h4>
</div>
<div class="modal-body">
                            <p>Load a config that has already been generated to file back into the UI.</p>
                            <p>This is handy when you need to make minor changes to a query, add additional alerting
                                values or change thresholds. Or if you are just terrified of yaml.</p>
                            <p>Hit the drop down to see a list of all alert configs (the names generated from the
                                values used in the Alert Name field), hit Go, and the config will load into all the
                                fields.</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<div class="input-group">
<select name="loadFile" id="loadFile" class="form-control">
<option value="" selected></option>
{% for f in alert_list %}
<option value="{{ f }}">{{ f }}</option>
{% endfor %}
</select>
<span class="input-group-btn">
<input type="submit" name="generate" id="submitFiles" class="btn btn-primary" value="Go">
</span>
</div>
</div>
</div>
<!-- Submit Form -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="submit">Generate YAML:</label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#generateModal">
info
</button>
<!-- Modal -->
<div class="modal fade" id="generateModal" tabindex="-1" role="dialog"
aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="generateTitle">Generate Alert Config</h4>
</div>
<div class="modal-body">
                            <p>When you are ready to take the values in the form and generate an alert config .yaml
                                file, hit the button.</p>
                            <p>This will generate a .yaml file named after the alert name. For example, an alert name
                                of <b>mcp_errors_per_dc</b> produces the file <b>mcp_errors_per_dc.yaml</b>.</p>
                            <p>This <b>will</b> overwrite a .yaml file if the alert name matches an already existing
                                file.</p>
                            <p>If there are any errors generating the config, the resulting page will include the error
                                message and let you return to this page with your form values preserved.</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<input type="submit" id="submit" name='generate' class='btn btn-primary' value="generate"
class="button">
</div>
</div>
</form>
{% endblock %}

View File

@@ -0,0 +1,4 @@
{% extends "header.html" %}
{% block body %}
<h2>Complete all values in the form below</h2>
{% endblock %}

View File

@@ -0,0 +1,69 @@
# views.py
import glob
import json
import os
import yaml
from flask import session
from library.logger import AlertLogging
from webapp import app, render_template, request, render
logger = AlertLogging('aom')
logger.start()
logger.start_log_file("logs/aom_service.log")
@app.route('/', methods=['GET', 'POST'])
def index():
logger.debug("Request Method: {}".format(request.method))
if request.method == 'GET':
# GET BLOB OF FILES
service_config = yaml.safe_load(open('service.yaml', 'r').read())
alert_list = sorted([os.path.splitext(os.path.basename(x))[0] for x in
glob.glob(service_config['alert_folder'] + "/*.yaml")])
if 'yaml_config' in session:
return render_template('index.html', **json.loads(session['yaml_config']), alert_list=alert_list)
else:
return render_template('index.html', alert_list=alert_list)
elif request.method == 'POST':
logger.info("Got a form")
if 'go' in request.form['generate'].lower():
return re_build(request.form['loadFile'])
yaml_config = dict()
ret = ''
try:
for field_name, value in request.form.items():
yaml_config[field_name] = value
code, ret = render.render_config(yaml_config)
assert code == 0
return render_template('debug.html', query=yaml_config,
file_path='alert_configs/{}.yaml'.format(yaml_config['alert_name']),
file_contents=ret.split('\n'))
except AssertionError:
session['yaml_config'] = json.dumps(yaml_config)
return render_template('error.html', message="Failed to render to file: {}".format(ret))
except Exception as e:
return render_template('error.html', message=str(e))
@app.route('/build/<alert_id>', methods=['POST'])
def re_build(alert_id):
# READ IN CONFIG FROM ID
config = render.render_yaml(alert_id)
service_config = yaml.safe_load(open('service.yaml', 'r').read())
alert_list = sorted([os.path.splitext(os.path.basename(x))[0] for x in
glob.glob(service_config['alert_folder'] + "/*.yaml")])
return render_template('index.html', **config, alert_list=alert_list)
@app.route("/debug/")
def toggle_debug():
if logger.debug_handler:
logger.stop_debug()
logger.info("Debug Stopped")
else:
logger.start_debug()
logger.debug("Debug Started")
return index()

View File

@@ -0,0 +1,3 @@
requests
pyaml
flask

8
AoM_Service/README.md Executable file
View File

@@ -0,0 +1,8 @@
# IMPORTANT NOTICE:
Alert configurations have been moved to
[AlertOnMetrics](https://gitlab-app.eng.qops.net/engvis/AlertOnMetricsConfigs).
This will allow more flexibility for the project. Merge requests will
be automatically validated, merged and deployed if they pass the
validation stage.

View File

@@ -0,0 +1,20 @@
---
id: sleeper_agents_milleniumfalcon_engine_failing
service: core
alerts:
slack:
- '#breel_testing_alerts'
vo:
- gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
suppressed_occurrences_threshold: 24
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_engine_failing) by (dc)
tags:
- dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
service_dependencies: ['fuel']

View File

@@ -0,0 +1,18 @@
---
id: sleeper_agents_milleniumfalcon_fuellevel_low
service: fuel
alerts:
slack:
- '#breel_testing_alerts'
vo:
- gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_fuellevel_low) by (dc)
tags:
- dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1

View File

@@ -0,0 +1,20 @@
---
id: sleeper_agents_milleniumfalcon_lightspeed_unavailable
service: captain
alerts:
slack:
- '#breel_testing_alerts'
vo:
- gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
suppressed_occurrences_threshold: 48
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_lightspeed_unavailable) by (dc)
tags:
- dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
service_dependencies: ['core']

View File

@@ -0,0 +1,20 @@
---
id: sleeper_agents_milleniumfalcon_shields_unavailable
service: core
alerts:
slack:
- '#breel_testing_alerts'
vo:
- gobs-mm
critical_upper_threshold: 1.0
interval: 5
suppressed_occurrences_threshold: 54
start_time: '-60'
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_shields_unavailable) by (dc)
tags:
- dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
service_dependencies: ['fuel']

81
AoM_Service/aom_service.py Executable file
View File

@@ -0,0 +1,81 @@
#!/usr/bin/python3
""" Alert On Metrics Project"""
import logging
import multiprocessing
import json
import base64
import os
import subprocess
from time import time, sleep
import requests
import yaml
from sanic import Sanic, response
from library.args import get_service_args
from library.config import glob_the_configs
from library.logger import AlertLogging
from library.service import Service
LOG = AlertLogging('aom')
LOG.start()
LOG.start_log_file("logs/aom_service.log")
APP = Sanic()
SERVICE_JOB = multiprocessing.Value('i', 0)
NUM_JOBS = multiprocessing.Value('i', 0)
LEADERSHIP = multiprocessing.Value('i', 0)
LEADER_STATUS = None
LEADER_TIME = None
CONSUL_URL = None
LEADER_OVERRIDE = None
HOSTNAME = None
SERVICE_CONFIG = None
@APP.route("/")
async def index(_):
"""
Return total number of jobs
"""
global NUM_JOBS
return response.json({"job_count": NUM_JOBS.value})
@APP.route('/healthcheck')
async def health(request):
"""
Healthcheck endpoint so that Consul and friends see this as a healthy service
Returns:
json object of status: ok
"""
LOG.debug("healthcheck")
service_process = multiprocessing.Process(target=start_service, \
args=(LOG, SERVICE_CONFIG['alert_reload_interval']), \
name="service", daemon=False)
# TRY TO START SERVICE, IF LEADER AND NOT RUNNING
if SERVICE_JOB.value == 0:
LOG.info("Starting alerts background job")
SERVICE_JOB.value += 1
service_process.start()
return response.json({"status": "ok"}, 200)
def start_service(log, reload_interval):
s = Service(log, reload_interval, HOSTNAME, SERVICE_CONFIG)
s.start()
if __name__ == "__main__":
# GET ARGS AND START LOGGING
ARGS = get_service_args()
logging.setLoggerClass(AlertLogging)
LOG.info("Starting Service")
# GET SERVICE CONFIG
LEADER_OVERRIDE = ARGS['override']
HOSTNAME = ARGS['hostname']
SERVICE_CONFIG = yaml.safe_load(open('service.yaml', 'r').read())
if ARGS['alert_configs'] is not None:
SERVICE_CONFIG['alert_folder'] = ARGS['alert_configs']
if ARGS['alert_routing_lookup'] is not None:
SERVICE_CONFIG['alert_routing_lookup'] = ARGS['alert_routing_lookup']
# SET CONSUL URL FOR LEADER CHECK
CONSUL_URL = SERVICE_CONFIG['consul_url']
# START THE MAIN SERVICE
APP.run(host="0.0.0.0", port=ARGS['port'])

121
AoM_Service/aom_test.py Executable file
View File

@@ -0,0 +1,121 @@
import json
import time
import requests
import yaml
service_config = yaml.safe_load(open('service.yaml', 'r').read())
kairos_url = service_config['kairosdb_url'] + "api/v1/datapoints/"
kairos_query = kairos_url + "query"
metrics_list = []
status1 = "RECOVERY"
status2 = "WARNING"
status3 = "CRITICAL"
json_string1 = """{"name": "aom_test_metric","datapoints": """
json_string2 = ""","tags": {"host": "aom_host","data_center": "AOM"},"ttl": 500}"""
# WRITE ALERT CONFIG FILE
alert_file = {'alerts': {'sensu': {'slack': 'aom_test_channel'}},
'critical_lower_threshold': 100,
'critical_upper_threshold': 5000,
'id': 'test_metric',
'interval': 30,
'occurrences_threshold': 1,
'query': {'cache_time': 0,
'end_relative': {'unit': 'seconds', 'value': '30'},
'metrics': [{'name': 'aom_test_metric', 'tags': {}}],
'start_relative': {'unit': 'seconds', 'value': '60'}},
'tags': {},
'url': 'AOM_TESTING',
'warning_lower_threshold': 1000,
'warning_upper_threshold': 2000}
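# The datapoints pushed below deliberately cross the thresholds above: values below
# 100 or above 5000 should register as CRITICAL, values between 100-1000 or
# 2000-5000 as WARNING, and values inside the 1000-2000 band as RECOVERY.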
query_intro = """{
"metrics": [
{
"tags": {
"alert": [
"test_metric"
]
},
"name": "telegraf.aom_"""
query_outro = """_value",
"aggregators": [
{
"name": "sum",
"align_sampling": true,
"sampling": {
"value": "9",
"unit": "minutes"
},
"align_start_time": false
}
]
}
],
"cache_time": 0,
"start_relative": {
"value": "8",
"unit": "minutes"
}
}"""
def main():
# noinspection PyBroadException
try:
with open('alert_configs/test.yaml', 'w') as yaml_file:
yaml.dump(alert_file, yaml_file, default_flow_style=False)
except Exception:
print("Error writing alert config file")
return False
now = int(time.time() * 1000)
metrics_list.append([now, 1501])
now += 32000
metrics_list.append([now, 202])
now += 32000
metrics_list.append([now, 23])
now += 32000
metrics_list.append([now, 1504])
now += 32000
metrics_list.append([now, 2005])
now += 32000
metrics_list.append([now, 5006])
now += 32000
metrics_list.append([now, 1507])
full_string = json_string1 + str(metrics_list) + json_string2
try:
ret = requests.post(kairos_url, data=json.dumps(json.loads(full_string)), timeout=200)
assert ret.status_code == 204, "Wrong status code received from KairosDB"
except AssertionError as e:
print("Error: {}".format(str(e)))
except Exception as e:
print("Problem talking to KairosDB: {}".format(str(e)))
return False
print("Metrics sent to KairosDB. Check alerts in the #aom_test_channel in Slack")
time.sleep(360)
try:
ret = requests.post(kairos_query, data=json.dumps(json.loads(query_intro + status1 + query_outro)), timeout=200)
print("Recovery {}".format(dict(ret.json())['queries'][0]['results'][0]['values'][0][1]))
assert dict(ret.json())['queries'][0]['results'][0]['values'][0][1] == 2, "Wrong RECOVERY result"
ret = requests.post(kairos_query, data=json.dumps(json.loads(query_intro + status2 + query_outro)), timeout=200)
print("Warning {}".format(dict(ret.json())['queries'][0]['results'][0]['values'][0][1]))
assert dict(ret.json())['queries'][0]['results'][0]['values'][0][1] == 2, "Wrong WARNING result"
ret = requests.post(kairos_query, data=json.dumps(json.loads(query_intro + status3 + query_outro)), timeout=200)
print("Critical {}".format(dict(ret.json())['queries'][0]['results'][0]['values'][0][1]))
assert dict(ret.json())['queries'][0]['results'][0]['values'][0][1] == 4, "Wrong CRITICAL result"
except AssertionError as e:
print("Error: {}".format(str(e)))
except Exception as e:
print("Problem getting results from KairosDB: {}".format(str(e)))
return False
return True
if __name__ == '__main__':
main()

View File

View File

@@ -0,0 +1,66 @@
class Alert_Config():
def __init__(self, yaml_config) :
if not 'alert_tags' in yaml_config :
yaml_config['alert_tags'] = {}
self.id = str(yaml_config['id'])
self.yaml_config = yaml_config
self.tags = {}
self.state = {}
def type(self) :
if 'type' in self.yaml_config :
return self.yaml_config['type']
return 'kairos'
def get_tags(self) :
if 'tags' in self.yaml_config :
return self.yaml_config['tags']
return []
def occurrences(self) :
if 'occurrences_threshold' in self.yaml_config :
return self.yaml_config['occurrences_threshold']
return 1
def url(self) :
if 'url' in self.yaml_config :
return self.yaml_config['url']
from os import environ
return environ['AOM_GRAFANA_URL'] + self.id
def get_level(self, key) :
if not key in self.state :
self.state[key] = None
return self.state[key]
def set_level(self, key, value) :
self.state[key] = value
def get_for_tags(self, key) :
if not key in self.tags :
self.tags[key] = 0
return self.tags[key]
def set_for_tags(self, key, value) :
if not key in self.tags :
self.tags[key] = 0
self.tags[key] = value
def init_for_tags(self, key) :
for k in [key, key+"_count"] :
if not k in self.tags :
self.set_for_tags(k, 0)
self.set_for_tags(key+"_noresult", 0)
def get_threshold(self, isUpper, isWarning) :
if isUpper and isWarning :
return self.try_get_yaml_config('warning_upper_threshold')
if isUpper and not isWarning :
return self.try_get_yaml_config('critical_upper_threshold')
elif not isUpper and isWarning :
return self.try_get_yaml_config('warning_lower_threshold')
elif not isUpper and not isWarning :
return self.try_get_yaml_config('critical_lower_threshold')
def try_get_yaml_config(self, key) :
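# Returns a (value, found) pair: the configured value (or None) and whether the
# key exists in the yaml config at all.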
return self.yaml_config[key] if key in self.yaml_config else None, key in self.yaml_config
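def __eq__(self, other) :
# A minimal value-equality addition (assumption: Alert_Config_List.compare()
# should treat configs with identical yaml as unchanged so their jobs are not
# needlessly restarted on every reload)
return isinstance(other, Alert_Config) and self.yaml_config == other.yaml_config
def __ne__(self, other) :
return not self.__eq__(other)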

View File

@@ -0,0 +1,36 @@
from alert_config import Alert_Config
class Alert_Config_List() :
def __init__(self, alert_configs=None) :
self.hash = {}
if alert_configs :
self.add(alert_configs)
def __getitem__(self, k) :
return self.hash[k]
def __len__(self) :
return len(self.hash)
def add(self, alert_config) :
if isinstance(alert_config, Alert_Config):
self.hash[alert_config.id] = alert_config
elif isinstance(alert_config, list) :
for a in alert_config :
self.add(a)
elif isinstance(alert_config, Alert_Config_List) :
for k in alert_config.hash :
self.add(alert_config.hash[k])
else :
raise Exception("unexpected type added to Alert_Config_List")
def compare(self, other) :
if not other :
other = Alert_Config_List()
self_keys = self.hash.keys()
other_keys = other.hash.keys()
added = other_keys - self_keys
removed = self_keys - other_keys
intersection = [i for i in self_keys if i in other_keys]
modified = [ i for i in intersection if self[i] != other[i] ]
return set(added), set(removed), set(modified)
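# A minimal usage sketch (not part of the service): build two lists from bare
# yaml dicts and diff them; only the 'id' key is required by Alert_Config here.
if __name__ == "__main__":
    old = Alert_Config_List([Alert_Config({'id': 'a'}), Alert_Config({'id': 'b'})])
    new = Alert_Config_List([Alert_Config({'id': 'b'}), Alert_Config({'id': 'c'})])
    added, removed, modified = old.compare(new)
    # Expect added == {'c'} and removed == {'a'}; 'b' shows up in modified
    # unless Alert_Config defines value equality.
    print(added, removed, modified)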

163
AoM_Service/library/args.py Executable file
View File

@@ -0,0 +1,163 @@
"""Contains the arg parser options."""
import argparse
import sys
def get_builder_args():
"""
Gets the arguments passed in to the aom_builder main call
:return: parser object
"""
parser = argparse.ArgumentParser(
description="Generates a valid yaml file "
"for alerting on metrics. If you are "
"familiar with the yaml structure for an "
"alert you don't have to use this builder,"
" it's just convenient")
parser.add_argument('-q', '--query', help="The Kariosdb query string to "
"use")
parser.add_argument(
'-i', '--interval', type=int, default=60, help="The "
"interval that the check will This value is in seconds")
parser.add_argument('-t', '--threshold', '--upperthreshold', help="The "
"upper threshold is the value that when reached will "
"cause an depending on the threshold logic. "
"Use in conjunction with lower threshold to define a "
"normal band.")
parser.add_argument(
'-b',
'--lowerthreshold',
help="The lower threshold is the value that when reached will cause an "
"alert depending on the threshold logic"
"Use in conjunction with upper threshold to define a normal band.")
parser.add_argument(
'-m',
'--measure',
choices=[
'gt',
'lt',
'eq'],
help="The measure to use to compare the "
"threshold to the values of the alerts")
parser.add_argument(
'-a',
'--alert_config',
help='A valid Yaml representation of your alerting block')
parser.add_argument(
'-l',
'--log_level',
type=int,
default=0,
help="The log level for the aom_builder run. "
"[0=Error, 1=Info, 2=Debug]")
parser.add_argument(
'-p',
'--port',
type=int,
default=8080,
help="The port to run the webapp on")
return args_to_dict(parser)
def get_tester_service_args():
"""
Gets arguments passed into aom_tester.py
Returns: parser object
"""
parser = argparse.ArgumentParser(
description="Parameters to start the alerting on metrics dummy tester "
"service")
parser.add_argument(
'-l',
'--log_level',
type=int,
default=0,
help="The log level for the aom_service app"
"[0=Error, 1=Info, 2=Debug]")
parser.add_argument(
'-a',
'--alert_configs',
default=None,
help="If provided will override the folder location read from the "
"config with the value passed in. Is helpful for testing and "
"troubleshooting alerts")
parser.add_argument(
'--hostname',
help="If provided, will override the actual hostname check with this "
"value")
parser.add_argument(
'-p',
'--port',
type=int,
default=8080,
help="The port to run the webapp on")
return args_to_dict(parser)
def get_service_args():
"""
Gets arguments passed into aom_service.py
Returns: parser object
"""
parser = argparse.ArgumentParser(
description="Parameters to start the alerting on metrics service")
parser.add_argument(
'-l',
'--log_level',
type=int,
default=0,
help="The log level for the aom_service app"
"[0=Error, 1=Info, 2=Debug]")
parser.add_argument(
'-a',
'--alert_configs',
default=None,
help="If provided will override the folder location read from the "
"config with the value passed in. Is helpful for testing and "
"troubleshooting alerts")
parser.add_argument(
'--alert_routing_lookup',
default=None,
help="If provided will override the folder used to fetch the alerts "
"lookup configuration.")
parser.add_argument(
'-o',
'--override',
action='store_true',
help="Overrides the check leader election value")
parser.add_argument(
'--hostname',
help="If provided, will override the actual hostname check with this "
"value")
parser.add_argument(
'-p',
'--port',
type=int,
default=8080,
help="The port to run the webapp on")
return args_to_dict(parser)
def args_to_dict(parsed_args):
"""
Converts the argument parser object to a dict
Args:
parsed_args: Arg parser object
Returns:
Dictionary of arguments
"""
try:
arg_list = parsed_args.parse_args()
# RETURN A DICT OF ARGUMENTS
arg_dict = dict()
for val in vars(arg_list):
arg_dict[val] = getattr(arg_list, val)
return arg_dict
except argparse.ArgumentError:
parsed_args.print_help()
sys.exit(1)

226
AoM_Service/library/config.py Executable file
View File

@@ -0,0 +1,226 @@
# config.py
"""Functions for loading alert configuration files"""
import glob
import os
import json
import hashlib
import yaml
import requests
from serviceapp import service
# import logging
# logger = logging.getLogger(__name__)
def md5(fname):
"""Calculates md5 hash of a filename"""
hash_md5 = hashlib.md5()
with open(fname, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()
def get_healthy_nodes_and_index(consul_url, hostname, logger):
"""Find AOM healthy nodes on consult"""
host_index = -1
healthy_nodes = []
try:
# getting all registered nodes from consul
r = requests.get(
consul_url +
'/v1/catalog/service/alert-on-metrics',
timeout=60)
assert r.status_code == 200, "Failed to get back a 200 from consul catalog"
value = json.loads(r.text)
node_list = []
host_index = -1
for elem in value:
node_list.append(elem.get('Node'))
# Retrieving healthy nodes
healthy_nodes = []
for node in node_list:
r2 = requests.get(
consul_url +
'/v1/health/node/' +
node,
timeout=60)
assert r.status_code == 200, "Failed to get back a 200 from consul health"
healthcheck_list = json.loads(r2.text)
for check in healthcheck_list:
if (check.get('CheckID') == 'check_healthcheck_alert-on-metrics_alert-on-metrics' and
check.get('Status') == 'passing'):
healthy_nodes.append(node)
try:
healthy_nodes.sort()
host_index = healthy_nodes.index(hostname)
except ValueError:
logger.error("Host is not healthy")
except (TimeoutError, requests.exceptions.RequestException):
logger.error("Timed out or failed connecting to Consul")
return host_index, len(healthy_nodes)
def distribute_configs(
filename,
host_index,
module,
logger):
"""Uses md5 of alert config to split the files among healthy servers"""
if module == 0:
logger.error("No healthy nodes for the service")
return False
if host_index == -1:
logger.error("Host is unhealthy")
return False
if int(md5(filename), 16) % module == host_index:
return True
return False
def is_valid(alert_config, logger):
"""Checks if alert has all required fields"""
try:
assert alert_config['alerts'], "No Alerts configured, this is a dead config"
assert alert_config['query'], "No Query, this is a dead config"
assert alert_config['interval'] >= 30, "Intervals less than 30 are invalid"
assert alert_config['id'], "Alert ID is empty, this is a dead config"
if alert_config.get('query_type') == 'prometheus':
assert isinstance(
alert_config['query'], str), "Invalid Prometheus query"
else:
assert isinstance(
alert_config['query'], dict), "Kairosdb Query string cannot be validated as proper JSON"
defined_tags = set(alert_config['query']['metrics'][0]['tags'].keys()).union(
{'', 'dc', 'fqdn'})
# IF THERE IS AGGREGATION WE HAVE TO ADD THESE TAGS
if 'group_by' in alert_config['query']['metrics'][0]:
defined_tags.update(
set(alert_config['query']['metrics'][0]['group_by'][0]['tags']))
# for undefined_tag in set(alert_config['tags']).difference(defined_tags):
# print("WARNING! {} tag is not defined on the query. Please make sure it does exist to "\
# "prevent empty results".format(undefined_tag))
# OUR MINIMUM THRESHOLD NEED
assert 'critical_lower_threshold' in alert_config or 'critical_upper_threshold' in alert_config or \
'warning_lower_threshold' in alert_config or 'warning_upper_threshold' in alert_config, \
"Config must have at least one threshold set."
# JUST MAKE SURE YOU ARE NOT DOING SOMETHING STUPID WITH WARNING COMING
# AFTER CRITICAL
if 'warning_lower_threshold' in alert_config and 'critical_lower_threshold' in alert_config:
assert alert_config['critical_lower_threshold'] < alert_config['warning_lower_threshold'], \
"Lower Critical must be less than Lower Warning"
if 'warning_upper_threshold' in alert_config and 'critical_upper_threshold' in alert_config:
assert alert_config['critical_upper_threshold'] > alert_config['warning_upper_threshold'], \
"Upper Critical must be greater than Upper Warning"
if 'lookup' in alert_config['alerts']:
assert 'default' in alert_config['alerts']['lookup'], 'No default alert configured for the lookup configuration'
assert 'lookup_file' in alert_config['alerts']['lookup'] or 'lookups' in alert_config['alerts'][
'lookup'], 'No lookup configured either in the alert configuration or in a separated file'
assert 'tags' in alert_config['alerts']['lookup'], 'No tags configured for the lookup configuration'
assert all(
isinstance(
tag, str) for tag in alert_config['alerts']['lookup']['tags']), 'Tags must be valid string'
# if 'occurrences_threshold' in alert_config:
# assert alert_config['occurrences_threshold'] >= 1, \
# "Having an occurrences value less than 2 is assumed and pointless to specify"
except Exception as e:
logger.warning("Invalid config file: {}".format(str(e)))
return False
return True
def is_valid_alert_routing_lookup(alert_routing_lookup, alert, logger):
"""Check if routing lookup is properly configured"""
try:
assert alert_routing_lookup, "No lookup values configured, the configuration is empty."
for alert_routing in alert_routing_lookup:
assert 'alert' in alert_routing, "No alert defined for this configuration."
assert 'tags' in alert_routing, "No tags value defined for this configuration."
for tag in alert_routing['tags']:
assert tag in alert['alerts']['lookup']['tags'], "The tag {} is not part of the configuration".format(
tag)
assert all(isinstance(tag, str)
for tag in alert_routing['tags']), "Tags must be valid string"
except AssertionError as e:
logger.warning("Invalid alert routing config file: {}".format(str(e)))
return False
return True
# noinspection PyBroadException
def glob_the_configs(
config_path,
lookup_config_path,
consul_url,
hostname,
logger):
"""
Args:
config_path (string): relative path to the alert configs
lookup_config_path (string): relative path to the alert routing lookup configs
consul_url (string): url to the consul service
hostname (string): hostname of this node, used to shard configs across nodes
logger: logger instance
Returns:
List of valid alert configs assigned to this node
"""
invalid_configs = 0
alert_list = []
host_index, module = get_healthy_nodes_and_index(
consul_url, hostname, logger)
for config_file in glob.glob(config_path + "/**/*.yaml", recursive=True):
logger.debug("Found {} config".format(config_file))
# LOAD CONFIG
if distribute_configs(
config_file,
host_index,
module,
logger):
try:
alert = yaml.safe_load(open(config_file, 'rb').read())
if is_valid(alert, logger):
if 'lookup' in alert['alerts']:
alert_routing_lookup = []
is_valid_lookup = True
if 'lookup_file' in alert['alerts']['lookup']:
lookup_path = "{}/{}".format(
lookup_config_path, alert['alerts']['lookup']['lookup_file'])
if os.path.isfile(lookup_path):
alert_routing_lookup = yaml.safe_load(
open(lookup_path, 'rb').read())
else:
is_valid_lookup = False
else:
alert_routing_lookup = alert['alerts']['lookup']['lookups']
is_valid_lookup = is_valid_lookup and is_valid_alert_routing_lookup(
alert_routing_lookup, alert, logger)
if is_valid_lookup:
alerts_per_tags = {}
for alert_configuration in alert_routing_lookup:
key = []
for tag in alert['alerts']['lookup']['tags']:
key.append(
alert_configuration['tags'].get(tag))
alerts_per_tags[tuple(
key)] = alert_configuration['alert']
alert['alert_routing_lookup'] = alerts_per_tags
else:
invalid_configs += 1
continue
alert_list.append(alert)
else:
invalid_configs += 1
except BaseException as e:
logger.error("Error parsing {} config: {}".format(config_file, e))
logger.info("Invalid configs: {}".format(invalid_configs))
service.send_stat(
'invalid_configs',
invalid_configs,
dict(),
statprefix='aom')
logger.info("Loaded {} configs".format(len(alert_list)))
return alert_list

10
AoM_Service/library/job.py Executable file
View File

@@ -0,0 +1,10 @@
import subprocess
class Job() :
def __init__(self, id, p):
self.id = id
self.p = p
def kill(self) :
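# The check processes are stopped hard with SIGKILL and then joined to reap the child.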
subprocess.call(["/bin/kill", "-9", "{}".format(self.p.pid)])
self.p.join()

29
AoM_Service/library/job_list.py Executable file
View File

@@ -0,0 +1,29 @@
from job import Job
class Job_List() :
def __init__(self) :
self.jobs = {}
def __getitem__(self, k) :
return self.jobs[k]
def __setitem__(self, k, v) :
self.jobs[k] = v
def __len__(self) :
return len(self.jobs)
def add(self, job) :
if isinstance(job, Job) :
self[job.id] = job
elif isinstance(job, Job_List) :
for j in job.jobs :
self.add(job[j])
else :
raise Exception("unexpected type added to Job_List")
def kill(self, id) :
if not id in self.jobs :
return
self[id].kill()
del(self.jobs[id])

122
AoM_Service/library/logger.py Executable file
View File

@@ -0,0 +1,122 @@
# logger.py
""" Logging configuration """
import logging
import logging.handlers
import os
logging.getLogger('requests').setLevel(logging.ERROR)
logging.getLogger('urllib3').setLevel(logging.ERROR)
logging.getLogger('werkzeug').setLevel(logging.ERROR)
class SingleLevelFilter(logging.Filter):
def __init__(self, passlevel, reject):
"""
Initializer (constructor) of the SingleLevelFilter
@param passlevel (int) - the int value of the level of the log
@param reject (bool) - if true will return if the record level is
not equal to the passlevel
@return SingleLevelFilter object
@note Sets some object parameters
"""
self.passlevel = passlevel
self.reject = reject
def filter(self, record):
"""
Returns True/False depending on parameters
@param record (Log int) - the record that the filter belongs to
@return bool - True/False depending on what self.reject is set to and
what record.levelno and self.passlevel are set to
@note This causes either only logging of the exact same level to get
logged, or only logging other than the same level to get logged
"""
if self.reject:
return record.levelno != self.passlevel
return record.levelno == self.passlevel
class AlertLogging(logging.Logger):
"""
Class Object to handle the logging of the alert on metrics service
starts at Error level and can flip on (and add) an additional log file and
Debug logger as needed.
"""
def __init__(self, name):
"""
Inits the formatters and logger
"""
self.name = name
self.log_handler = None
self.debug_handler = None
self.debug_formatter = logging.Formatter(
"%(asctime)s - [%(levelname)s] - [%(module)s:%(lineno)d] - "
"%(message)s", "%m-%d %H:%M:%S")
self.standard_formatter = logging.Formatter(
"%(asctime)s - [%(levelname)s] - %(message)s", "%m-%d %H:%M:%S")
logging.getLogger()
logging.Logger.__init__(self, name, logging.DEBUG)
logging.setLoggerClass(AlertLogging)
def start(self):
"""
Returns:
"""
info_handler = logging.StreamHandler()
info_handler.setLevel(logging.INFO)
info_handler.setFormatter(self.standard_formatter)
self.addHandler(info_handler)
return self
def start_log_file(self, file_path, mode='a'):
"""
Creates a separate log file handler
Args:
file_path: path to the log file
mode: the type of mode to open the file handler with
Returns:
"""
self.log_path = file_path
work_folder = os.path.dirname(file_path)
if work_folder and not os.path.exists(work_folder):
os.makedirs(work_folder)
self.log_handler = logging.FileHandler(file_path, mode)
self.log_handler.setLevel(logging.WARNING)
self.log_handler.setFormatter(self.debug_formatter)
self.addHandler(self.log_handler)
def stop_log_file(self):
"""
Closes Log file and sets the handler to None
Returns:
"""
self.log_handler.close()
self.removeHandler(self.log_handler)
self.log_handler = None
def start_debug(self):
"""
Returns:
"""
self.debug_handler = logging.StreamHandler()
self.debug_handler.setLevel(logging.DEBUG)
self.debug_handler.addFilter(SingleLevelFilter(logging.DEBUG, False))
self.debug_handler.setFormatter(self.debug_formatter)
self.addHandler(self.debug_handler)
def stop_debug(self):
"""
stop the debugger
Returns:
"""
self.removeHandler(self.debug_handler)
self.debug_handler = None
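# Typical usage, as in aom_service.py: attach the stdout handler first, then a
# file handler that captures warnings and above.
#
#     log = AlertLogging('aom')
#     log.start()
#     log.start_log_file("logs/aom_service.log")
#     log.info("Service starting")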

14
AoM_Service/library/process.py Executable file
View File

@@ -0,0 +1,14 @@
import multiprocessing
class Process(multiprocessing.Process) :
def __init__(self, alert_config, config, logger, production_mode) :
multiprocessing.Process.__init__(
self,
target=self.get_target(),
args=(alert_config, config, logger, production_mode),
name=alert_config.id,
daemon=True,
)
def get_target(self) :
raise Exception("abstract method not implemented")

View File

@@ -0,0 +1,14 @@
import process_prometheus
import process_kairos
class Process_Factory() :
def __init__(self, config, logger, production) :
self.config = config
self.logger = logger
self.production = production
def build(self, alert_config) :
if alert_config.type() == "prometheus" :
return process_prometheus.Process_Prometheus(alert_config, self.config, self.logger, self.production)
else:
return process_kairos.Process_Kairos(alert_config, self.config, self.logger, self.production)

View File

@@ -0,0 +1,6 @@
import process
from serviceapp import service
class Process_Kairos(process.Process) :
def get_target(self) :
return service.check_kairosdb_alert

View File

@@ -0,0 +1,6 @@
import process
from serviceapp import service
class Process_Prometheus(process.Process) :
def get_target(self) :
return service.check_prometheus_alert

80
AoM_Service/library/service.py Executable file
View File

@@ -0,0 +1,80 @@
import os
from alert_config_list import Alert_Config_List
from alert_config import Alert_Config
from job_list import Job_List
from job import Job
from process_factory import Process_Factory
from time import sleep
from config import glob_the_configs
from serviceapp import service
class Service() :
def __init__(self, logger, reload_interval, hostname, config):
self.alert_config_list = Alert_Config_List()
self.job_list = Job_List()
self.logger = logger
self.info = self.logger.info
self.error = self.logger.error
self.reload_interval = reload_interval
self.box_hostname = os.environ['HOSTNAME'] if hostname is None else hostname
        self.production = "TEST" not in os.environ
self.config = config
def start(self) :
self.info("Waiting 15s for Consul service to pass")
sleep(15)
while self.is_running() :
new_alert_config_list = self.get_new_alert_config_list()
self.purge_stale(new_alert_config_list)
self.create_upserted(new_alert_config_list)
self.alert_config_list = new_alert_config_list
total_jobs = len(self.job_list)
self.info("Total running jobs: {}".format(total_jobs))
service.send_stat('total_jobs', total_jobs, dict(), statprefix='aom')
sleep(self.reload_interval)
self.info("Exiting alerts")
self.purge_stale(Alert_Config_List())
def is_running(self) :
return True
def get_new_alert_config_list(self) :
try :
yaml_configs = self.parse_alert_config_files()
alert_configs = [Alert_Config(i) for i in yaml_configs]
return Alert_Config_List(alert_configs)
        except Exception as e :
            self.error("Failed to load config files: {}".format(e))
            return Alert_Config_List()
def parse_alert_config_files(self) :
path = self.config['alert_folder']
routing = self.config['alert_routing_config']
consul = 'http://consul.service.consul:8500'
return glob_the_configs(path, routing, consul, self.box_hostname, self.logger)
def purge_stale(self, new_alert_config_list) :
_, removed_ids, modified_ids = self.alert_config_list.compare(new_alert_config_list)
stale_ids = removed_ids.union(modified_ids)
for stale_id in stale_ids :
self.job_list.kill(stale_id)
service.send_stat('removed_jobs', len(removed_ids), dict(), statprefix='aom')
self.info("Removed alert_configs: {}".format(removed_ids))
def create_upserted(self, new_alert_config_list) :
added_ids, _, modified_ids = self.alert_config_list.compare(new_alert_config_list)
upserted_ids = added_ids.union(modified_ids)
for id in upserted_ids :
p = self.spawn_process(new_alert_config_list[id])
j = Job(id, p)
self.job_list.add(j)
service.send_stat('new_jobs', len(added_ids), dict(), statprefix='aom')
service.send_stat('modified_jobs', len(modified_ids), dict(), statprefix='aom')
self.info("Added alert_configs: {}".format(added_ids))
self.info("Modified alert_configs: {}".format(added_ids))
def spawn_process(self, alert_config) :
process_factory = Process_Factory(self.config, self.logger, self.production)
process = process_factory.build(alert_config)
process.start()
return process
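# Reconciliation sketch (not part of the original file, runnable on its own):
# the start() loop above is a three-way diff between the previous and the
# freshly parsed config set - removed or modified alerts get their jobs killed,
# added or modified alerts get new processes spawned. The plain-set version
# below mirrors that compare()/purge_stale()/create_upserted() cycle without
# Consul, the config globber, or real processes.
if __name__ == "__main__":
    previous = {"cpu_alert", "disk_alert", "latency_alert"}
    current = {"cpu_alert", "latency_alert", "error_rate_alert"}
    added = current - previous      # spawn workers for these
    removed = previous - current    # kill workers for these
    unchanged = previous & current  # left running (respawned only if modified)
    print("kill:", sorted(removed))
    print("spawn:", sorted(added))
    print("keep:", sorted(unchanged))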

View File

View File

@@ -0,0 +1,189 @@
from hashlib import md5
from thresholds import Thresholds
class Alert() :
    def __init__(self, alert_config, logger, tags, result, min_value, max_value, availability=False) :
self.occurrences_breached = False
self.new_level_breached = False
self.info = logger.info
self.debug = logger.debug
self.warning = logger.warning
self.error = logger.error
self.alert_config = alert_config
self.thresholds = Thresholds(alert_config)
self.tags = ""
self.result = result
self.set_tags(tags)
self.alert_config.init_for_tags(alert_config.get_tags())
self.set_firing(min_value, max_value)
        if availability :
            self.info("Sending availability stat")
            self.send_metrics(self.name(), 0 if self.level() == "CRITICAL" else 1, self.result, 'service_level')
def name(self) :
return "Metric: {} for {}".format(self.alert_config.id, self.get_tags())
def body(self) :
body = ""
if not self.get_firing() :
body = self.get_not_firing_body()
else :
body = self.get_is_firing_body()
self.debug("Alert {}->[{}]->{}, Occurrences={} of {}".format(
self.name(),
self.get_tags(),
self.level(),
self.get_occurrences(),
self.alert_config.occurrences(),
))
self.send_metrics(self.name(), self.level_code(), self.level())
# TODO
        return body, md5(self.get_tags().encode('utf-8')).hexdigest()[:10]
def level(self) :
if not self.get_firing() :
return "RECOVERY"
if [t for t in self.thresholds.get_thresholds_matching(level=Thresholds.CRITICAL)] :
return "CRITICAL"
if [t for t in self.thresholds.get_thresholds_matching(level=Thresholds.WARNING)] :
return "WARNING"
    def level_code(self) :
        # 0 = RECOVERY/OK, 1 = WARNING, 2 = CRITICAL
        level = self.level()
        if level == "RECOVERY" :
            return 0
        elif level == "WARNING" :
            return 1
        elif level == "CRITICAL" :
            return 2
    def get_not_firing_body(self) :
        body = ""
        body += self.get_not_firing_body_threshold()
        body += self.get_not_firing_body_occurrences()
        if not body :
            self.alert_config.set_for_tags(self.get_tags()+"_count", 0)
            return ""
        return "GOOD: " + body
def get_not_firing_body_threshold(self) :
if self.result is None :
return ""
body = ""
v, ok = self.alert_config.get_threshold(isUpper=True, isWarning=True)
if not ok :
v, ok = self.alert_config.get_threshold(isUpper=True, isWarning=False)
if ok :
body += self.form("<", v)
v, ok = self.alert_config.get_threshold(isUpper=False, isWarning=True)
if not ok :
v, ok = self.alert_config.get_threshold(isUpper=False, isWarning=False)
if ok :
body += self.form(">", v)
return body
def get_not_firing_body_occurrences(self) :
if not self.get_occurrences() :
return ""
body = ""
        if self.result is not None :
self.send_metrics(self.name(), 1, self.level())
else :
body += "{} RECOVERY due to no results found from query. Recommend you manually validate recovery\n{}".format(self.name(), self.alert_config.url())
self.set_occurrences(force=0)
return body
    def get_is_firing_body(self) :
        body = ""
        # Report every breached threshold together with the value that tripped it
        for threshold in self.thresholds.get_thresholds_matching(end=Thresholds.UPPER) :
            if threshold.get_breached() :
                body += self.form(">", threshold.get_threshold())
        for threshold in self.thresholds.get_thresholds_matching(end=Thresholds.LOWER) :
            if threshold.get_breached() :
                body += self.form("<", threshold.get_threshold())
        if not self.occurrences_breached :
            self.debug("Value {} of {} for tag {} has occurred {} time(s) < threshold of {}".format(
                self.value,
                self.name(),
                self.get_tags(),
                self.get_occurrences(),
                self.alert_config.occurrences(),
            ))
            return ""
        return body
def form(self, operator, static) :
return "{}\n{:.2f} {}= {}\n{}".format(
self.name(),
self.value,
operator,
static,
self.alert_config.url(),
)
def set_tags(self, tags) :
if tags :
self.tags = tags
        elif self.result :
            import itertools
            result_tags = itertools.chain(*[ self.result['tags'][x] for x in self.alert_config.get_tags() ])
            self.tags = ", ".join(sorted(result_tags))
if not self.tags :
self.tags = "instance"
def get_tags(self) :
return self.tags
    def set_firing(self, min_value, max_value) :
        self.thresholds = Thresholds(self.alert_config)
        self.thresholds.set_breached(min_value, max_value)
        # Mirror the legacy behaviour: alert on the max value unless a lower
        # threshold was the one breached
        self.value = min_value if self.thresholds.get_breached(end=Thresholds.LOWER) else max_value
        self.set_occurrences()
        self.set_new_level_breached()
        self.send_metrics()
        self.send_threshold_metrics()
def get_firing(self) :
return self.thresholds.get_breached() and self.occurrences_breached
def get_occurrences(self) :
tags = self.get_tags()
return self.alert_config.get_for_tags(tags)
def set_occurrences(self, force=None) :
previous_occurrences = self.get_occurrences()
if self.thresholds.get_breached() :
new_occurrences = previous_occurrences+1
self.alert_config.set_for_tags(self.get_tags(), new_occurrences)
self.occurrences_breached = self.alert_config.occurrences() <= new_occurrences
        if force is not None :
            self.alert_config.set_for_tags(self.get_tags(), force)
            self.alert_config.set_for_tags(self.get_tags()+"_count", force)
def send_metrics(self, *args, **kwargs) :
print("send_metrics not impl")
def set_new_level_breached(self) :
key = self.get_tags()
level = self.level()
previous_level = self.alert_config.get_level(key)
self.new_level_breached = level != previous_level
self.alert_config.set_level(key, level)
self.info("testInfo: {} {}".format(
"NEW" if self.new_level_breached else "EXISTING",
self.level(),
))
def get_new_level_breached(self) :
return self.new_level_breached
    def send_threshold_metrics(self) :
        # TODO
        self.send_metrics(self.alert_config.id, self.value)
        for level in [Thresholds.WARNING, Thresholds.CRITICAL] :
            for end in [Thresholds.UPPER, Thresholds.LOWER] :
                v, ok = self.alert_config.get_threshold(isUpper=end == Thresholds.UPPER, isWarning=level == Thresholds.WARNING)
                if ok :
                    key = "{}_{}_threshold".format(
                        "upper" if end == Thresholds.UPPER else "lower",
                        "warning" if level == Thresholds.WARNING else "critical",
                    )
                    self.send_metrics(key, v, {'id':self.name()})

View File

@@ -0,0 +1,13 @@
from alert import Alert
class Alert_Factory() :
def __init__(self, alert_config, logger) :
self.alert_config = alert_config
self.logger = logger
self.info = logger.info
self.warning = logger.warning
self.debug = logger.debug
self.error = logger.error
def build(self, minvalue, maxvalue, result, tags, availability, alert_tags) :
        return Alert(self.alert_config, self.logger, tags, result, minvalue, maxvalue, availability)

View File

@@ -0,0 +1,83 @@
from datetime import datetime, timedelta
from urllib.parse import urljoin
import requests
class PromAPI:
def __init__(self, endpoint='http://127.0.0.1:9090/'):
"""
:param endpoint: address of
"""
self.endpoint = endpoint
@staticmethod
def _to_timestamp(input_):
"""
Convert string input to UNIX timestamp for Prometheus
:param input_:
:return:
"""
if type(input_) == datetime:
return input_.timestamp()
if input_ == 'now':
return datetime.utcnow().isoformat('T')
if type(input_) is str:
input_ = float(input_)
if type(input_) in [int, float]:
if input_ > 0:
return input_
if input_ == 0: # return now
return datetime.utcnow().isoformat('T')
if input_ < 0:
return (datetime.utcnow() + timedelta(seconds=input_)).isoformat('T')
#assert type(input_) == float
def query(self, query='prometheus_build_info'):
return self._get(
uri='/api/v1/query',
params=dict(
query=query
)
)
def query_range(self, query='prometheus_build_info', start=-60, end='now', duration=60):
"""Get ser"""
params = {
'query': query
}
if end is not None:
params['end'] = self._to_timestamp(end) + 'Z'
if start:
params['start'] = self._to_timestamp(start) + 'Z'
if duration:
params['step'] = duration
print(params)
return self._get(
uri='/api/v1/query_range',
params=params
)
def series(self, match='prometheus_build_info', start=-86400, end='now'):
"""Get ser"""
params = {
'match[]': match
}
if end is not None:
params['end'] = self._to_timestamp(end) + 'Z'
if start:
params['start'] = self._to_timestamp(start) + 'Z'
print(params)
return self._get(
uri='/api/v1/series',
params=params
)
def _get(self, uri, params, method='GET'):
url = urljoin(self.endpoint, uri)
assert method == 'GET'
result = requests.get(
url=url,
params=params
)
return result.json()
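# Usage sketch (not part of the original module): assumes a Prometheus server
# is reachable at the example endpoint below. It asks for the last hour of
# prometheus_build_info at a 60 second step and prints how many series came
# back.
if __name__ == "__main__":
    api = PromAPI(endpoint='http://127.0.0.1:9090/')
    response = api.query_range(
        query='prometheus_build_info',
        start=-3600,    # one hour ago, relative to now
        end='now',
        duration=60)    # step, in seconds
    if response.get('status') == 'success':
        print(len(response['data']['result']), "series returned")
    else:
        print("query failed:", response.get('error'))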

View File

@@ -0,0 +1,949 @@
""" Alert On Metrics functions"""
import copy
import itertools
import json
import os
import random
import smtplib
from email.mime.text import MIMEText
from socket import gaierror
from time import sleep
from hashlib import md5
import requests
from statsd import StatsClient
from serviceapp.prom_api import PromAPI
alert_status = [
'RECOVERY',
'WARNING',
'WARNING',
'CRITICAL',
'CRITICAL',
'CRITICAL']
def build_alert_message(alert, minvalue, maxvalue, result, logger,
availability, tag=None, alert_tags=None):
"""
Build the alert message
Args:
alert: the alert object that includes a tag definition
minvalue: the min value to test against the threshold
maxvalue: the max value to test against the threshold
        result: the result object returned by the KairosDB or Prometheus query
        logger (log object): does the logging
        availability: whether to emit the service_level availability stat
        tag: If passed in will use this value for the tag instead of
            getting it from the result object
        alert_tags: the tags corresponding to the result, used if an
            alert has to be triggered and a custom routing per tag is configured
    Returns:
        An alert tuple (entity, body, status code, alert_tags, md5 suffix),
        or None if no alert needs to fire
    """
# DEFAULT TO MAX VALUE AS THE VALUE WE WILL ALERT ON. LOGIC BELOW
# MAY CHANGE THIS.
# value = maxvalue
# # HANDLE THE CASE WHERE SOMEONE HAS NOT SPECIFIED ANY TAGS IN THEIR QUERY
# # (USUALLY A GLOBAL ALL-DC QUERY)
# if tag is None and result is not None:
# tag = ', '.join(sorted(list(itertools.chain(
# *[result['tags'][x] for x in alert['tags']]))))
# tag_count = tag + "_count"
# WE WILL USE THIS ONE LATER FOR TRACKING OCCURRENCES OF KAIROSDB NOT
# RETURNING RESULTS
# tag_noresult = tag + "_noresult"
# if not tag:
# tag = 'instance'
# logger.debug("No tag specified for alert {}".format(alert['id']))
# INSTEAD OF TRYING TO HANDLE LOGIC WHERE THESE ARE NOT IN THE OBJECT, PUT
# THEM IN AS SOON AS THEY ARE CREATED SO THAT ON FIRST RUN AN ALERT HAS ALL
# THE ALERT['alert_tags'][TAG] AND ALERT['alert_tags'][TAG_COUNT] NEEDED
# if 'alert_tags' not in alert:
# alert['alert_tags'] = {}
# if tag not in alert['alert_tags']:
# alert['alert_tags'][tag] = 0
# if tag_count not in alert['alert_tags']:
# alert['alert_tags'][tag_count] = 0
# IF WE HIT THIS FUNCTION THEN WE ALWAYS SET (OR RESET) THIS NORESULT
# COUNTER TO 0 IE. IF WE ARE HERE IT IMPLIES WE HAVE A RESULT FROM
# KAIROSDB OR WE ARE AT THE END OF A LONG PERIOD OF NORESULTS WHERE WE ARE
# CLEARING EVERYTHING OUT ANYWAY
# alert['alert_tags'][tag_noresult] = 0
# # FIRST FIND OUT WHAT THRESHOLDS ARE SET AND HAVE BEEN BREACHED
# upper_critical_threshold = None
# upper_warning_threshold = None
# lower_warning_threshold = None
# lower_critical_threshold = None
# upper_threshold = None
# lower_threshold = None
# is_warning_alarm = False
# is_critical_alarm = False
# # UPPER
# upper_threshold_exists = False
# upper_warning_threshold_breached = False
# upper_critical_threshold_breached = False
# if 'warning_upper_threshold' in alert:
# upper_threshold_exists = True
# upper_warning_threshold = alert['warning_upper_threshold']
# upper_threshold = upper_warning_threshold
# if maxvalue >= upper_warning_threshold:
# upper_warning_threshold_breached = True
# is_warning_alarm = True
# if 'critical_upper_threshold' in alert:
# upper_critical_threshold = alert['critical_upper_threshold']
# if not upper_threshold_exists:
# upper_threshold = upper_critical_threshold
# upper_threshold_exists = True
# # IF CONFIG HAS A CRITICAL THRESHOLD SET AND WE PASS THAT THEN THAT IS
# # OUR THRESHOLD FOR ALERTING
# if maxvalue >= alert['critical_upper_threshold']:
# upper_threshold = upper_critical_threshold
# upper_critical_threshold_breached = True
# is_critical_alarm = True
# upper_threshold_breached = (upper_warning_threshold_breached
# or upper_critical_threshold_breached)
# # LOWER
# lower_threshold_exists = False
# lower_warning_threshold_breached = False
# lower_critical_threshold_breached = False
# if 'warning_lower_threshold' in alert:
# lower_threshold_exists = True
# lower_warning_threshold = alert['warning_lower_threshold']
# lower_threshold = lower_warning_threshold
# if minvalue <= lower_warning_threshold:
# lower_warning_threshold_breached = True
# is_warning_alarm = True
# if 'critical_lower_threshold' in alert:
# lower_critical_threshold = alert['critical_lower_threshold']
# if not lower_threshold_exists:
# lower_threshold = lower_critical_threshold
# lower_threshold_exists = True
# # IF CONFIG HAS A CRITICAL THRESHOLD SET AND WE PASS THAT THEN THAT AS
# # OUR THRESHOLD FOR ALERTING
# if minvalue <= lower_critical_threshold:
# lower_threshold = lower_critical_threshold
# lower_critical_threshold_breached = True
# is_critical_alarm = True
# lower_threshold_breached = (lower_warning_threshold_breached or
# lower_critical_threshold_breached)
# # THIS HAS TO MEAN THERE IS A PROBLEM WITH THE ALERT CONFIG
# if lower_threshold is None and upper_threshold is None:
# logger.debug(
# "ERROR: alert {} does not have any thresholds set on {}".format(
# alert['id'], tag))
# # ON TO OCCURRENCES
# if 'occurrences_threshold' in alert:
# occurrences_threshold = alert['occurrences_threshold']
# else:
# occurrences_threshold = 1
# alert_entity = "Metric: {} for {}".format(alert['id'], tag)
# if 'url' not in alert:
# alert['url'] = os.environ['AOM_GRAFANA_URL'] + str(alert['id'])
# ====================
# PREPARE ALERT BODY STRING AND SET THE VALUE WE WILL USE TO ALERT WITH
# ====================
# alert_body = ''
# if upper_threshold_breached:
# alert_body = "{}\n{:.2f} >= {}\n{}".format(
# alert_entity, value, upper_threshold, alert['url'])
# if lower_threshold_breached:
# value = minvalue
# alert_body = "{}\n{:.2f} <= {}\n{}".format(
# alert_entity, value, lower_threshold, alert['url'])
# SEND SOME STATS OUT AT THIS POINT AS WE KNOW WHERE WE ARE NOW. SEND THE
# THRESHOLDS TOO SO THEY CAN BE GRAPHED
### BREEL TODO ###
# if result is not None:
# send_metrics(alert, value, result)
# if 'critical_upper_threshold' in alert:
# send_stat('upper_critical_threshold', upper_critical_threshold,
# {'id': alert['id']})
# if 'warning_upper_threshold' in alert:
# send_stat('upper_warning_threshold', upper_warning_threshold,
# {'id': alert['id']})
# if 'critical_lower_threshold' in alert:
# send_stat('lower_critical_threshold', lower_critical_threshold,
# {'id': alert['id']})
# if 'warning_lower_threshold' in alert:
# send_stat('lower_warning_threshold', lower_warning_threshold,
# {'id': alert['id']})
# ====================
# APPLY OUR LOGIC TO MAKE SOME DECISIONS
# ====================
#current_alert_status = alert_status[0]
#if not lower_threshold_breached and not upper_threshold_breached:
# # if result is not None:
# # if lower_threshold_exists and not upper_threshold_exists:
# # alert_body = "{}\n{:.2f} > {}\n{}".format(
# # alert_entity, value, lower_threshold, alert['url'])
# # logger.debug("GOOD: alert {} is higher than lower threshold {}"
# # "for value {} on tag {}".format(
# # alert['id'], lower_threshold, value, tag))
# # if upper_threshold_exists and not lower_threshold_exists:
# # alert_body = "{}\n{:.2f} < {}\n{}".format(
# # alert_entity, value, upper_threshold, alert['url'])
# # logger.debug("GOOD: alert {} is below the upper threshold {} "
# # "for value {} on tag {}".format(
# # alert['id'], upper_threshold, value, tag))
# # if upper_threshold_exists and lower_threshold_exists:
# # alert_body = "{}\n{} < {:.2f} < {}\n{}".format(
# # alert_entity, lower_threshold, value, upper_threshold,
# # alert['url'])
# # logger.debug("GOOD: alert {} is between thresholds {} and {} "
# # "for value {} on tag {}".format(
# # alert['id'], upper_threshold, lower_threshold,
# # value, tag))
# # CHECK AND SEE IF TAG LOGIC IS SET, IE. WE WERE PREVIOUSLY IN ALARM
# # STATE
# #if alert['alert_tags'][tag] > 0:
# # if result is not None:
# # send_metrics(alert, 1, result, current_alert_status)
# # logger.info(
# # "TestInfo: RECOVERY: Clearing values for [{}] - {}".format(
# # alert['id'], tag))
# # if result is None:
# # alert_body = ("{} RECOVERY due to no results found from "
# # "KairosDB query. Recommend you manually validate"
# # "recovery.\n{}").format(
# # alert_entity, alert['url'])
# # alert['alert_tags'][tag] = 0
# # alert['alert_tags'][tag_count] = 0
# # if availability:
# # logger.info("Sending availability stat 1")
# # send_metrics(alert, 1, result, 'service_level')
# #else:
# # # WE RETURN NONE IF NO ALERT (EITHER RECOVERY OR WARNING OR
# # # CRITICAL) NEEDS TO BE FIRED
# # alert['alert_tags'][tag_count] = 0
# # if availability:
# # logger.info("Sending availability stat 1")
# # send_metrics(alert, 1, result, 'service_level')
# # return None
#else:
### BREEL WORKING HERE ###
# ====================
# SET KEY / VALUE FOR TAG ON ALERT
# 0 == No Alert
# 1 == Warning
# 2 == Existing Warning Alert
# 3 == New Critical
# 4+ == Existing Critical Alert
# ====================
# CHECK IF TAG_COUNT HAS BEEN SET, IF NOT SET IT, IF SO INCREMENT IT
# alert['alert_tags'][tag_count] += 1
# ALERT WONT FIRE UNLESS THE TAG_COUNT IS MORE THAN THE OCCURRENCES,
# THAT BEING EITHER 1 OR WHATEVER WAS SET ALERT HAS EXCEEDED
# OCCURRENCES SO RETURN IT
# TODO this doesnt belog in Alert.py
#if alert['alert_tags'][tag_count] >= occurrences_threshold:
# # >= 4 MEANS THIS IS A KNOWN CRITICAL, SO NO-OP
# if alert['alert_tags'][tag] < 4:
# if is_warning_alarm and not is_critical_alarm:
# # THIS HANDLES GOING STRAIGHT FROM NORMAL TO WARNING LEVEL
# if alert['alert_tags'][tag] == 0:
# # NEW WARNING
# alert['alert_tags'][tag] = 1
# logger.info("TestInfo: WARNING (NEW): {} - {}".format(
# alert['id'], tag))
# else:
# # EXISTING WARNING
# alert['alert_tags'][tag] = 2
# logger.info("TestInfo: WARNING (EXISTING): {} - {}".format(
# alert['id'], tag))
# if is_critical_alarm:
# # THIS HANDLES GOING FROM WARNING LEVEL TO CRITICAL LEVEL
# if (alert['alert_tags'][tag] == 1 or
# alert['alert_tags'][tag] == 2):
# alert['alert_tags'][tag] = 3
# logger.info("TestInfo: CRITICAL (WAS WARNING): {} - {}".format(
# alert['id'], tag))
# else:
# # THIS HANDLES GOING STRAIGHT FROM NORMAL TO CRITICAL
# # LEVEL
# if alert['alert_tags'][tag] < 3:
# # NEW CRITICAL
# alert['alert_tags'][tag] = 3
# logger.info("TestInfo: CRITICAL (NEW): {} - {}".format(
# alert['id'], tag))
# else:
# # EXISTING CRITICAL
# alert['alert_tags'][tag] = 4
# logger.info("TestInfo: CRITICAL (EXISTING): {} - {}".format(
# alert['id'], tag))
# RECORD THE FACT THAT SOMETHING IS STILL IN ALARM STATE IN METRICS
# EVEN IF NOT ACTIVELY ALERTING ON IT
# #if is_critical_alarm:
# #current_alert_status = alert_status[3]
# #send_metrics(alert, 2, result, current_alert_status)
# #if availability:
# # logger.info("Sending availability stat 0")
# # send_metrics(alert, 0, result, 'service_level')
# #if is_warning_alarm and not is_critical_alarm:
# #current_alert_status = alert_status[1]
# #send_metrics(alert, 1, result, current_alert_status)
# #if availability:
# # logger.info("Sending availability stat 1")
# # send_metrics(alert, 1, result, 'service_level')
# logger.debug("{} alert for value {} of {} for tag {} has occurred "
# "{} times. Threshold is >= {} times.".format(
# current_alert_status,
# value,
# alert['id'],
# tag,
# alert['alert_tags'][tag_count],
# occurrences_threshold))
# else:
# # WE RETURN NONE IF NO ALERT (EITHER RECOVERY OR WARNING OR
# # CRITICAL) NEEDS TO BE FIRED
# logger.debug("Value {} of {} for tag {} has occurred {} time(s) < "
# "threshold of {}".format(
# value,
# alert['id'],
# tag,
# alert['alert_tags'][tag_count],
# occurrences_threshold))
# if availability:
# logger.info("Sending availability stat")
# send_metrics(alert, 1, result, 'service_level')
# return None
#logger.debug(
# "Alert {}->[{}]->{}, Occurrences={}".format(
# alert['id'], tag, current_alert_status,
# alert['alert_tags'][tag_count]))
#return alert_entity, alert_body, alert['alert_tags'][tag], alert_tags, md5(tag.encode('utf-8')).hexdigest()[:10]
def check_kairosdb_alert(
alert_config,
service_config,
logger,
production_mode=True):
"""
    Args:
        alert_config (dict): Config of the alert to run
        service_config (dict): Holds things like urls, tokens and other things
        logger (log object): does the logging
        production_mode (bool): when False, alerts are only logged, not sent
    Returns:
        None
    """
availability = False
# SLEEP A RANDOM TIME BETWEEN 0 AND INTERVAL SO THAT ALL ALERTS DON'T
# START AT THE SAME TIME
wait_time = random.randint(0, alert_config['interval'])
logger.info(
"ALERT_CONFIG: {}\tsleep: {}".format(
alert_config['id'],
wait_time))
sleep(wait_time)
    # For metrics with availability set to true, we default the interval to 5
    # mins due to Grafana limitations
if 'availability' in alert_config and alert_config['availability']:
availability = True
# ====================
# EACH CHECK JUST LOOPS
# ====================
ret = None
while True:
try:
send_stat("check_run", 1, {'id': alert_config['id']})
# BUILD URL FOR KAIROSDB METRICS AND QUERY FOR RESULTS
            query_url = (service_config['kairosdb_url'] +
                         "api/v1/datapoints/query")
ret = requests.post(
query_url,
data=json.dumps(
alert_config['query']),
timeout=service_config['timeout'])
assert ret.status_code == 200
# GOT DATA BACK, NOW TO COMPARE IT TO THE THRESHOLD
results = ret.json()['queries'][0]['results']
logger.debug(
"Got back {} results for alert {}".format(
len(results), alert_config['id']))
log_alert_results(results, alert_config, logger)
alert_list = []
# LOOP THROUGH ALL THE RESULTS
for r in results:
alert_tags = (get_alert_tags(alert_config, r)
if has_custom_alert_routing(alert_config) else None)
# OUR QUERY RETURNED SOME VALUES - FIND MIN AND MAX VALUES
# THEREIN AND EXAMINE FOR FAILURE
if r['values']:
minvalue = min([x[1] for x in r['values']])
maxvalue = max([x[1] for x in r['values']])
# SEND VALUES TO BUILD_ALERT_MESSAGE, WHICH RETURNS NONE OR
# AN OBJECT
alert_list.append(
build_alert_message(
alert_config,
minvalue,
maxvalue,
r,
logger,
availability,
alert_tags=alert_tags))
# THIS MEANS OUR KAIROS QUERY RETURNED NOTHING. COULD BE NETWORK
# ISSUES. WE WILL TOLERATE THIS FOR X OCCURRENCES. (X=10)
# AFTER X OCCURRENCES OF KAIROS NOT RETURNING DATA WE WILL CLEAR
# AOM'S BRAIN FOR THIS ALERT ID AND TAG COMBINATION TO AVOID A
# LATER OCCURRENCE CAUSING A PREMATURE ALERT.
# A NO-OP IF NO HISTORY.
elif 'alert_tags' in alert_config:
for key in alert_config['alert_tags']:
if ('count' not in key and 'noresult' not in key and
alert_config['alert_tags'][key] > 0):
key_noresult = key + "_noresult"
key_count = key + "_count"
if alert_config['alert_tags'][key_noresult] > 10:
logger.info("{} occurrences of no results back "
"for {}, clear out counts for tag '{}'".format(
alert_config['alert_tags'][key_noresult],
alert_config['id'], key))
alert_list.append(
build_alert_message(
alert_config,
0,
0,
None,
logger,
availability,
key,
alert_tags=alert_tags))
alert_config['alert_tags'][key] = 0
alert_config['alert_tags'][key_count] = 0
alert_config['alert_tags'][key_noresult] = 0
else:
alert_config['alert_tags'][key_noresult] += 1
logger.info("{} occurrences of no results back "
"for {}, tag '{}'".format(
alert_config['alert_tags'][key_noresult],
alert_config['id'], key))
# SEND ALL ALERTS FOUND TO THE ALERT HANDLERS THAT ARE NOT NONE
for alert in [x for x in alert_list if x is not None]:
if production_mode:
send_alerts(
alert,
copy.deepcopy(alert_config),
service_config['victorops_url'],
service_config['slack_url'],
service_config['slack_token'],
service_config['smtp_server'],
service_config['sensu_endpoint'],
service_config['uchiwa_url'],
logger)
else:
logger.info(
"Sending alert for: {}".format(
alert_config.get('id')))
# HANDLE THE UNEXPECTED
except TimeoutError:
logger.error("Query [{}] took to long to run".format(
alert_config['id']))
except AssertionError:
logger.error(
"KairsoDB query failed: {}\n"
"HTTP status code:\t{}\n"
"Error Message:\t{}\nQuery:\n"
"{}".format(
ret.url,
ret.status_code,
ret.text,
alert_config['query']))
except gaierror:
logger.error(
"Unable to connect to smtp server: {}".format(
service_config['smtp_server']))
except Exception as e:
logger.error(
"Unhandled exception {} on alert: {}".format(
str(e), alert_config['id']))
finally:
sleep(alert_config['interval'])
def check_prometheus_alert(
alert_config,
service_config,
logger,
production_mode=True):
"""
    Args:
        alert_config (dict): Config of the alert to run
        service_config (dict): Holds things like urls, tokens and other things
        logger (log object): does the logging
        production_mode (bool): when False, alerts are only logged, not sent
    Returns:
        None
    """
# SLEEP A RANDOM TIME BETWEEN 0 AND INTERVAL SO THAT ALL ALERTS DON'T
# START AT THE SAME TIME
wait_time = random.randint(0, alert_config['interval'])
logger.info(
"ALERT_CONFIG: {}\tsleep: {}".format(
alert_config['id'],
wait_time))
sleep(wait_time)
# For metrics with availability set to true, we default the interval to 5
# mins due to Grafana limitations
availability = bool(alert_config.get('availability'))
# ====================
# EACH CHECK JUST LOOPS
# ====================
ret = None
while True:
try:
send_stat("check_run", 1, {'id': alert_config['id']})
prom_api = PromAPI(endpoint=alert_config['prometheus_url'])
ret = prom_api.query_range(
query=alert_config['query'],
start=alert_config['start_time'],
end=alert_config['end_time'],
duration=alert_config['interval'])
assert ret['status'] == 'success'
# GOT DATA BACK, NOW TO COMPARE IT TO THE THRESHOLD
results = ret['data']['result']
logger.debug(
"Got back {} results for alert {}".format(
len(results), alert_config['id']))
log_alert_results(results, alert_config, logger)
alert_list = []
# LOOP THROUGH ALL THE RESULTS
for r in results:
alert_tags = (get_alert_tags(alert_config, r) if
has_custom_alert_routing(alert_config) else None)
# REARRANGE RESULT TO MORE CLOSELY MATCH KAIROSDB RESULT
r['tags'] = {key: [value]
for (key, value) in r['metric'].items()}
# OUR QUERY RETURNED SOME VALUES - FIND MIN AND MAX VALUES
# THEREIN AND EXAMINE FOR FAILURE
if r['values']:
raw_values = [value for _, value in r['values']]
min_value = float(min(raw_values))
max_value = float(max(raw_values))
# SEND VALUES TO BUILD_ALERT_MESSAGE, WHICH RETURNS NONE OR
# AN OBJECT
alert_list.append(
build_alert_message(
alert_config,
min_value,
max_value,
r,
logger,
availability,
alert_tags=alert_tags))
# THIS MEANS OUR QUERY RETURNED NOTHING. COULD BE NETWORK ISSUES
# WE WILL TOLERATE THIS FOR X OCCURRENCES. (X=10)
# AFTER X OCCURRENCES OF NOT RETURNING DATA WE WILL CLEAR AOM'S
# BRAIN FOR THIS ALERT ID AND TAG COMBINATION TO AVOID A LATER
# OCCURRENCE CAUSING A PREMATURE ALERT. A NO-OP IF NO HISTORY.
elif 'alert_tags' in alert_config:
for key in alert_config['alert_tags']:
if ('count' not in key and 'noresult' not in key and
alert_config['alert_tags'][key] > 0):
key_noresult = key + "_noresult"
key_count = key + "_count"
if alert_config['alert_tags'][key_noresult] > 10:
logger.info("{} occurrences of no results back "
"for {}, clear out counts for tag '{}'".format(
alert_config['alert_tags'][key_noresult],
alert_config['id'], key))
alert_list.append(
build_alert_message(
alert_config,
0,
0,
None,
logger,
availability,
key,
alert_tags=alert_tags))
alert_config['alert_tags'][key] = 0
alert_config['alert_tags'][key_count] = 0
alert_config['alert_tags'][key_noresult] = 0
else:
alert_config['alert_tags'][key_noresult] += 1
logger.info("{} occurrences of no results back "
"for {}, tag '{}'".format(
alert_config['alert_tags'][key_noresult],
alert_config['id'], key))
# SEND ALL ALERTS FOUND TO THE ALERT HANDLERS THAT ARE NOT NONE
for alert in [x for x in alert_list if x is not None]:
if production_mode:
send_alerts(
alert,
copy.deepcopy(alert_config),
service_config['victorops_url'],
service_config['slack_url'],
service_config['slack_token'],
service_config['smtp_server'],
service_config['sensu_endpoint'],
service_config['uchiwa_url'],
logger)
else:
logger.info(
"Sending alert {}".format(
alert_config.get('id')))
# HANDLE THE UNEXPECTED
except TimeoutError:
            logger.error(
                "Query [{}] took too long to run".format(
                    alert_config['id']))
except AssertionError:
logger.error(
"Prometheus query failed:\n"
"Status:\t{}\n"
"Error Type:\t{}\n"
"Error Message:\t{}\n"
"Query:\n{}".format(
ret['status'],
ret['errorType'],
ret['error'],
alert_config['query']))
except gaierror:
logger.error(
"Unable to connect to smtp server: {}".format(
service_config['smtp_server']))
except Exception as e:
logger.error(
"Unhandled exception {} on alert: {}".format(
str(e), alert_config['id']))
finally:
sleep(alert_config['interval'])
# LOG ALERT RESULTS SO WE CAN DEBUG IF NEEDED
def log_alert_results(results, alert_config, logger):
"""
    Logs the results, broken out by the tags provided in the alert_config, to
    the logger for debugging
Args:
        results: the list of result objects returned from the KairosDB or
            Prometheus query
alert_config: config object of the alert
logger (log object): does the logging
Returns:
None, logs to logger
"""
for v in results:
logger.debug("{} - Result: {}".format(alert_config['id'], v))
def send_alerts(
alert,
alert_config,
victorops_url,
slack_url,
slack_token,
smtp_server,
sensu_endpoint,
uchiwa_url,
logger):
"""
Sends out the alerts to VO, Email, and/or Slack
Args:
        alert: the alert tuple:
            alert[0] == subject, alert[1] == body, alert[2] == status code,
            alert[3] == alert_tags, alert[4] == md5sum
alert_config: the alert configuration object
victorops_url: url to victorops
slack_url: url to slack api calls
        slack_token: the Slack API token used when posting alerts
        smtp_server: the server to send mail messages to
        sensu_endpoint: URL of the Sensu API endpoint that receives check results
        uchiwa_url: URL of the Uchiwa dashboard linked from alert bodies
logger (log object): does the logging
Returns: None
"""
# GOING TO USE THIS FOR TAGGING SOME METRICS ABOUT WHAT ALERT CHANNEL WAS
# USED
tag_dict = dict()
tag_dict['alert'] = alert_config['id']
is_custom_alert_routing = has_custom_alert_routing(alert_config)
if is_custom_alert_routing:
alert_routing = alert_config.get('alert_routing_lookup', {})
alert_config['alerts'] = alert_routing.get(
alert[3], alert_config['alerts']['lookup']['default'])
    # once we move all alerts into Sensu, we don't need to do this
if 'filters' in alert_config:
logger.info(
"alert_status : {}, alert_config: {}".format(
alert[2], alert_config))
if 'slack_subdue' in alert_config['filters'] and alert[2] in (
1, 2) and alert_config['filters']['slack_subdue']:
            # unless the alert is critical we don't send it
logger.info("Removed slack, alert_config: {}".format(alert_config))
alert_config['alerts'].pop('slack', None)
if ('victorops_subdue' in alert_config['filters'] and
alert[2] in (1, 2) and
alert_config['filters']['victorops_subdue']):
            # unless the alert is critical we don't send it
alert_config['alerts'].pop('vo', None)
logger.info("Removed vo, alert_config: {}".format(alert_config))
# ====================
# VICTOROPS HANDLING
# ====================
if 'vo' in alert_config['alerts']:
for notify in alert_config['alerts']['vo']:
payload = dict(entity_id=alert[0],
message_type=alert_status[alert[2]],
state_message=alert[1])
r = None
try:
r = requests.post(
victorops_url + notify,
data=json.dumps(payload),
                    headers={
                        "Content-type": "application/json"})
assert r.status_code == 200
# Record a VO alert sent event
tag_dict['alert_channel_type'] = "VictorOps"
tag_dict['who'] = "vo:{}".format(notify)
send_stat("alert_channel", 1, tag_dict)
# logger.info("TestInfo: {} alert for {}".format(alert_status(alert[2]), alert[0]))
except AssertionError:
logger.error(
"Post to VO failed for {}\n{}:\t{}".format(
alert_config['id'], r.status_code, r.text))
except Exception as e:
logger.error("Unhandled exception for alert_id:{} "
"when posting to VO: {}".format(
alert_config['id'], str(e)))
# ====================
# EMAIL HANDLING
# ====================
if 'email' in alert_config['alerts'] and (
alert[2] == 0 or alert[2] == 1 or alert[2] == 3):
msg = MIMEText(alert[1])
msg['Subject'] = '{} Status: {}'.format(
alert[0], alert_status[alert[2]])
msg['From'] = 'aom@qualtrics.com'
msg['To'] = ','.join(
[x + "@qualtrics.com" for x in alert_config['alerts']['email']])
try:
s = smtplib.SMTP(smtp_server)
s.send_message(msg)
s.quit()
# Record an Email alert sent event
tag_dict['alert_channel_type'] = "Email"
tag_dict['who'] = "email:{}".format(msg['To'])
send_stat("alert_channel", 1, tag_dict)
# logger.info("TestInfo: {} alert for {}".format(alert_status(alert[2]), alert[0]))
except Exception as e:
logger.error(
"Unhandled exception when sending mail for {} to {}\n{}".format(
alert_config['id'], smtp_server, str(e)))
# ====================
# SENSU HANDLING
# ====================
if 'sensu' in alert_config['alerts']:
# Dictionary with static values for Sensu
sensu_dict = {
'source': 'AOM',
'refresh': 3600,
'occurrences': 1,
'name': alert_config['id']+'__'+alert[4]}
# if alert[3]:
# logger.info(alert)
# sensu_dict['name'] = '_'.join(
# [alert_config['id']] + sorted(list(alert[3])))
if 'refresh' in alert_config:
sensu_dict['refresh'] = alert_config['refresh']
sensu_dict['interval'] = alert_config['interval']
sensu_dict['handlers'] = []
sensu_dict['dashboard'] = alert_config['url']
if 'dependencies' in alert_config['alerts']['sensu'].keys():
sensu_dict['dependencies'] = (alert_config['alerts']
['sensu']['dependencies'])
if 'victorops' in alert_config['alerts']['sensu'].keys():
sensu_dict['handlers'].append("victorops")
sensu_dict['routing_key'] = (alert_config['alerts']
['sensu']['victorops'])
# # Leave this here until we have email support in Sensu
# if 'email' in alert_config['alerts']['sensu'].keys():
# sensu_dict['handlers'].append("email")
# # verify this option
# sensu_dict['email'] = alert_config['alerts']['sensu']['email']
if 'slack' in alert_config['alerts']['sensu'].keys():
sensu_dict['handlers'].append("slack")
sensu_dict['slack_channel'] = (
alert_config['alerts']['sensu']['slack'])
# Format alert message
sensu_dict['dashboard'] = (
"<{}|here> , Uchiwa: <{}?check={}|here> ".format(
alert_config['url'], uchiwa_url, alert_config['id']))
if 'jira' in alert_config['alerts']['sensu'].keys():
sensu_dict['handlers'].append("jira")
sensu_dict.update(alert_config['alerts']['sensu']['jira'])
if 'filters' in alert_config:
sensu_dict['filters'] = alert_config['filters']
# 0 = OK, 1 = WARNING, 2 = CRITICAL
sensu_status = {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2}
sensu_dict['status'] = sensu_status[alert[2]]
sensu_dict['output'] = alert[1]
r = None
try:
user = os.environ['API_USER']
passwd = os.environ['API_PASS']
r = requests.post(
sensu_endpoint,
json.dumps(sensu_dict),
auth=(
user,
passwd))
assert r.status_code == 202
except AssertionError:
logger.error(
"Post to Sensu failed {}\n{}:\t{}".format(
alert_config['id'],
r.status_code,
r.text))
except Exception as e:
logger.error("Unhandled exception for alert_id:{} "
"when posting to Sensu: {}".format(
alert_config['id'], str(e)))
# ====================
# SLACK HANDLING - all Slack alerts will go through Sensu
# ====================
if 'slack' in alert_config['alerts'] and (
alert[2] == 0 or alert[2] == 1 or alert[2] == 3):
refresh = alert_config.get('refresh', 3600)
dashboard = alert_config.get('url', '')
sensu_status = {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2}
sensu_dict2 = {'handlers': ['slack'],
'interval': alert_config['interval'],
'source': 'AOM',
'refresh': refresh,
'occurrences': 1,
'name': alert_config['id']+'__'+alert[4],
'dashboard': dashboard,
'status': sensu_status[alert[2]],
'output': alert[1]}
if is_custom_alert_routing:
sensu_dict2['name'] = '_'.join(
[alert_config['id']] + list(alert[3]))
sensu_dict2['dashboard'] = (
"<{}|here> , Uchiwa: <{}?check={}|here> ".format(
alert_config['url'], uchiwa_url, alert_config['id']))
for channel in alert_config['alerts']['slack']:
sensu_dict2['slack_channel'] = channel
r = None
try:
user = os.environ['API_USER']
passwd = os.environ['API_PASS']
r = requests.post(
sensu_endpoint,
json.dumps(sensu_dict2),
auth=(
user,
passwd))
assert r.status_code == 202
except AssertionError:
logger.error(
"Post to Sensu failed {}\n{}:\t{}".format(
alert_config['id'], r.status_code, r.text))
except Exception as e:
logger.error("Unhandled exception for alert_id:{} when posting"
"to Sensu: {}".format(alert_config['id'], str(e)))
# payload = dict(token=slack_token, channel=channel,
# text="{} Status: {}".format(alert[1], alert_status[alert[2]]))
# r = None
# try:
# r = requests.post(slack_url, data=payload)
# assert r.status_code == 200
# # Record an Slack alert sent event
# tag_dict['alert_channel_type'] = "Slack"
# tag_dict['who'] = "slack:{}".format(channel)
# send_stat("alert_channel", 1, tag_dict)
# # logger.info("TestInfo: {} alert for {}".format(alert_status(alert[2]), alert[0]))
# except AssertionError:
# logger.error("Post to Slack failed for {}\n{}:\t{}".format(alert_config['id'], r.status_code, r.text))
# except Exception as e:
# logger.error("Unhandled exception for alert_id:{} when posting to Slack: {}".format(alert_config['id'],
# str(e)))
def send_metrics(alert, value, result, gaugename='stats'):
"""
Sends the results from the alert check to statsd
Args:
        alert: the alert config object that holds the alert['tags'] list
gaugename: The name of the gauge metric we send.
value: The value we want to send as a gauge.
result: The result object from making the call. Use the data in this
object to tag the metric.
Returns: None
"""
# GROUP ALL THE ALERTS TOGETHER SO THAT PEEPS CAN FILTER OUT BY TAG THEIR
# SPECIFIC ALERTS
result_tags = list(itertools.chain(
*[result['tags'][x] for x in alert['tags']]))
tag_dict = dict()
for x in range(len(alert['tags'])):
tag_dict[alert['tags'][x]] = result_tags[x]
tag_dict['alert'] = alert['id']
# SEND THE METRIC
send_stat(gaugename, value, tag_dict)
def send_stat(gaugename, value, tag_dict, statprefix='aom'):
"""Sends stats value to statsd"""
client = StatsClient('telegraf', 8125, statprefix)
# SUBMIT STATS
client.gauge(gaugename, value, tags=tag_dict)
def has_custom_alert_routing(alert_config):
"""Checks if alert has custom routing"""
return 'lookup' in alert_config['alerts']
def get_alert_tags(alert_config, query_result):
"""Retrieves custom tags from alert"""
query_tags = []
for tag in alert_config['alerts']['lookup']['tags']:
if (alert_config.get('query_type') == 'prometheus' and
'metric' in query_result and
tag in query_result['metric']):
query_tags.append(query_result['metric'][tag])
elif ('tags' in query_result and tag in query_result['tags']
and query_result['tags'][tag]):
query_tags.append(query_result['tags'][tag][0])
return tuple(query_tags)
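# Routing sketch (not part of the original module, no network calls): a
# hypothetical alert_config whose 'alerts' section uses the 'lookup' form, plus
# one Prometheus-style and one KairosDB-style fake result, showing what
# has_custom_alert_routing() and get_alert_tags() return for each.
if __name__ == "__main__":
    example_config = {
        'id': 'example_latency_alert',
        'query_type': 'prometheus',
        'alerts': {
            'lookup': {
                'tags': ['datacenter'],
                'default': {'slack': ['#example-alerts']},
            },
        },
    }
    prom_result = {'metric': {'datacenter': 'dc1'}, 'values': []}
    kairos_result = {'tags': {'datacenter': ['dc2']}, 'values': []}
    print(has_custom_alert_routing(example_config))      # True
    print(get_alert_tags(example_config, prom_result))   # ('dc1',)
    example_config['query_type'] = 'kairosdb'
    print(get_alert_tags(example_config, kairos_result)) # ('dc2',)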

View File

@@ -0,0 +1,123 @@
import unittest
class Mock_Alert_Config() :
def __init__(self) :
self.cache = {}
self.level = {}
self.id = "id"
def set_level(self, k, v) :
self.level[k] = v
def get_level(self, k) :
if not k in self.level :
return None
return self.level[k]
def init_for_tags(self, *args) :
pass
def occurrences(self) :
return 1
    def get_threshold(self, isUpper, isWarning) :
        if isWarning :
            return None, False
        if isUpper :
            return 10, True
        else :
            return 0, True
def get_tags(self) :
return "tagsC, tagsD".split(", ")
def set_for_tags(self, key, value) :
if not key in self.cache :
self.cache[key] = 0
self.cache[key] = value
def get_for_tags(self, key) :
if not key in self.cache :
self.cache[key] = 0
return self.cache[key]
class Mock_Result() :
def __init__(self) :
pass
def __getitem__(self, key) :
if key == "tags" :
return self
else :
return key
class Mock_Logger() :
def __init__(self) :
for k in ["error", "warn", "debug", "info", "warning"] :
setattr(self, k, self.log)
def log(self, *args) :
pass
class Test_Alert(unittest.TestCase) :
def test_set_tags(self) :
import alert
ac = Mock_Alert_Config()
res = Mock_Result()
al = alert.Alert(ac, Mock_Logger(), None, None, -1, 11)
self.assertEqual(al.get_tags(), "instance")
al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, -1, 11)
self.assertEqual(al.get_tags(), "tagsA, tagsB")
al.set_tags("a, b, c", res)
self.assertEqual(al.get_tags(), "a, b, c")
al.set_tags("a, b, c", res)
self.assertEqual(al.get_tags(), "a, b, c")
def test_firing(self) :
import alert
ac = Mock_Alert_Config()
res = Mock_Result()
al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, -1, 11)
self.assertTrue(al.get_firing())
al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, 1, 11)
self.assertTrue(al.get_firing())
al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, -1, 9)
self.assertTrue(al.get_firing())
al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, 1, 9)
self.assertFalse(al.get_firing())
def test_str(self) :
import alert
ac = Mock_Alert_Config()
res = Mock_Result()
alert = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, 0, 10)
self.assertEqual(alert.name(), "Metric: id for tagsA, tagsB")
self.assertEqual(alert.body(), "")
def test_occurrences(self) :
import alert
ac = Mock_Alert_Config()
res = Mock_Result()
al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, 0, 10)
self.assertEqual(False, al.occurrences_breached)
al.set_occurrences()
al.set_occurrences()
al.set_occurrences()
self.assertEqual(False, al.occurrences_breached)
self.assertEqual(0, ac.get_for_tags(al.get_tags()))
al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, 0, 11)
self.assertEqual(True, al.occurrences_breached)
al.set_occurrences()
al.set_occurrences()
al.set_occurrences()
self.assertEqual(True, al.occurrences_breached)
self.assertEqual(4, ac.get_for_tags(al.get_tags()))
if __name__ == "__main__" :
unittest.main()

View File

@@ -0,0 +1,33 @@
import unittest
import alert_factory
class Mock_Alert() :
def __init__(self, *args) :
self.args = args
class Mock_Logger() :
def __init__(self) :
self.info = self.log
self.warn = self.log
self.warning = self.log
self.error = self.log
self.debug = self.log
def log(self, *args, **kwargs) :
print(args, kwargs)
class Test_Alert_Factory(unittest.TestCase) :
def setUp(self) :
self.was = alert_factory.Alert
alert_factory.Alert = Mock_Alert
def tearDown(self) :
alert_factory.Alert = self.was
def test(self) :
af = alert_factory.Alert_Factory(None, Mock_Logger())
alert = af.build(0, 5, None, "tagA, tagB", False, "tagC, tagD")
self.assertTrue(type(alert) == Mock_Alert)
if __name__ == "__main__" :
unittest.main()

View File

@@ -0,0 +1,8 @@
import unittest
class Test_Service(unittest.TestCase) :
def test(self) :
raise Exception("not impl")
if __name__ == "__main__" :
unittest.main()

View File

@@ -0,0 +1,14 @@
import unittest
class Test_Threshold(unittest.TestCase) :
def test(self) :
import threshold
tl = threshold.Threshold(5)
self.assertFalse(tl.can_breach())
self.assertFalse(tl.exceeds(7))
self.assertFalse(tl.exceeds(3))
if __name__ == "__main__" :
unittest.main()

View File

@@ -0,0 +1,14 @@
import unittest
class Test_Threshold_Lower(unittest.TestCase) :
def test(self) :
import threshold_lower
tl = threshold_lower.Threshold_Lower(5)
        self.assertTrue(tl.can_breach())
self.assertTrue(tl.exceeds(3))
self.assertFalse(tl.exceeds(7))
if __name__ == "__main__" :
unittest.main()

View File

@@ -0,0 +1,14 @@
import unittest
class Test_Threshold_Upper(unittest.TestCase) :
def test(self) :
import threshold_upper
tl = threshold_upper.Threshold_Upper(5)
        self.assertTrue(tl.can_breach())
self.assertTrue(tl.exceeds(7))
self.assertFalse(tl.exceeds(3))
if __name__ == "__main__" :
unittest.main()

View File

@@ -0,0 +1,157 @@
import unittest
class Mock_Alert_Config() :
def __init__(self) :
self.upCrit = 10
self.lowCrit = 1
def get_threshold(self, upper, warn) :
if upper and warn :
return None, False
elif upper and not warn :
return self.upCrit, True
elif not upper and warn :
return None, False
else:
return self.lowCrit, True
class Test_Thresholds(unittest.TestCase) :
def test_breached_both(self) :
import thresholds
alert_config = Mock_Alert_Config()
t = thresholds.Thresholds(alert_config)
t.set_breached(alert_config.lowCrit-1, alert_config.upCrit+1)
should_fire = [
t.critical_breached(),
t.lower_breached(),
t.upper_breached(),
t.level_breached(t.CRITICAL),
t.end_breached(t.LOWER),
t.end_breached(t.UPPER),
t.get_breached(),
t.get_breached(level=t.CRITICAL),
t.get_breached(end=t.LOWER),
t.get_breached(end=t.UPPER),
]
for i in range(len(should_fire)) :
self.assertTrue(should_fire[i], i)
should_not_fire = [
t.warning_breached(),
t.level_breached(t.WARNING),
t.get_breached(level=t.WARNING),
]
for i in range(len(should_not_fire)) :
self.assertFalse(should_not_fire[i], i)
def test_breached_lower(self) :
import thresholds
alert_config = Mock_Alert_Config()
t = thresholds.Thresholds(alert_config)
t.set_breached(alert_config.lowCrit-1, alert_config.upCrit)
should_fire = [
t.critical_breached(),
t.lower_breached(),
t.level_breached(t.CRITICAL),
t.end_breached(t.LOWER),
t.get_breached(),
t.get_breached(level=t.CRITICAL),
t.get_breached(end=t.LOWER),
]
for i in range(len(should_fire)) :
self.assertTrue(should_fire[i], i)
should_not_fire = [
t.warning_breached(),
t.upper_breached(),
t.level_breached(t.WARNING),
t.end_breached(t.UPPER),
t.get_breached(level=t.WARNING),
t.get_breached(end=t.UPPER),
]
for i in range(len(should_not_fire)) :
self.assertFalse(should_not_fire[i], i)
def test_breached_upper(self) :
import thresholds
alert_config = Mock_Alert_Config()
t = thresholds.Thresholds(alert_config)
t.set_breached(alert_config.lowCrit, alert_config.upCrit+1)
should_fire = [
t.critical_breached(),
t.upper_breached(),
t.level_breached(t.CRITICAL),
t.end_breached(t.UPPER),
t.get_breached(),
t.get_breached(level=t.CRITICAL),
t.get_breached(end=t.UPPER),
]
for i in range(len(should_fire)) :
self.assertTrue(should_fire[i], i)
for i in [
t.warning_breached(),
t.lower_breached(),
t.level_breached(t.WARNING),
t.end_breached(t.LOWER),
t.get_breached(level=t.WARNING),
t.get_breached(end=t.LOWER),
] :
self.assertFalse(i)
def test_breached_notset(self) :
import thresholds
alert_config = Mock_Alert_Config()
t = thresholds.Thresholds(alert_config)
for i in [
t.warning_breached(),
t.critical_breached(),
t.upper_breached(),
t.lower_breached(),
t.level_breached(t.CRITICAL),
t.level_breached(t.WARNING),
t.end_breached(t.UPPER),
t.end_breached(t.LOWER),
t.get_breached(),
t.get_breached(level=t.CRITICAL),
t.get_breached(level=t.WARNING),
t.get_breached(end=t.UPPER),
t.get_breached(end=t.LOWER),
] :
self.assertFalse(i)
def test_get_matching(self) :
import thresholds
alert_config = Mock_Alert_Config()
t = thresholds.Thresholds(alert_config)
self.assertEqual(4, len([i for i in t.get_thresholds_matching()]))
self.assertEqual(2, len([i for i in t.get_thresholds_matching(level=t.CRITICAL)]))
self.assertEqual(2, len([i for i in t.get_thresholds_matching(level=t.WARNING)]))
self.assertEqual(2, len([i for i in t.get_thresholds_matching(end=t.UPPER)]))
self.assertEqual(2, len([i for i in t.get_thresholds_matching(end=t.LOWER)]))
        self.assertEqual(1, len([i for i in t.get_thresholds_matching(level=t.CRITICAL, end=t.LOWER)]))
        self.assertEqual(1, len([i for i in t.get_thresholds_matching(level=t.CRITICAL, end=t.UPPER)]))
        self.assertEqual(1, len([i for i in t.get_thresholds_matching(level=t.WARNING, end=t.LOWER)]))
        self.assertEqual(1, len([i for i in t.get_thresholds_matching(level=t.WARNING, end=t.UPPER)]))
if __name__ == "__main__" :
unittest.main()

View File

@@ -0,0 +1,19 @@
class Threshold() :
def __init__(self, threshold) :
self.threshold = threshold
self.breached = False
def can_breach(self) :
return False
def set_breached(self, value) :
self.breached = self.exceeds(value)
def get_breached(self) :
return self.breached
def exceeds(self, value) :
return False
def get_threshold(self) :
return self.threshold

View File

@@ -0,0 +1,8 @@
from threshold import Threshold
class Threshold_Lower(Threshold) :
def exceeds(self, value) :
return self.threshold > value
def can_breach(self) :
return True

View File

@@ -0,0 +1,8 @@
from threshold import Threshold
class Threshold_Upper(Threshold) :
def exceeds(self, value) :
return self.threshold < value
def can_breach(self) :
return True

View File

@@ -0,0 +1,67 @@
from threshold_upper import Threshold_Upper
from threshold_lower import Threshold_Lower
from threshold import Threshold
class Thresholds() :
WARNING = True
CRITICAL = False
UPPER = True
LOWER = False
def __init__(self, alert_config) :
self.alert_config = alert_config
self.thresholds = {}
for level in [ Thresholds.WARNING, Thresholds.CRITICAL ] :
self.thresholds[level] = {}
for end in [ Thresholds.UPPER, Thresholds.LOWER ] :
constructor = Threshold_Upper
if end == Thresholds.LOWER :
constructor = Threshold_Lower
self.thresholds[level][end] = self.create_threshold(end, level, constructor)
def create_threshold(self, isUpper, isWarning, constructor) :
value, has = self.alert_config.get_threshold(isUpper, isWarning)
if not has :
constructor = Threshold
return constructor(value)
def warning_breached(self) :
return self.level_breached(Thresholds.WARNING)
def critical_breached(self) :
return self.level_breached(Thresholds.CRITICAL)
def upper_breached(self) :
return self.end_breached(Thresholds.UPPER)
def lower_breached(self) :
return self.end_breached(Thresholds.LOWER)
def level_breached(self, level) :
return self.get_breached(level=level)
def end_breached(self, end) :
return self.get_breached(end=end)
    def can_breach(self) :
        can_breach = [t for t in self.get_thresholds_matching() if not type(t) is Threshold]
        return len(can_breach) > 0
def get_breached(self, level=None, end=None) :
for threshold in self.get_thresholds_matching(level=level, end=end) :
if threshold.get_breached() :
return True
return False
def set_breached(self, min_value, max_value) :
for threshold in self.get_thresholds_matching(end=Thresholds.LOWER) :
threshold.set_breached(min_value)
for threshold in self.get_thresholds_matching(end=Thresholds.UPPER) :
threshold.set_breached(max_value)
def get_thresholds_matching(self, level=None, end=None) :
for l in self.thresholds :
if level is None or l == level :
for e in self.thresholds[l] :
if end is None or e == end :
yield self.thresholds[l][e]
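# Evaluation sketch (not part of the original file): _Stub_Config is a
# hypothetical config exposing the get_threshold(isUpper, isWarning) contract
# used above - warning band 2..8 and critical band 0..10. Feeding a min/max
# pair into set_breached() then lets us ask which level/end combinations fired.
if __name__ == "__main__":
    class _Stub_Config() :
        def get_threshold(self, isUpper, isWarning) :
            table = {
                (True, True): 8,    # upper warning
                (True, False): 10,  # upper critical
                (False, True): 2,   # lower warning
                (False, False): 0,  # lower critical
            }
            return table[(isUpper, isWarning)], True
    t = Thresholds(_Stub_Config())
    t.set_breached(min_value=1, max_value=9)
    print(t.warning_breached())   # True: 9 > 8 and 1 < 2
    print(t.critical_breached())  # False: still inside the 0..10 band
    print(t.upper_breached(), t.lower_breached())  # True True (warning ends)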

View File

@@ -0,0 +1,14 @@
import unittest
class Test_Alert_Config(unittest.TestCase):
def test(self) :
from alert_config import Alert_Config
try :
Alert_Config(None)
self.fail("did not fail on nil yaml_config")
except Exception :
pass
self.assertEqual("a", Alert_Config({"id":"a"}).id)
if __name__ == "__main__" :
unittest.main()

View File

@@ -0,0 +1,54 @@
import unittest
import alert_config_list
class Mock_Alert_Config() :
def __init__(self, id) :
self.id = id
class Test_Alert_Config_List(unittest.TestCase):
def setUp(self) :
self.was = alert_config_list.Alert_Config
alert_config_list.Alert_Config = Mock_Alert_Config
self.al = alert_config_list.Alert_Config_List()
def tearDown(self) :
alert_config_list.Alert_Config = self.was
self.al = None
def test_add(self) :
self.al.add(Mock_Alert_Config("a"))
self.assertEqual(len(self.al), 1)
self.al.add([Mock_Alert_Config("a")])
self.assertEqual(len(self.al), 1)
self.al.add([Mock_Alert_Config("b")])
self.assertEqual(len(self.al), 2)
self.al.add(Mock_Alert_Config("c"))
self.assertEqual(len(self.al), 3)
other = alert_config_list.Alert_Config_List()
other.add(Mock_Alert_Config("d"))
self.al.add(other)
self.assertEqual(len(self.al), 4)
def test_compare(self) :
self.al.add(Mock_Alert_Config("a"))
self.al.add(Mock_Alert_Config("b"))
self.al.add(Mock_Alert_Config("c"))
new = alert_config_list.Alert_Config_List()
new.add(Mock_Alert_Config("a"))
new.add(Mock_Alert_Config("y"))
new.add(Mock_Alert_Config("z"))
added, removed, modified = self.al.compare(new)
if not "y" in added or not "z" in added :
self.fail("added is missing elements")
if not "b" in removed or not "c" in removed :
self.fail("removed is missing elements")
if not "a" in modified :
self.fail("modified is missing elements")
if __name__ == "__main__" :
unittest.main()

34
AoM_Service/library/test_job.py Executable file
View File

@@ -0,0 +1,34 @@
import unittest
import job
class Mock_Subprocess() :
called = False
joined = False
pid = None
def __init__(self) :
pass
def call(self, *args, **kwargs) :
self.called = True
def join(self, *args, **kwargs) :
self.joined = True
class Test_Job(unittest.TestCase):
def setUp(self) :
self.was = job.subprocess
self.subprocess = Mock_Subprocess()
job.subprocess = self.subprocess
def tearDown(self) :
job.subprocess = self.was
def test(self) :
p = Mock_Subprocess()
j = job.Job("id", p)
j.kill()
self.assertEqual(p.joined, True)
self.assertEqual(self.subprocess.called, True)
if __name__ == "__main__" :
unittest.main()

View File

@@ -0,0 +1,50 @@
import unittest
import job_list
class Mock_Job() :
def __init__(self, id, p) :
self.id = id
def kill(self) :
return
class Test_Job_List(unittest.TestCase):
def setUp(self) :
self.was = job_list.Job
job_list.Job = Mock_Job
def tearDown(self) :
job_list.Job = self.was
def test_add(self) :
jl = job_list.Job_List()
self.assertEqual(len(jl), 0)
try :
jl.add(None)
self.fail("can add nil to job_list")
except Exception :
pass
jl.add(Mock_Job("a", "a"))
self.assertEqual(len(jl), 1)
jl.add(Mock_Job("a", "a"))
self.assertEqual(len(jl), 1)
jl.add(Mock_Job("b", "b"))
self.assertEqual(len(jl), 2)
other = job_list.Job_List()
other.add(Mock_Job("b", "b"))
other.add(Mock_Job("c", "c"))
jl.add(other)
self.assertEqual(len(jl), 3)
jl.kill("a")
self.assertEqual(len(jl), 2)
if __name__ == "__main__" :
unittest.main()

View File

@@ -0,0 +1,31 @@
import unittest
import process
class Mock_Multiprocessing():
def __init__(self, *args, **kwargs) :
self.args = args
self.kwargs = kwargs
def get_target(self) :
return None
class Mock_Alert_Config() :
def __init__(self, id) :
self.id = id
class Test_Process(unittest.TestCase):
def setUp(self) :
self.was = process.multiprocessing.Process
process.multiprocessing.Process = Mock_Multiprocessing
def tearDown(self) :
process.multiprocessing.Process = self.was
def test(self) :
class MockProcess(process.Process) :
def get_target(self) :
return None
p = MockProcess(Mock_Alert_Config("a"), {}, None, True)
if __name__ == "__main__" :
unittest.main()

View File

@@ -0,0 +1,36 @@
import unittest

import process_factory


class Mock_Process_Prometheus() :
    def __init__(self, *args, **kwargs) :
        pass


class Mock_Process_Kairos() :
    def __init__(self, *args, **kwargs) :
        pass


class Mock_Alert_Config() :
    def __init__(self, type) :
        self.t = type
    def type(self) :
        return self.t


class Test_Process_Factory(unittest.TestCase):
    def setUp(self) :
        self.was_prom = process_factory.process_prometheus.Process_Prometheus
        self.was_kai = process_factory.process_kairos.Process_Kairos
        process_factory.process_prometheus.Process_Prometheus = Mock_Process_Prometheus
        process_factory.process_kairos.Process_Kairos = Mock_Process_Kairos
    def tearDown(self) :
        process_factory.process_prometheus.Process_Prometheus = self.was_prom
        process_factory.process_kairos.Process_Kairos = self.was_kai
    def test(self) :
        factory = process_factory.Process_Factory(None, None, None)
        self.assertTrue(type(factory.build(Mock_Alert_Config("a"))) is Mock_Process_Kairos)
        self.assertTrue(type(factory.build(Mock_Alert_Config("prometheus"))) is Mock_Process_Prometheus)


if __name__ == "__main__" :
    unittest.main()
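
The only rule this test pins down is the dispatch: build() returns a Process_Prometheus when the config's type() is "prometheus" and falls back to Process_Kairos for any other value. A sketch of that dispatch, with the meaning of the three constructor arguments assumed since the real factory is not shown:

import process_kairos
import process_prometheus


class Process_Factory_Sketch:
    def __init__(self, settings, logger, stats):
        self.settings = settings
        self.logger = logger
        self.stats = stats

    def build(self, alert_config):
        # anything that is not explicitly prometheus gets a KairosDB check process
        if alert_config.type() == "prometheus":
            return process_prometheus.Process_Prometheus(
                alert_config, self.settings, self.logger, self.stats)
        return process_kairos.Process_Kairos(
            alert_config, self.settings, self.logger, self.stats)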


@@ -0,0 +1,15 @@
import unittest


class Mock_Alert_Config() :
    def __init__(self, id) :
        self.id = id


class Test_Process_Kairos(unittest.TestCase):
    def test(self) :
        import process_kairos
        from serviceapp import service
        p = process_kairos.Process_Kairos(Mock_Alert_Config("a"), None, None, None)
        self.assertEqual(p.get_target(), service.check_kairosdb_alert)


if __name__ == "__main__" :
    unittest.main()


@@ -0,0 +1,15 @@
import unittest


class Mock_Alert_Config() :
    def __init__(self, id) :
        self.id = id


class Test_Process_Prometheus(unittest.TestCase):
    def test(self) :
        import process_prometheus
        from serviceapp import service
        p = process_prometheus.Process_Prometheus(Mock_Alert_Config("a"), None, None, None)
        self.assertEqual(p.get_target(), service.check_prometheus_alert)


if __name__ == "__main__" :
    unittest.main()


@@ -0,0 +1,100 @@
import unittest

from serviceapp import service as serviceapp
import time

import config
import service


class Mock_ServiceApp_Service() :
    def __init__(self, *args, **kwargs) :
        self.args = args
        self.kwargs = kwargs
    def send_stat(self, *args, **kwargs) :
        return


class Mock_Logger() :
    def __init__(self) :
        self.lines = []
        self.info = self.log
        self.warn = self.log
        self.warning = self.log
        self.debug = self.log
        self.error = self.log
    def log(self, *args, **kwargs) :
        self.lines.append("{}, {}".format(args, kwargs))
        print(self.lines[-1])


def Mock_Sleep(t) :
    return


def Mock_Get_Healthy(*args, **kwargs) :
    return 0, 1


def Mock_Distribute_Configs(*args, **kwargs) :
    return True


def Mock_Is_Valid(*args, **kwargs) :
    return True


def ignore_warnings(test_func):
    import warnings
    def do_test(self, *args, **kwargs):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            test_func(self, *args, **kwargs)
    return do_test


class Test_Service(unittest.TestCase) :
    def setUp(self) :
        self.mock_serviceapp_service = Mock_ServiceApp_Service
        self.was_k = serviceapp.check_kairosdb_alert
        self.was_p = serviceapp.check_prometheus_alert
        self.was_service = service.service
        self.was_sleep = time.sleep
        self.was_get_healthy = config.get_healthy_nodes_and_index
        self.was_distribute = config.distribute_configs
        self.was_is_valid = config.is_valid
        serviceapp.check_kairosdb_alert = self.mock_serviceapp_service
        serviceapp.check_prometheus_alert = self.mock_serviceapp_service
        config.get_healthy_nodes_and_index = Mock_Get_Healthy
        config.distribute_configs = Mock_Distribute_Configs
        config.is_valid = Mock_Is_Valid
        serviceapp.sleep = Mock_Sleep
        service.sleep = Mock_Sleep
        time.sleep = Mock_Sleep
    def tearDown(self) :
        # restore the module references saved in setUp
        service.service = self.was_service
        serviceapp.check_kairosdb_alert = self.was_k
        serviceapp.check_prometheus_alert = self.was_p
        config.get_healthy_nodes_and_index = self.was_get_healthy
        config.distribute_configs = self.was_distribute
        config.is_valid = self.was_is_valid
        time.sleep = self.was_sleep
        serviceapp.sleep = self.was_sleep
        service.sleep = self.was_sleep
    @ignore_warnings
    def test(self) :
        import service
        logger = Mock_Logger()
        s = service.Service(logger, 100, "HOST", {
            "alert_folder": "./testdata",
            "alert_routing_config": {},
        })
        global first
        first = True
        def f() :
            global first
            is_first = first
            first = False
            return is_first
        def purge_stale(*args) :
            return
        # run exactly one loop iteration, skipping the stale-alert purge
        s.is_running = f
        s.purge_stale = purge_stale
        s.start()


if __name__ == "__main__" :
    unittest.main()

AoM_Service/library/testdata/engine.yaml vendored Executable file

@@ -0,0 +1,20 @@
---
id: sleeper_agents_milleniumfalcon_engine_failing
service: core
alerts:
  slack:
    - '#breel_testing_alerts'
  vo:
    - gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
suppressed_occurrences_threshold: 24
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_engine_failing) by (dc)
tags:
  - dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
service_dependencies: ['fuel']

AoM_Service/library/testdata/fuel.yaml vendored Executable file

@@ -0,0 +1,18 @@
---
id: sleeper_agents_milleniumfalcon_fuellevel_low
service: fuel
alerts:
  slack:
    - '#breel_testing_alerts'
  vo:
    - gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_fuellevel_low) by (dc)
tags:
  - dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1

AoM_Service/library/testdata/lightspeed.yaml vendored Executable file

@@ -0,0 +1,20 @@
---
id: sleeper_agents_milleniumfalcon_lightspeed_unavailable
service: captain
alerts:
  slack:
    - '#breel_testing_alerts'
  vo:
    - gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
suppressed_occurrences_threshold: 48
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_lightspeed_unavailable) by (dc)
tags:
  - dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
service_dependencies: ['core']

AoM_Service/library/testdata/shields.yaml vendored Executable file

@@ -0,0 +1,20 @@
---
id: sleeper_agents_milleniumfalcon_shields_unavailable
service: core
alerts:
  slack:
    - '#breel_testing_alerts'
  vo:
    - gobs-mm
critical_upper_threshold: 1.0
interval: 5
suppressed_occurrences_threshold: 54
start_time: '-60'
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_shields_unavailable) by (dc)
tags:
  - dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
service_dependencies: ['fuel']
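
Together these fixtures double as a schema reference for an alert config: thresholds, query settings, Slack/VictorOps routing, and an optional service_dependencies list naming the services whose alerts should suppress this one (engine and shields depend on fuel, lightspeed on core). They load cleanly with PyYAML from requirements.txt; a quick sanity check, assuming it is run from AoM_Service/library:

import glob

import yaml

# print the dependency edges declared by the test fixtures
for path in sorted(glob.glob("testdata/*.yaml")):
    with open(path) as f:
        cfg = yaml.safe_load(f)
    print(cfg["id"], "service:", cfg.get("service"),
          "depends on:", cfg.get("service_dependencies", []))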

AoM_Service/publish.sh Executable file

@@ -0,0 +1,42 @@
#!/bin/bash
GIT_COMMIT=$(git rev-parse HEAD)
if [[ $GIT_COMMIT == "" ]]; then
    echo "--Missing required GIT_COMMIT var. Aborting..."
    exit 1
fi

# Set up useful vars
team="engvis"
app="alert-on-metrics-app"
registryV2="registry-app.eng.qops.net:5001"
pathV2="${registryV2}/${team}/${app}"
commitV2="${pathV2}:${GIT_COMMIT}"
latestV2="${pathV2}:latest"

# In case you use relative paths
DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
cd "$DIR"

echo "--Publishing $app $GIT_COMMIT"
echo "--Removing old images so they don't accumulate"
docker rmi "$latestV2"

# Now fail if anything doesn't work
set -e
if [ -f "$app/build.sh" ]; then
    echo "--Running pre-build steps"
    "$app/build.sh"
fi

docker build --pull=true --tag="$commitV2" --tag "$latestV2" .
echo "--Publishing app container"
docker push "$commitV2"
docker push "$latestV2"

AoM_Service/qvolution.sh Executable file

@@ -0,0 +1,48 @@
function _get_and_save_secret() {
    function is_set() {
        local name="$1"
        eval "echo \$$name" | grep . > /dev/null
    }
    local name="$1"
    eval "$name=\${$name:-}"
    if ! is_set $name; then
        eval "$name=$(security find-generic-password -a $USER -s $name -w 2> /dev/null)"
        if ! is_set "$name"; then
            eval "read -s -p 'Enter $name: ' $name" >&2
            eval "security add-generic-password -a $USER -s $name -w \$$name" >&2
            echo "" >&2
        fi
    fi
    eval "echo \$$name"
}

function get_and_save_secret() {
    _get_and_save_secret "$@" | tail -n 1
}

SENSU_API_USER="$(get_and_save_secret SENSU_API_USER)"
SENSU_API_PASS="$(get_and_save_secret SENSU_API_PASS)"
SLACK_API_TOKEN="$(get_and_save_secret SLACK_API_TOKEN)"
echo SENSU_USER=$SENSU_API_USER >&2
echo SENSU_PASS=$SENSU_API_PASS >&2
echo SLACK_TOKEN=$SLACK_API_TOKEN >&2

git submodule update --remote
rm -rf alert_configs
cp -r AoM_Configs/alert_configs .

docker build -t aom:dev .
docker rm -f aom
docker run \
    -e SLACK_API_TOKEN=${SLACK_API_TOKEN} \
    -e API_USER=$SENSU_API_USER \
    -e API_PASS=$SENSU_API_PASS \
    --rm \
    -d \
    -p 8080:8080 \
    --add-host telegraf:10.4.13.53 \
    --name aom \
    --add-host consul.service.consul:127.0.0.1 \
    -h 127.0.0.1 \
    aom:dev &

until curl localhost:8080/healthcheck; do sleep 1; done
docker logs -f aom

AoM_Service/requirements.txt Executable file

@@ -0,0 +1,7 @@
PyYAML
pip
setuptools
requests
pyaml
sanic
statsd-tags

AoM_Service/run.sh Executable file

@@ -0,0 +1,63 @@
#!/bin/ash
(
    while true; do
        redis-server
        sleep 10
    done
) &
/usr/src/app/echo-server &
/usr/src/app/echo-server -p 443 &
/usr/src/app/consul &

# Default values
KAIROSDB_URL=${KAIROSDB_URL:-http://kairosdb-metrics.service.eng.consul:8080/}
SMTP_SERVER=${SMTP_SERVER:-internal-smtp1-app.eng.qops.net:2525}
#SENSU_URL=${SENSU_URL:-https://sensu-api.eng.qops.net:443/results}
#SLACK_TOKEN=${SLACK_TOKEN:-xoxb-76976722775-WY6vtKAk0SQEb8qcbFkLMV81}
#VICTOROPS_URL=${VICTOROPS_URL:-https://alert.victorops.com/integrations/generic/20131114/alert/07f108fe-9183-45c3-a888-19e1432806c5/}
#CONSUL_URL=${CONSUL_URL:-http://consul1-app.eng.qops.net:8500/v1/kv/service/alert-on-metrics/leader-lock}
#AOM_GRAFANA_URL=${AOM_GRAFANA_URL:-https://grafana.eng.qops.net/d/000000113/alert-on-metrics?refresh=1m&orgId=1&var-dc=All&var-fqdn=All&from=now-6h&to=now&var-id=}
#UCHIWA_URL=${UCHIWA_URL:-https://uchiwa-app.eng.qops.net/#/client/EngOps/AOM}
SLACK_TOKEN=${SLACK_TOKEN:-na}
VICTOROPS_URL=${VICTOROPS_URL:-http://localhost:41912/}
CONSUL_URL=${CONSUL_URL:-http://localhost:41912/}
AOM_GRAFANA_URL=${AOM_GRAFANA_URL:-http://localhost:41912/}
UCHIWA_URL=${UCHIWA_URL:-http://localhost:41912/}
SENSU_URL=${SENSU_URL:-http://localhost:41912}
export AOM_GRAFANA_URL

# Update config
sed -i "s#{{{KAIROSDB_URL}}}#${KAIROSDB_URL}#g" service.yaml
sed -i "s#{{{VICTOROPS_URL}}}#${VICTOROPS_URL}#g" service.yaml
sed -i "s#{{{SLACK_TOKEN}}}#${SLACK_TOKEN}#g" service.yaml
sed -i "s#{{{SMTP_SERVER}}}#${SMTP_SERVER}#g" service.yaml
sed -i "s#{{{CONSUL_URL}}}#${CONSUL_URL}#g" service.yaml
sed -i "s#{{{SENSU_URL}}}#${SENSU_URL}#g" service.yaml
sed -i "s,{{{UCHIWA_URL}}},${UCHIWA_URL},g" service.yaml

# Starting service
if [ -n "${TEST}" ]; then
    sed -i '/alert_reload_interval:/ s/[0-9]\+/30/g' service.yaml
    python3 /usr/src/app/aom_service.py &
    sleep 17
    echo "Making current server leader"
    curl localhost:8080/override?enable=true
    echo "Starting the service"
    curl localhost:8080/healthcheck
    # run the test suite, then inspect its exit status below
    python3 /usr/src/app/aom_test.py
    if [ $? -ne 0 ]; then
        cat /usr/src/app/logs/aom_service.log
        echo "Test failed!"
        exit 1
    else
        cat /usr/src/app/logs/aom_service.log
        echo "Test succeeded. Exiting"
        exit 0
    fi
else
    python3 /usr/src/app/reporter/incoming/main.py &
    exec python3 /usr/src/app/aom_service.py
fi

AoM_Service/service.yaml Executable file

@@ -0,0 +1,27 @@
#=======================#
# All them URLS and tokens
#=======================#
kairosdb_url: "{{{KAIROSDB_URL}}}"
victorops_url: "{{{VICTOROPS_URL}}}"
slack_url: "https://slack.com/api/chat.postMessage"
slack_token: "{{{SLACK_TOKEN}}}"
smtp_server: "{{{SMTP_SERVER}}}"
consul_url: "{{{CONSUL_URL}}}"
sensu_endpoint: "{{{SENSU_URL}}}"
uchiwa_url: "{{{UCHIWA_URL}}}"
#=======================#
# Logging Information
#=======================#
log_path: "logs/aom_service.log"
#=======================#
# alerts configurations
#=======================#
alert_folder: "alert_configs"
alert_routing_lookup: "alert_routing_lookup"
alert_reload_interval: 300
#=======================#
# request timeout value
#=======================#
timeout: 90
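
run.sh fills in the {{{...}}} placeholders with sed before the service starts, so by the time the Python side reads this file it is plain YAML. A minimal read of the rendered file, assuming the service consumes it with PyYAML (which requirements.txt pulls in):

import yaml

with open("service.yaml") as f:
    settings = yaml.safe_load(f)

# a few of the knobs defined above
print(settings["alert_folder"], settings["alert_reload_interval"], settings["timeout"])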

mock/consul/main.go Executable file

@@ -0,0 +1,82 @@
package main

import (
    "flag"
    "fmt"
    "log"
    "net/http"
    "os"
    "strings"
)

func main() {
    // the PORT env var provides the default; the -p flag overrides it
    p := os.Getenv("PORT")
    if p == "" {
        p = "8500"
    }
    flag.StringVar(&p, "p", p, "port to listen on")
    flag.Parse()
    http.Handle("/v1/catalog/service/alert-on-metrics", http.HandlerFunc(catalogService))
    http.Handle("/v1/health/node/127.0.0.1", http.HandlerFunc(healthNode))
    log.Println("Listening on", p)
    if err := http.ListenAndServe(":"+strings.TrimPrefix(p, ":"), nil); err != nil {
        panic(err)
    }
}

func healthNode(w http.ResponseWriter, r *http.Request) {
    fmt.Fprintln(w, `
[
  {
    "CheckID": "check_healthcheck_alert-on-metrics_alert-on-metrics",
    "CreateIndex": 727094265,
    "Definition": {},
    "ModifyIndex": 727094265,
    "Name": "Serf Health Status",
    "Node": "gobs2-nomad.b1-prv.qops.net",
    "Notes": "",
    "Output": "Agent alive and reachable",
    "ServiceID": "",
    "ServiceName": "",
    "ServiceTags": [],
    "Status": "passing"
  }
]
`)
}

func catalogService(w http.ResponseWriter, r *http.Request) {
    fmt.Fprintln(w, `
[
  {
    "Address": "127.0.0.1",
    "CreateIndex": 231035602,
    "Datacenter": "eng",
    "ID": "95dace59-f06b-d483-a06e-38288dc2019a",
    "ModifyIndex": 231035602,
    "Node": "127.0.0.1",
    "NodeMeta": {
      "consul-network-segment": ""
    },
    "ServiceAddress": "",
    "ServiceConnect": {},
    "ServiceEnableTagOverride": false,
    "ServiceID": "alert-on-metrics",
    "ServiceKind": "",
    "ServiceMeta": {},
    "ServiceName": "alert-on-metrics",
    "ServicePort": 8080,
    "ServiceProxy": {},
    "ServiceProxyDestination": "",
    "ServiceTags": [
      ""
    ],
    "ServiceWeights": {
      "Passing": 1,
      "Warning": 1
    },
    "TaggedAddresses": {
      "lan": "127.0.0.1",
      "wan": "127.0.0.1"
    }
  }
]
`)
}
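
The mock serves only the two Consul endpoints the service appears to need: the catalog entry for alert-on-metrics and the health checks for node 127.0.0.1. Once it is running locally, it can be exercised with the requests library from requirements.txt; the port below is an assumption (the mock's default):

import requests

BASE = "http://localhost:8500"  # assumed: the mock's default port

nodes = requests.get(BASE + "/v1/catalog/service/alert-on-metrics").json()
health = requests.get(BASE + "/v1/health/node/127.0.0.1").json()
print(nodes[0]["ServicePort"], health[0]["Status"])  # 8080 passing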

sandbox/isFiringRedis/config Executable file

@@ -0,0 +1,2 @@
save
appendonly no

sandbox/isFiringRedis/dump.rdb Executable file

Binary file not shown.

sandbox/isFiringRedis/main.py Executable file

@@ -0,0 +1,12 @@
def main(args) :
    import redis

    client = redis.Redis()
    k = "key"
    v = "value"
    # the key may not exist on a fresh instance, so guard the first read
    existing = client.get(k)
    print(existing.decode() if existing is not None else None)
    client.set(k, v)
    print(client.get(k).decode())


if __name__ == "__main__" :
    from sys import argv
    main(argv)

sleeper_agents_aom_engine/.gitignore vendored Executable file

@@ -0,0 +1,15 @@
# Created by .ignore support plugin (hsz.mobi)
### Vagrant template
.vagrant/
.idea/
build/results
logs/
*.pyc
.dockerignore
Dockerfile
build/builder
site-packages.tar.gz
alert_configs
AoM_Configs

Some files were not shown because too many files have changed in this diff.