cold
Agents (Executable file, 23 lines)
@@ -0,0 +1,23 @@
- Suppressor - New AoM field to declare dependencies' alerts - if any in dependencies' alerts are firing, then do not check
x AoM is not AoM - it's just configurations
- Each AoM is a process dedicated to looping over 1 alert
- Kills & recreates processes on config change (?)
- Oh my god monolithic functions
- Insert suppress at comment "send all alerts found to the alert handlers..."
- No unittests
- Seems no multiplicity
- serviceapp/service.py
- Floyd-Warshall create fully connected graph on boot/MR as CSV (see sketch below)
- Reporter - Slack bot to get graph/latest check by name
- Lookup AoM configs in Gitlab - fetch all on interval with PAT
- configs stored in docker image
- seem to be reloadable on MRs
- Execute query and return
- See nexpose for prometheus, kairos API
- matplotlib.pyplot
- last N values
- warning threshold
- critical threshold
- Visualizer - New AoM field to declare service name and dependent services' names - visible map of services as alerts firing and links between
- Hit uchiwa API for what's firing? How to handle silenced?
- Does AoM have an API for what's firing?
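The Floyd-Warshall note above presumably means precomputing reachability over the service-dependency graph, so a suppressor can tell whether any upstream dependency of a service is alerting. A minimal sketch of that idea in Python -- the function names and CSV layout are assumptions, not part of this commit:

```python
import csv

def transitive_closure(deps):
    """Floyd-Warshall-style closure: reach[a][b] is True when service a
    depends on service b directly or transitively."""
    nodes = sorted(set(deps) | {d for ds in deps.values() for d in ds})
    reach = {a: {b: b in deps.get(a, ()) for b in nodes} for a in nodes}
    for k in nodes:                      # classic O(n^3) triple loop
        for a in nodes:
            for b in nodes:
                reach[a][b] = reach[a][b] or (reach[a][k] and reach[k][b])
    return reach

def dump_csv(reach, path):
    """Persist the fully connected graph as CSV, one row per reachable pair."""
    with open(path, "w", newline="") as fh:
        writer = csv.writer(fh)
        writer.writerow(["service", "depends_on"])
        for a, row in reach.items():
            for b, connected in row.items():
                if connected:
                    writer.writerow([a, b])

# wired up with the dependencies declared in the alert configs below:
# captain -> core -> fuel
deps = {"captain": ["core"], "core": ["fuel"]}
closure = transitive_closure(deps)
assert closure["captain"]["fuel"]   # the transitive edge is discovered
dump_csv(closure, "service_graph.csv")
```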
AoM_Service/.gitignore (vendored, Executable file, 12 lines)
@@ -0,0 +1,12 @@
# Created by .ignore support plugin (hsz.mobi)
### Vagrant template
.vagrant/
.idea/
build/results
logs/

*.pyc
.dockerignore
Dockerfile
build/builder
site-packages.tar.gz
AoM_Service/.jenkins/JenkinsFile (Executable file, 67 lines)
@@ -0,0 +1,67 @@
#!/usr/bin/env groovy
pipeline {
    agent { label 'nomad-builder' }

    environment {
        DOCKER_HOST = '127.0.0.1:2375'
        WORKSPACE_PATH = "/var/lib/nomad/alloc/${NOMAD_ALLOC_ID}/${NOMAD_TASK_NAME}${WORKSPACE}"
    }
    stages {
        stage('Info') {
            steps {
                sh script: 'hostname'
                echo "WORKSPACE_PATH: $WORKSPACE_PATH"
            }
        }
        stage('Build') {
            steps {
                echo "No build required"
            }
        }
        stage('Test') {
            steps {
                echo "Test done during merge request"
                //sh script: 'cd build; ./test_changed.sh "${WORKSPACE_PATH}"'
            }
        }
        stage('Deploy') {
            steps {
                script {
                    if ("$GIT_BRANCH" == "origin/master") {
                        echo "Running publish script"
                        sh script: './publish.sh'
                        echo "Triggering Rundeck job"
                        script {
                            step([$class: 'RundeckNotifier', includeRundeckLogs: true, jobId: 'c5323400-0d97-4488-8cf2-1d736a5f7fb9', nodeFilters: '', options: '', rundeckInstance: 'team-rundeck -- techops', shouldFailTheBuild: true, shouldWaitForRundeckJob: true, tags: '', tailLog: false])
                        }
                    }
                    else {
                        echo "No deploy step required."
                    }
                }
            }
        }
    }
    post {
        success {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test passed, update commit with green checkbox
            }
            // Notify Eng Viz of successful build
            // slackSend color: 'good', message: "Passed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
        failure {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test failed, update commit status with red x
                error("Build failed, check ${BUILD_URL} for details.")
            }
            // On failure send an email to Eng Vis
            // (double quotes so ${BUILD_URL} and friends actually interpolate)
            mail body: "Please check ${BUILD_URL} for details.",
                 subject: "Jenkins job ${JOB_NAME} build #${BUILD_NUMBER} failed",
                 from: 'Jenkins',
                 to: 'eng-visibility@qualtrics.com'
            // Finally send a warning message to Eng Vis slack channel
            slackSend color: 'warning', message: "Failed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
    }
}
AoM_Service/.jenkins/JenkinsFileMR (Executable file, 58 lines)
@@ -0,0 +1,58 @@
#!/usr/bin/env groovy
pipeline {
    agent { label 'nomad-builder' }

    environment {
        DOCKER_HOST = '127.0.0.1:2375'
        WORKSPACE_PATH = "/var/lib/nomad/alloc/${NOMAD_ALLOC_ID}/${NOMAD_TASK_NAME}${WORKSPACE}"
    }
    stages {
        stage('Info') {
            steps {
                sh script: 'hostname'
                echo "WORKSPACE_PATH: $WORKSPACE_PATH"
            }
        }
        stage('Build') {
            steps {
                echo "Building AOM container"
                sh script: 'docker build . -t aom_test_container'
            }
        }
        stage('Test') {
            steps {
                echo "Launching container in test mode. It will take a few minutes."
                sh script: 'docker run -e TEST=true -h $(hostname) --add-host=\"telegraf:$(nslookup jenkins.eng.qops.net|grep Server | awk \'{print $2}\')\" aom_test_container'
                echo "Removing docker image and container"
                sh script: 'docker rmi -f aom_test_container'
            }
        }
        stage('Deploy') {
            steps {
                echo "No deploy step required for Merge Request"
            }
        }
    }
    post {
        success {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test passed, update commit with green checkbox
            }
            // Notify Eng Viz of successful build
            // slackSend color: 'good', message: "Passed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
        failure {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test failed, update commit status with red x
                error("Build failed, check ${BUILD_URL} for details.")
            }
            // On failure send an email to Eng Vis
            mail body: "Please check ${BUILD_URL} for details.",
                 subject: "Jenkins job ${JOB_NAME} build #${BUILD_NUMBER} failed",
                 from: 'Jenkins',
                 to: 'eng-visibility@qualtrics.com'
            // Finally send a warning message to Eng Vis slack channel
            // slackSend color: 'warning', message: "Failed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
    }
}
AoM_Service/AoM_Configs/.gitignore (vendored, Executable file, 12 lines)
@@ -0,0 +1,12 @@
# ignore alert configs starting with underscore -- we can create them while testing the webapp
# and not have to worry about them getting into the repo
alert_configs/_*.yaml

*.swp
.idea/
.vagrant/
__pycache__
logs/
venv/
.vscode/
AoM_Service/AoM_Configs/.jenkins/JenkinsFile (Executable file, 68 lines)
@@ -0,0 +1,68 @@
#!/usr/bin/env groovy
pipeline {
    agent { label 'nomad-builder' }

    environment {
        DOCKER_HOST = 'tcp://127.0.0.1:2375'
        WORKSPACE_PATH = "/var/lib/nomad/alloc/${NOMAD_ALLOC_ID}/${NOMAD_TASK_NAME}${WORKSPACE}"
    }
    stages {
        stage('Info') {
            steps {
                sh script: 'hostname'
                echo "WORKSPACE_PATH: $WORKSPACE_PATH"
            }
        }
        stage('Build') {
            steps {
                echo "No build required"
            }
        }
        stage('Test') {
            steps {
                echo "Test done already on merge request"
                //sh script: 'cd build; ./test_changed.sh "${WORKSPACE_PATH}"'
                // sh script: 'cd build; ./test_changed.sh'
            }
        }
        stage('Deploy') {
            steps {
                script {
                    if ("$GIT_BRANCH" == "origin/master") {
                        echo "Running publish script"
                        sh script: './publish.sh'
                        echo "Triggering Rundeck job"
                        script {
                            step([$class: 'RundeckNotifier', includeRundeckLogs: true, jobId: 'c1f0dd4e-89a0-411b-afbb-455421a2ba34', nodeFilters: '', options: '', rundeckInstance: 'team-rundeck -- techops', shouldFailTheBuild: true, shouldWaitForRundeckJob: true, tags: '', tailLog: false])
                        }
                    }
                    else {
                        echo "No deploy step required."
                    }
                }
            }
        }
    }
    post {
        success {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test passed, update commit with green checkbox
            }
            // Notify Eng Viz of successful build
            // slackSend color: 'good', message: "Passed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
        failure {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test failed, update commit status with red x
                error("Build failed, check ${BUILD_URL} for details.")
            }
            // On failure send an email to Eng Vis
            mail body: "Please check ${BUILD_URL} for details.",
                 subject: "Jenkins job ${JOB_NAME} build #${BUILD_NUMBER} failed",
                 from: 'Jenkins',
                 to: 'eng-visibility@qualtrics.com'
            // Finally send a warning message to Eng Vis slack channel
            slackSend color: 'warning', message: "Failed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
    }
}
AoM_Service/AoM_Configs/.jenkins/JenkinsFileMR (Executable file, 56 lines)
@@ -0,0 +1,56 @@
#!/usr/bin/env groovy
pipeline {
    agent { label 'nomad-builder' }

    environment {
        DOCKER_HOST = 'tcp://127.0.0.1:2375'
        WORKSPACE_PATH = "/var/lib/nomad/alloc/${NOMAD_ALLOC_ID}/${NOMAD_TASK_NAME}${WORKSPACE}"
    }
    stages {
        stage('Info') {
            steps {
                sh script: 'hostname'
                echo "WORKSPACE_PATH: $WORKSPACE_PATH"
            }
        }
        stage('Build') {
            steps {
                echo "No build required"
            }
        }
        stage('Test') {
            steps {
                echo "Running test"
                sh script: './test_changed.sh'
                sh script: 'python validate_yaml.py'
            }
        }
        stage('Deploy') {
            steps {
                echo "No deploy step required for Merge Request"
            }
        }
    }
    post {
        success {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test passed, update commit with green checkbox
            }
            // Notify Eng Viz of successful build
            // slackSend color: 'good', message: "Passed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
        failure {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test failed, update commit status with red x
                error("Build failed, check ${BUILD_URL} for details.")
            }
            // On failure send an email to Eng Vis
            mail body: "Please check ${BUILD_URL} for details.",
                 subject: "Jenkins job ${JOB_NAME} build #${BUILD_NUMBER} failed",
                 from: 'Jenkins',
                 to: 'eng-visibility@qualtrics.com'
            // Finally send a warning message to Eng Vis slack channel
            // slackSend color: 'warning', message: "Failed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
    }
}
AoM_Service/AoM_Configs/Dockerfile.webapp (Executable file, 16 lines)
@@ -0,0 +1,16 @@
FROM registry-app.eng.qops.net:5001/imported/alpine:3.9
MAINTAINER Engineering Visibility <eng-visibility@qualtrics.com>

COPY webapp_requirements.txt /
COPY run_webapp.sh /

RUN apk add --no-cache python3 curl
RUN apk add --no-cache --virtual .build-deps build-base python3-dev \
    && pip3 install --no-cache-dir --upgrade pip \
    && pip3 install --no-cache-dir --upgrade setuptools \
    && pip3 install --no-cache-dir --upgrade -r /webapp_requirements.txt \
    && apk del .build-deps \
    && rm -rf /var/cache/apk/*

CMD ["/run_webapp.sh"]
AoM_Service/AoM_Configs/README.md (Executable file, 236 lines)
@@ -0,0 +1,236 @@
# README

This is the new repository for the Alert On Metrics project configurations.

The Alert On Metrics (AOM) project lets you set up alerts that trigger based on tracking a metric value collected via [Metrics as a Service](https://odo.corp.qualtrics.com/wiki/index.php/Metrics_As_A_Service). You "track" your metric via a [KairosDB query](http://kairosdb-metrics.service.eng.consul:8080/) or a [Prometheus query](http://big-trickster.service.eng.consul:9090/graph), so you are not limited to raw metrics - you can sample based on the aggregators available in KairosDB to create new metric views, or use PromQL if you are using Prometheus. Typically people use min, max or count. All "tracked" metrics are rewritten to the metrics data store as a new metric *telgraf.aom_stats_value*, tagged by Alert-On-Metrics to show their origin.

You can trigger an alert based on any combination of the following:

- An upper critical threshold based on the value of a metric increasing
- An upper warning threshold based on the value of a metric increasing
- A lower critical threshold based on the value of a metric decreasing
- A lower warning threshold based on the value of a metric decreasing
- Any lower and upper threshold combined to create a 'band' (see the sketch below)
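For instance, a hypothetical band configuration. `critical_upper_threshold` appears in the alert_configs examples in this repo; the other three key names are assumptions, inferred by symmetry and from the upper/lower threshold flags in library/args.py:

```
critical_upper_threshold: 90.0   # page when the metric climbs above 90
warning_upper_threshold: 75.0    # warn a little earlier
warning_lower_threshold: 25.0    # warn when the metric sags
critical_lower_threshold: 10.0   # page when the metric drops below 10
```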
---

## Sensu and alert subdue. NEW!

Some changes have been introduced in the latest AOM versions. Alerts can now be sent through Sensu (email is not supported yet). Using Sensu also allows you to create check dependencies (`vo` is now `victorops` for Sensu).

```
alerts:
  sensu:
    victorops:
      'blackhole'
    slack:
      '#aom_test_channel'
    dependencies:
      - name_of_check1
      - name_of_check2
```

The filters option has also been enabled. It works the same way as in Hiera. If you only want to receive critical alerts through one channel, you can set "channel"_subdue to **true**. Example:

```
filters:
  slack_subdue: true
  victorops_subdue: false
```

You can make use of anything that the Sensu API supports. Anything you add to your configuration under sensu will be sent directly to the Sensu API.
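As a hypothetical illustration of that passthrough (`refresh` and `runbook` here are commonly used Sensu 1.x check attributes, not fields defined by AOM -- treat the exact names as assumptions):

```
alerts:
  sensu:
    slack:
      '#aom_test_channel'
    # extra keys are forwarded verbatim to the Sensu API
    refresh: 1800
    runbook: 'https://your-runbook-url'
```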
---

## Availability metric.

If you want to track how long your check stays in CRITICAL state over a given period of time, you can enable this feature by setting this option to true:

```
availability: true
```

This will start sending metrics constantly and recording the check output. You can then visualize this metric in the following [dashboard](https://grafana.eng.qops.net/d/5OsrZSdiz/aom-availability?orgId=1) (or you can create your own). To get a more accurate result, don't set the refresh interval lower than 60 seconds.
---

## Routing per tag value. NEW!

This feature allows you to configure different alert routing using the values of tags in your metric. For instance, let's say you want a different alert policy for beta, gamma and prod:

* *beta*: I want to alert my `#my-project-dev` channel
* *gamma*: I want to alert my `#my-project-gamma` channel
* *prod*: I want to alert my `#my-project` channel and page the on-call on VictorOps

We can use the `dc` tag available in the metric query, define specific configuration for beta and gamma, and use a default one for all other values (prod in this case). Everything is configured inside the `alerts` object in the yaml configuration. Instead of directly adding the alert configuration, add a `lookup` key. Inside, you have to provide three values:

* `default`: the alert policy to apply by default if we can't find a configuration for a specific combination of tags. The format is exactly the same as classic alerts (sensu, vo, slack, etc.).
* `tags`: the tags that will be used to look up the alert routing configuration. You can use more than one tag.
* `lookups`: an array, where each element specifies a combination of tag values and the routing to apply in that case.

Here is the configuration for our example:

```yaml
alerts:
  lookup:
    default:
      sensu:
        slack: my-project
        victorops: my-on-call-key
    tags:
      - dc
    lookups:
      -
        alert:
          sensu:
            slack: my-project-dev
        tags:
          dc: b1-prv
      -
        alert:
          sensu:
            slack: my-project-gamma
        tags:
          dc: g1-iad
```

You can move the `lookups` part into a separate file, so it can be reused across different AOM configurations. To do that, instead of a `lookups` key, provide a `lookup_file` with the filename, including the extension:

```yaml
alerts:
  lookup:
    default: ...
    lookup_file: my_lookup_file.yaml
    tags: ...
```

Save this file under the `alert_routing_lookup` folder. The syntax for the alert routing is the same as before, it is just in a different file:

```yaml
---
-
  alert:
    sensu:
      slack: my-project-dev
  tags:
    dc: b1-prv
-
  alert:
    sensu:
      slack: my-project-gamma
  tags:
    dc: g1-iad
```
---

## How do I register a new alert with AOM?

Alert configurations for AOM are just a KairosDB or Prometheus query specified in yaml format and wrapped in some controlling configuration that determines how frequently the query is executed, the thresholds, the occurrences, and where to route the alerts. We have built a small UI, packaged with the AOM gitlab project, that will help you generate a suitable yaml configuration. You can rehearse your queries on the [KairosDB UI](http://kairosdb-metrics.service.eng.consul:8080/) or at any Prometheus endpoint, and take a look at other examples in the alert_configs/ folder for help.

Follow the instructions below to launch the yaml generator UI on your local desktop and use it to generate a merge request (Docker is necessary).

1. Clone the project
2. cd into the project's directory
3. Run the script ./generate_config.sh
4. Once up, navigate in a browser to **localhost:80/**
5. Fill out the form and click generate
6. Hit **Ctrl+C** when you have the alert configuration
7. Submit the merge request in a new branch

---

This process starts a local webserver that provides a convenient interface for generating the yaml you need. Most of the fields have helpful info tips on what each value is and how it's used.
---

## Visualization tool [BETA]

Along with the project, a simple python script is provided to show what your metrics will look like and to help you set the thresholds. This tool requires python3 and some additional python3 modules:

1. yaml
2. json
3. requests
4. numpy
5. matplotlib

These modules should be easy to install using 'pip' or 'homebrew'.

Usage:
```python3 show_config.py alertname_without_yaml_extension [X]```

Where X is an optional parameter that defines the interval length you want to display (note the config name comes first, matching how show_config.py reads its arguments). It's a multiplier factor, set to 10 by default, that increases the start_relative (so you will see more datapoints).

The script should open a window showing the metrics along with the defined thresholds. If the query doesn't return any value, it will exit.
---

## How does my new alert get to production?

Once you submit a merge request, a Jenkins job will quickly validate your alert files, just checking that they contain all required fields and proper syntax. Setting up appropriate thresholds and alerting channels (VictorOps, email, Slack) is the user's responsibility.

If Jenkins returns a PASS result for the test, the new alert files will be merged into the master branch and a deploy job will be triggered (also from Jenkins). The AOM service actively looks for changes in the alert_configs folder and will pick up any changes (by default every 300 seconds).
## Helpful Tidbits

__IMPORTANT:__ The alert id field must be unique; it can be useful to run grep within the alert_configs directory to make sure your id is not already defined, for example:
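A quick duplicate-id check, mirroring what the CI does in test_changed.sh (substitute your proposed id):

```
grep "^id: my_new_alert_id" alert_configs/*.yaml
```

Any output means the id is already taken.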
Use the [UI](http://kairosdb-metrics.service.eng.consul:8080/) on the kairosdb box to help you generate / determine the proper query. Remember, you want to get the query down to just one or two entries per *group-by* so that the service can quickly iterate over it.

Once the request has been merged, you can check whether your query is being processed by [hitting the url](http://alert-on-metrics.service.eng.consul:8080/healthcheck?verbose=true).

You can also check out the [grafana dashboard](http://grafana-metrics.service.eng.consul:3000/dashboard/db/alert-on-metrics) that has the results of this service's queries and verify your alert metric is showing up regularly.

From KairosDB's docs: *You must specify either start_absolute or start_relative but not both. Similarly, you may specify either end_absolute or end_relative but not both. If either end time is not specified the current date and time is assumed.* We suggest using *end_relative* (greater than 1 minute), as this makes for steadier graphs (if you draw a graph up to *now*, some of the latest metrics could be missing, so the end of the graph will be lower than it should be).

We do not recommend using *align_sampling* and *align_start_time* (both false by default, so they can be skipped), as they might change the alignment of metrics and change graphs over time (*If more than one are set, unexpected results will occur*).

If you have any doubt about KairosDB's query metrics, you can take a look at their documentation [here](https://kairosdb.github.io/docs/build/html/restapi/QueryMetrics.html).
---

## The Gotchas

1. Alerts only fire when KairosDB returns a result. If your KairosDB metric query returns no results for X (currently 10) attempts, any active alerts will clear with a message explaining that AOM could not get any further results from KairosDB, so the user must manually verify RECOVERY. Earlier versions of AOM had no flap protection like this built in. Long term we will move alerting to Sensu, which has more advanced built-in flap protection. You can reduce flapping of results by building your Kairos query well. Please talk to engineering visibility for help with this.
2. Metrics are only collected every 60 seconds, so an interval set below that will automatically get bumped up to 60 seconds by the web based config generation. Match the interval to how often the metric is collected and measured.
3. The Email field only requires a list of names, not the @qualtrics bit, as it will only send to qualtrics addresses using the internal-smtp1-app.eng.qops.net box.
4. Email and Slack alerts fire once during an event. This way, if an outage were occurring, you wouldn't get flooded with emails and slack alerts the entire time.
5. Email and Slack alerts can be helpful to share with the team so they are aware of what is happening.
6. Email and Slack alerts can be helpful when trying to figure out your alerts before you VO stuff.
AoM_Service/AoM_Configs/alert_configs/engine.yaml (Executable file, 20 lines)
@@ -0,0 +1,20 @@
---
id: sleeper_agents_milleniumfalcon_engine_failing
service: core
alerts:
  slack:
    - '#breel_testing_alerts'
  vo:
    - gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
suppressed_occurrences_threshold: 24
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_engine_failing) by (dc)
tags:
  - dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
service_dependencies: ['fuel']
AoM_Service/AoM_Configs/alert_configs/fuel.yaml (Executable file, 18 lines)
@@ -0,0 +1,18 @@
---
id: sleeper_agents_milleniumfalcon_fuellevel_low
service: fuel
alerts:
  slack:
    - '#breel_testing_alerts'
  vo:
    - gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_fuellevel_low) by (dc)
tags:
  - dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
AoM_Service/AoM_Configs/alert_configs/lightspeed.yaml (Executable file, 20 lines)
@@ -0,0 +1,20 @@
---
id: sleeper_agents_milleniumfalcon_lightspeed_unavailable
service: captain
alerts:
  slack:
    - '#breel_testing_alerts'
  vo:
    - gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
suppressed_occurrences_threshold: 48
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_lightspeed_unavailable) by (dc)
tags:
  - dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
service_dependencies: ['core']
AoM_Service/AoM_Configs/alert_configs/shields.yaml (Executable file, 20 lines)
@@ -0,0 +1,20 @@
---
id: sleeper_agents_milleniumfalcon_shields_unavailable
service: core
alerts:
  slack:
    - '#breel_testing_alerts'
  vo:
    - gobs-mm
critical_upper_threshold: 1.0
interval: 5
suppressed_occurrences_threshold: 54
start_time: '-60'
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_shields_unavailable) by (dc)
tags:
  - dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
service_dependencies: ['fuel']
@@ -0,0 +1,8 @@
---
-
  alert:
    slack:
      - "public-api-deploy-tst"
  tags:
    canaryTest: transaction_import_distribution_1
    targetdc: fra1
@@ -0,0 +1,365 @@
---
-
  alert:
    sensu:
      slack: es-qe-alerts
  tags:
    brandId: aexpfeedback
-
  alert:
    sensu:
      slack: emea-alerts
      victorops: profserv-19
  tags:
    brandId: airbuswea
-
  alert:
    sensu:
      slack: es-alaskaair
  tags:
    brandId: alaskaair
-
  alert:
    sensu:
      slack: xmp-seattle-3
      victorops: xmp-seattle-3
  tags:
    brandId: amdocs
-
  alert:
    sensu:
      slack: xmp-sea-automations
  tags:
    brandId: americanairlines
-
  alert:
    sensu:
      slack: xmp-sea-automations
  tags:
    brandId: anz
-
  alert:
    sensu:
      slack: xmp-seattle-3
      victorops: xmp-seattle-3
  tags:
    brandId: arris
-
  alert:
    sensu:
      slack: emea-alerts
      victorops: profserv-19
  tags:
    brandId: baincx
-
  alert:
    sensu:
      slack: xmp-sea-automations
  tags:
    brandId: bmocx
-
  alert:
    sensu:
      slack: es-qe-alerts
      victorops: es-bmw-marriott
  tags:
    brandId: bmwgroupne
-
  alert:
    sensu:
      slack: es-qe-alerts
      victorops: es-bmw-marriott
  tags:
    brandId: bmwgroupnest3
-
  alert:
    sensu:
      slack: es-qe-alerts
      victorops: es-bmw-marriott
  tags:
    brandId: bmwjapan
-
  alert:
    sensu:
      slack: es-qe-alerts
      victorops: es-bmw-marriott
  tags:
    brandId: bmwjapanst3
-
  alert:
    sensu:
      slack: es-qe-alerts
      victorops: es-bmw-marriott
  tags:
    brandId: bmwna
-
  alert:
    sensu:
      slack: es-qe-alerts
      victorops: es-bmw-marriott
  tags:
    brandId: bmwnast3
-
  alert:
    sensu:
      slack: es-qe-alerts
      victorops: es-bmw-marriott
  tags:
    brandId: bmwvertriebsgmbh
-
  alert:
    sensu:
      slack: es-qe-alerts
      victorops: es-bmw-marriott
  tags:
    brandId: bmwvertriebsgmbhst3
-
  alert:
    sensu:
      slack: caterpillar
      victorops: profserv-14
  tags:
    brandId: catcustomerinsights
-
  alert:
    sensu:
      slack: century-link
      victorops: xmp-seattle-4
  tags:
    brandId: centurylink
-
  alert:
    sensu:
      slack: xmp-seattle-4
      victorops: xmp-seattle-4
  tags:
    brandId: ciscoengineering
-
  alert:
    sensu:
      slack: es-alerts
      victorops: profserv
  tags:
    brandId: clientdashboards
-
  alert:
    sensu:
      slack: es-alerts
      victorops: profserv
  tags:
    brandId: cms
-
  alert:
    sensu:
      slack: TODO
  tags:
    brandId: cocacolaperform
-
  alert:
    sensu:
      slack: dish
  tags:
    brandId: dishvoc
-
  alert:
    sensu:
      slack: es-alerts
  tags:
    brandId: dowcorning
-
  alert:
    sensu:
      slack: es-alerts
      victorops: profserv
  tags:
    brandId: drtoddhall
-
  alert:
    sensu:
      slack: es-gs-compare
      victorops: xmp-seattle-3
  tags:
    brandId: goldmansachs
-
  alert:
    sensu:
      slack: xmp-sea-automations
  tags:
    brandId: harvard
-
  alert:
    sensu:
      slack: es-alerts
      victorops: profserv
  tags:
    brandId: ibm
-
  alert:
    sensu:
      slack: xmp-seattle-3
      victorops: xmp-seattle-3
  tags:
    brandId: jcibuildings
-
  alert:
    sensu:
      slack: xmp-seattle-3
      victorops: xmp-seattle-3
  tags:
    brandId: johnsoncontrols2
-
  alert:
    sensu:
      slack: es-alerts
      victorops: profserv
  tags:
    brandId: kubota
-
  alert:
    sensu:
      slack: liberty-mutual
  tags:
    brandId: libertymutualvoc
-
  alert:
    sensu:
      slack: es-qe-alerts
      victorops: es-bmw-marriott
  tags:
    brandId: marriottvacationclub
-
  alert:
    sensu:
      slack: es-alerts
  tags:
    brandId: mastercard
-
  alert:
    sensu:
      slack: xmp-seattle-4
      victorops: xmp-seattle-4
  tags:
    brandId: nielsenapac
-
  alert:
    sensu:
      slack: TODO
  tags:
    brandId: optumrx
-
  alert:
    sensu:
      slack: xmp-seattle-4
      victorops: xmp-seattle-4
  tags:
    brandId: nielsenscarborough
-
  alert:
    sensu:
      slack: xmp-seattle-3
      victorops: xmp-seattle-3
  tags:
    brandId: rogers
-
  alert:
    sensu:
      slack: es-alerts
  tags:
    brandId: samsungeurope
-
  alert:
    sensu:
      slack: emea-alerts
      victorops: profserv-19
  tags:
    brandId: telenorreporting
-
  alert:
    sensu:
      slack: es-alerts
      victorops: profserv
  tags:
    brandId: thermoking
-
  alert:
    sensu:
      slack: philips-es
  tags:
    brandId: tnsnipophilips
-
  alert:
    sensu:
      slack: travelers_coord
      victorops: profserv-14
  tags:
    brandId: travelers
-
  alert:
    sensu:
      slack: xmp-sea-automations
  tags:
    brandId: uhcdr
-
  alert:
    sensu:
      slack: xmp-sea-automations
  tags:
    brandId: uhcmr
-
  alert:
    sensu:
      slack: xmp-sea-automations
  tags:
    brandId: uhcgm
-
  alert:
    sensu:
      slack: TODO
  tags:
    brandId: uhg
-
  alert:
    sensu:
      slack: xmp-sea-automations
  tags:
    brandId: uhg1
-
  alert:
    sensu:
      slack: es-alerts
      victorops: profserv
  tags:
    brandId: underarmour
-
  alert:
    sensu:
      slack: es-alerts
      victorops: profserv
  tags:
    brandId: unum
-
  alert:
    sensu:
      slack: TODO
  tags:
    brandId: usaast3
-
  alert:
    sensu:
      slack: xmp-sea-automations
  tags:
    brandId: usbank
-
  alert:
    sensu:
      slack: es-alerts
      victorops: profserv
  tags:
    brandId: uscd
-
  alert:
    sensu:
      slack: xmp-seattle-3
      victorops: xmp-seattle-3
  tags:
    brandId: walkersandbox
AoM_Service/AoM_Configs/aom_webapp.py (Executable file, 30 lines)
@@ -0,0 +1,30 @@
#! /usr/bin/python3
# aom_builder.py
# The point of the builder is to generate a valid yaml config that can be read in to the main app
# by asking clarifying questions on what to check and how to alert on it.
# This comes down to 4 questions:
#   When to query
#   What to query for
#   What's an alert
#   Who to alert

from webapp import app
from library.logger import AlertLogging
from library.args import get_builder_args

log = AlertLogging('aom')
log.start()
log.start_log_file("logs/aom_builder.log")


if __name__ == "__main__":
    # GET ARGS AND START LOGGING
    args = get_builder_args()
    # logger.init("logs/aom_builder.log", args['log_level'])
    # aom_logger = logging.getLogger(__name__)
    log.info("Logger Initialized")
    # ENABLE SESSIONS TO KEEP YAML FILE STATE BETWEEN PAGES
    log.info("Starting webapp")
    app.run(host='localhost', port=args['port'], debug=True)
AoM_Service/AoM_Configs/generate_config.sh (Executable file, 16 lines)
@@ -0,0 +1,16 @@
#!/bin/bash

trap ctrl_c INT

function ctrl_c() {
    docker stop aom_web
    docker ps -a | awk '{ print $1,$2 }' | grep aom_web | awk '{print $1 }' | xargs -I {} docker rm {}
}

docker build -f Dockerfile.webapp -t aom_web . && \
docker run -d -v$(pwd):/web -p80:5000 --name aom_web aom_web && \
docker logs -f aom_web
AoM_Service/AoM_Configs/library/__init__.py (Executable file, 0 lines)
AoM_Service/AoM_Configs/library/args.py (Executable file, 84 lines)
@@ -0,0 +1,84 @@
# Contains the arg parser options.


import argparse
import sys


def get_builder_args():
    """
    Gets the arguments passed in to the aom_builder main call

    :return: dict of parsed arguments
    """
    parser = argparse.ArgumentParser(description="Generates a valid yaml file for alerting on metrics. "
                                                 "If you are familiar with the yaml structure for an alert "
                                                 "you don't have to use this builder, it's just convenient")
    parser.add_argument('-q', '--query', help="The KairosDB query string to use")
    parser.add_argument('-i', '--interval', type=int, default=60,
                        help="The interval that the check will run. This value is in seconds")
    parser.add_argument('-t', '--threshold', '--upperthreshold',
                        help="The upper threshold is the value that when reached will cause an alert "
                             "depending on the threshold logic. "
                             "Use in conjunction with lower threshold to define a normal band.")
    parser.add_argument('-b', '--lowerthreshold',
                        help="The lower threshold is the value that when reached will cause an alert "
                             "depending on the threshold logic. "
                             "Use in conjunction with upper threshold to define a normal band.")
    parser.add_argument('-m', '--measure', choices=['gt', 'lt', 'eq'],
                        help="The measure to use to compare the threshold to the values of the alerts")
    parser.add_argument('-a', '--alert_config', help='A valid Yaml representation of your alerting block')
    parser.add_argument('-l', '--log_level', type=int, default=0,
                        help="The log level for the aom_builder run. [0=Error, 1=Info, 2=Debug]")
    parser.add_argument('-p', '--port', type=int, default=8080, help="The port to run the webapp on")

    return args_to_dict(parser)


def get_tester_service_args():
    """
    Gets arguments passed into aom_tester.py

    Returns: dict of parsed arguments
    """
    parser = argparse.ArgumentParser(description="Parameters to start the alerting on metrics dummy tester service")
    parser.add_argument('-l', '--log_level', type=int, default=0,
                        help="The log level for the aom_service app [0=Error, 1=Info, 2=Debug]")
    parser.add_argument('-a', '--alert_configs', default=None,
                        help="If provided, will override the folder location read from the config with the value "
                             "passed in. Helpful for testing and troubleshooting alerts")
    parser.add_argument('--hostname', help="If provided, will override the actual hostname check with this value")
    parser.add_argument('-p', '--port', type=int, default=8080, help="The port to run the webapp on")
    return args_to_dict(parser)


def get_service_args():
    """
    Gets arguments passed into aom_service.py

    Returns: dict of parsed arguments
    """
    parser = argparse.ArgumentParser(description="Parameters to start the alerting on metrics service")
    parser.add_argument('-l', '--log_level', type=int, default=0,
                        help="The log level for the aom_service app [0=Error, 1=Info, 2=Debug]")
    parser.add_argument('-a', '--alert_configs', default=None,
                        help="If provided, will override the folder location read from the config with the value "
                             "passed in. Helpful for testing and troubleshooting alerts")
    parser.add_argument('-o', '--override', action='store_true', help="Overrides the check leader election value")
    parser.add_argument('--hostname', help="If provided, will override the actual hostname check with this value")
    parser.add_argument('-p', '--port', type=int, default=8080, help="The port to run the webapp on")
    return args_to_dict(parser)


def args_to_dict(parsed_args):
    """
    Converts the argument parser object to a dict

    Args:
        parsed_args: Arg parser object
    Returns:
        Dictionary of arguments
    """
    try:
        arg_list = parsed_args.parse_args()
        # RETURN A DICT OF ARGUMENTS
        arg_dict = dict()
        for val in vars(arg_list):
            arg_dict[val] = getattr(arg_list, val)
        return arg_dict
    except argparse.ArgumentError:
        parsed_args.print_help()
        sys.exit(1)
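A quick sketch of how these helpers are consumed (hypothetical invocation; `args_to_dict` flattens the parsed namespace into a plain dict, which is why callers index with strings):

```python
# e.g. run as: ./aom_webapp.py -p 5000 -l 1
from library.args import get_builder_args

args = get_builder_args()
# args is a plain dict such as {'query': None, 'interval': 60, ..., 'port': 5000}
print(args['port'], args['log_level'])
```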
AoM_Service/AoM_Configs/library/config.py (Executable file, 22 lines)
@@ -0,0 +1,22 @@
# config.py
import logging
import glob
import yaml

logger = logging.getLogger(__name__)


def glob_the_configs(config_path):
    """
    Args:
        config_path (string): relative path to the configs
    Returns:
        List of configs
    """
    alert_list = []
    for config_file in glob.glob(config_path + "/*.yaml"):
        logger.debug("Found {} config".format(config_file))
        # LOAD CONFIG
        alert_list.append(yaml.load(open(config_file, 'rb').read()))
    logger.info("Loaded {} configs".format(len(alert_list)))
    return alert_list
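A minimal usage sketch for this helper, run from the repo root (every alert config in this commit carries an `id` and an `interval`):

```python
from library.config import glob_the_configs

configs = glob_the_configs("alert_configs")
for cfg in configs:
    print(cfg["id"], cfg["interval"])
```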
AoM_Service/AoM_Configs/library/logger.py (Executable file, 118 lines)
@@ -0,0 +1,118 @@
# logger.py


import logging
import logging.handlers
import os

logging.getLogger('requests').setLevel(logging.ERROR)
logging.getLogger('urllib3').setLevel(logging.ERROR)
logging.getLogger('werkzeug').setLevel(logging.ERROR)


class SingleLevelFilter(logging.Filter):
    def __init__(self, passlevel, reject):
        """
        Initializer (constructor) of the SingleLevelFilter
        @param passlevel (int) - the int value of the level of the log
        @param reject (bool) - if true, records whose level equals passlevel are rejected
        @return SingleLevelFilter object
        @note Sets some object parameters
        """
        self.passlevel = passlevel
        self.reject = reject

    def filter(self, record):
        """
        Returns True/False depending on parameters
        @param record - the log record being filtered
        @return bool - True/False depending on what self.reject is set to and what record.levelno and self.passlevel are set to
        @note This causes either only records of exactly this level to get logged, or only records of any other level to get logged
        """
        if self.reject:
            return (record.levelno != self.passlevel)
        else:
            return (record.levelno == self.passlevel)


class AlertLogging(logging.Logger):
    """
    Class to handle the logging of the alert on metrics service.
    Starts at Error level and can flip on (and add) an additional log file and
    Debug logger as needed.
    """

    def __init__(self, name):
        """
        Inits the formatters and logger
        """
        self.name = name
        self.debug_formatter = logging.Formatter(
            "%(asctime)s - [%(levelname)s] - [%(module)s:%(lineno)d] - %(message)s", "%m-%d %H:%M:%S")

        self.standard_formatter = logging.Formatter("%(asctime)s - [%(levelname)s] - %(message)s",
                                                    "%m-%d %H:%M:%S")
        logging.getLogger()
        logging.Logger.__init__(self, name, logging.DEBUG)
        logging.setLoggerClass(AlertLogging)

    def start(self):
        """
        Attaches an INFO-level stream handler.

        Returns:
            self, so calls can be chained
        """
        info_handler = logging.StreamHandler()
        info_handler.setLevel(logging.INFO)
        info_handler.setFormatter(self.standard_formatter)
        self.addHandler(info_handler)
        return self

    def start_log_file(self, file_path, mode='a'):
        """
        Creates a separate log file handler
        Args:
            file_path: path to the log file
            mode: the mode to open the file handler with
        """
        self.log_path = file_path
        work_folder = os.path.dirname(file_path)
        if len(work_folder) > 0 and not os.path.exists(work_folder):
            os.makedirs(work_folder)
        self.log_handler = logging.FileHandler(file_path, mode)
        self.log_handler.setLevel(logging.DEBUG)
        self.log_handler.setFormatter(self.debug_formatter)
        self.addHandler(self.log_handler)

    def stop_log_file(self):
        """
        Closes the log file and sets the handler to None
        """
        self.log_handler.close()
        self.removeHandler(self.log_handler)
        self.log_handler = None

    def start_debug(self):
        """
        Attaches a stream handler that emits only DEBUG records.
        """
        self.debug_handler = logging.StreamHandler()
        self.debug_handler.setLevel(logging.DEBUG)
        self.debug_handler.addFilter(SingleLevelFilter(logging.DEBUG, False))
        self.debug_handler.setFormatter(self.debug_formatter)
        self.addHandler(self.debug_handler)

    def stop_debug(self):
        """
        Stops the debug handler
        """
        self.removeHandler(self.debug_handler)
        self.debug_handler = None
AoM_Service/AoM_Configs/publish.sh (Executable file, 42 lines)
@@ -0,0 +1,42 @@
#!/bin/bash

GIT_COMMIT=$(git rev-parse HEAD)

if [[ $GIT_COMMIT == "" ]]; then
    echo "--Missing required GIT_COMMIT var. Aborting..."
    exit 1
fi

# Setup useful vars
team="engvis"
app="alert-on-metrics-configs"

registryV2="registry-app.eng.qops.net:5001"
pathV2="${registryV2}/${team}/${app}"
commitV2="${pathV2}:${GIT_COMMIT}"
latestV2="${pathV2}:latest"

# In case you use relative paths
DIR=$(cd $(dirname ${BASH_SOURCE[0]}) && pwd)
cd $DIR

echo "--Publishing $app $GIT_COMMIT"

echo "--Removing old image, so they don't accumulate"
docker rmi $latestV2

# Now fail if anything doesn't work
set -e

if [ -f $app/build.sh ]
then
    echo "--Running pre build steps"
    $app/build.sh
fi

docker build --pull=true --tag="$commitV2" --tag "$latestV2" .

echo "--Publishing app container"

docker push $commitV2
docker push $latestV2
AoM_Service/AoM_Configs/run.sh (Executable file, 6 lines)
@@ -0,0 +1,6 @@
#!/bin/sh

rsync -a --delete /alert_configs/ /mountpoint/configs/git/
rsync -a --delete /alert_routing_lookup/ /mountpoint/alert_routing_lookup/

ls -l /mountpoint/configs/git/
AoM_Service/AoM_Configs/run_webapp.sh (Executable file, 5 lines)
@@ -0,0 +1,5 @@
#!/bin/ash
export FLASK_APP=/web/aom_webapp.py
export FLASK_DEBUG=1

cd /web; flask run --host=0.0.0.0
AoM_Service/AoM_Configs/service.yaml (Executable file, 25 lines)
@@ -0,0 +1,25 @@
#=======================#
# All them URLS and tokens
#=======================#
kairosdb_url: "http://kairosdb-metrics.service.eng.consul:8080/"
victorops_url: "https://alert.victorops.com/integrations/generic/20131114/alert/07f108fe-9183-45c3-a888-19e1432806c5/"
slack_url: "https://slack.com/api/chat.postMessage"
slack_token: "xoxb-76976722775-WY6vtKAk0SQEb8qcbFkLMV81"
smtp_server: "internal-smtp1-app.eng.qops.net:2525"
consul_url: "http://consul1-app.eng.qops.net:8500/v1/kv/service/alert-on-metrics/leader-lock"
sensu_endpoint: "https://sensu-api.eng.qops.net:443/results"

#=======================#
# Logging Information
#=======================#
log_path: "logs/aom_service.log"

#=======================#
# alerts folder
#=======================#
alert_folder: "alert_configs"

#=======================#
# request timeout value
#=======================#
timeout: 90
AoM_Service/AoM_Configs/show_config.py (Executable file, 104 lines)
@@ -0,0 +1,104 @@
import glob
import yaml
import json
import os
import sys
import time
import re
import requests
import numpy
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime
import random
import warnings

warnings.filterwarnings("ignore")
# from pdb import set_trace as bp

timeout = 180

# if no argument print help and exit
if len(sys.argv) == 1:
    print("You need to specify an alert config file.")
    exit(1)

config_file = 'alert_configs/' + sys.argv[1] + '.yaml'

# test file exists or exit
if not os.path.isfile(config_file):
    print("No such config file: {}".format(config_file))
    exit(1)

alert_config = yaml.safe_load(open(config_file, 'rb').read())

# We will show 10 intervals by default
if len(sys.argv) == 3:
    interval = int(sys.argv[2])
else:
    interval = 10
alert_config['query']['start_relative']['value'] = str(int(alert_config['query']['start_relative']['value']) * interval)

kairosdb_url = "http://kairosdb-metrics.service.eng.consul:8080/"
query_url = kairosdb_url + "api/v1/datapoints/query"

# 'timeout' must be passed by keyword; a bare third positional argument
# would land in requests.post()'s 'json' parameter instead.
ret = requests.post(query_url, data=json.dumps(alert_config['query']), timeout=timeout)
results = ret.json()['queries'][0]['results']

# Transforming to human readable data
for result in results:
    for value in result['values']:
        # transform date from epoch to human readable format
        value[0] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(value[0] / 1000))
        # transform date string to datetime object
        value[0] = datetime.datetime.strptime(value[0], '%Y-%m-%d %H:%M:%S')
    series = numpy.array(result['values'])
    label_str = str(result['group_by'][0].get('group', ''))
    line_color = tuple(numpy.random.random(size=3))
    plt.plot_date(series[:, 0], series[:, 1], marker='.', color=line_color, linestyle='-', label=label_str)

formatter = mdates.DateFormatter('%H:%M:%S')

ax = plt.subplot()
# ax.set_xlabel('TIME')
# ax.set_ylabel('VALUE')
ax.xaxis.set_major_formatter(formatter)

plt.title(sys.argv[1])
plt.legend()

# Thresholds are any top-level config keys ending in _threshold, except
# occurrences_threshold, which is a count rather than a metric value.
myRe = re.compile('^(?!occurrences).*_threshold$')
# Adding thresholds to the graph
for key in alert_config:
    if myRe.match(key):
        plt.axhline(y=float(alert_config[key]), color='r', linestyle='--', label=str(key))
        plt.text(series[0][0], float(alert_config[key]), key)

plt.gcf().autofmt_xdate(rotation=25)
plt.show()
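For reference, a minimal standalone sketch of the KairosDB datapoints call the script makes; the metric name and tag values below are hypothetical placeholders:

import json
import requests

# Hypothetical query payload; in show_config.py this comes from the alert
# config's 'query' field, with start_relative.value scaled by the interval
# multiplier.
query = {
    "start_relative": {"value": "5", "unit": "minutes"},
    "metrics": [{"name": "example.metric", "tags": {"dc": ["example-dc"]}}],
}

ret = requests.post(
    "http://kairosdb-metrics.service.eng.consul:8080/api/v1/datapoints/query",
    data=json.dumps(query),
    timeout=180,
)
for result in ret.json()["queries"][0]["results"]:
    print(result["values"])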
30
AoM_Service/AoM_Configs/test_changed.sh
Executable file
@@ -0,0 +1,30 @@
#!/bin/bash
set -x

if [ -z "$GIT_COMMIT" ]; then
    echo "Expected env var 'GIT_COMMIT' to be set. Exiting..."
    exit 1
fi

echo "Check that only alert configs are being pushed"
echo "$PWD"

for file in $(git diff-tree -r --name-only ${GIT_COMMIT}^1 ${GIT_COMMIT}); do
    new_id=$(grep '^id:' "$file")
    if [ ! -z "$new_id" ]; then
        total_id=$(grep "$new_id" alert_configs/*.yaml | wc -l)
        if [ $total_id -gt 1 ] ; then
            echo "Duplicated id found! Please update the id of the alert configuration"
            exit 1
        fi
    fi
    dir=$(dirname ${file})
    # alert_configs/ change triggers a test of the new or changed alert configs
    if [ "$dir" == "alert_configs" ] || [ "$dir" == "alert_routing_lookup" ] ; then
        echo "Good to merge"
        exit 0
    else
        echo "Only automatic merges allowed for alert config files"
        exit 1
    fi
done
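A rough Python equivalent of the duplicate-id scan above, for clarity; paths are assumed relative to the repo root:

import glob
import re

# Count how many configs declare each top-level "id:"; any count above 1
# is the "Duplicated id" case the shell script rejects.
id_re = re.compile(r"^id:\s*(\S+)", re.MULTILINE)
counts = {}
for path in glob.glob("alert_configs/*.yaml"):
    with open(path) as f:
        for alert_id in id_re.findall(f.read()):
            counts[alert_id] = counts.get(alert_id, 0) + 1

duplicates = sorted(i for i, n in counts.items() if n > 1)
if duplicates:
    raise SystemExit("Duplicated id(s): " + ", ".join(duplicates))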
60
AoM_Service/AoM_Configs/validate_yaml.py
Executable file
@@ -0,0 +1,60 @@
import yaml
import glob

if __name__ == "__main__":
    alert_list = []
    bad_alert_list = []
    print("Collecting all yaml configs")
    # COLLECT CONFIG FILES
    for config_file in glob.glob("./alert_configs/*.yaml"):
        print("Found {} config".format(config_file))
        alert_list.append(config_file)
    print("Validating all yaml configs")
    # PARSE CONFIG FILES AND VALIDATE THEIR VALUES
    for alert in alert_list:
        print("Validating file {}".format(alert))
        try:
            config = yaml.safe_load(open(alert, 'rb').read())
            assert len(config['alerts']) > 0, "No Alerts configured, this is a dead config"
            assert len(config['query']) > 0, "No Query, this is a dead config"
            assert config['interval'] >= 30, "Intervals less than 30 are invalid"
            assert len(config['id']) > 0, "Alert ID is empty, this is a dead config"
            if config.get('query_type') == 'prometheus':
                assert type(config['query']) is str, "Invalid Prometheus query"
                assert "$" not in config['query'], "Prometheus query should not contain variables"
            else:
                assert type(config['query']) is dict, "Kairosdb Query string cannot be validated as proper JSON"
                defined_tags = set(config['query']['metrics'][0]['tags'].keys()).union({'', 'dc', 'fqdn'})
                # IF THERE IS AGGREGATION WE HAVE TO ADD THESE TAGS
                if 'group_by' in config['query']['metrics'][0]:
                    defined_tags.update(set(config['query']['metrics'][0]['group_by'][0]['tags']))
                # for undefined_tag in set(config['tags']).difference(defined_tags):
                #     print("WARNING! {} tag is not defined on the query. Please make sure it does exist to "
                #           "prevent empty results".format(undefined_tag))
            # OUR MINIMUM THRESHOLD NEED
            assert 'critical_lower_threshold' in config or 'critical_upper_threshold' in config or \
                'warning_lower_threshold' in config or 'warning_upper_threshold' in config, \
                "Config must have at least one threshold set."

            # JUST MAKE SURE YOU ARE NOT DOING SOMETHING STUPID WITH WARNING COMING AFTER CRITICAL
            if 'warning_lower_threshold' in config and 'critical_lower_threshold' in config:
                assert config['critical_lower_threshold'] < config['warning_lower_threshold'], \
                    "Lower Critical must be less than Lower Warning"
            if 'warning_upper_threshold' in config and 'critical_upper_threshold' in config:
                assert config['critical_upper_threshold'] > config['warning_upper_threshold'], \
                    "Upper Critical must be greater than Upper Warning"

            if 'occurrences_threshold' in config:
                assert config['occurrences_threshold'] >= 1, \
                    "Having an occurrences value less than 2 is assumed and pointless to specify"
        except Exception as e:
            print("Invalid config file: {}\n{}".format(alert, str(e)))
            bad_alert_list.append("{}\n{}".format(alert, str(e)))
    # WRITE OUT BAD CONFIGS TO THE RESULTS FILE
    # with open("./results/test_results.log", "w+") as f:
    #     for alert in bad_alert_list:
    #         f.write("Config is bad: {}".format(alert.replace('\n', ' ')))
    for alert in bad_alert_list:
        print("Config is bad: {}".format(alert.replace('\n', ' ')))
    if bad_alert_list:
        exit(1)
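For reference, a minimal sketch of a KairosDB-style alert config that would pass the checks above; every name and value here is a hypothetical placeholder:

import yaml

# Hypothetical config dict mirroring the fields validate_yaml.py asserts on;
# dump it to alert_configs/example_alert.yaml to try the validator.
example = {
    "id": "example_alert",
    "interval": 60,
    "alerts": {"email": ["devops"]},
    "critical_upper_threshold": 100.0,
    "query": {
        "start_relative": {"value": "5", "unit": "minutes"},
        "metrics": [{"name": "example.metric", "tags": {"dc": ["example-dc"]}}],
    },
}
print(yaml.dump(example, default_flow_style=False, explicit_start=True))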
7
AoM_Service/AoM_Configs/webapp/__init__.py
Executable file
@@ -0,0 +1,7 @@
from flask import Flask, render_template, request, session

app = Flask(__name__)
app.config['SESSION_TYPE'] = 'filesystem'
app.config['SECRET_KEY'] = 'super secret key'

import webapp.views
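A minimal sketch for running the builder UI locally; the entry-point file name is hypothetical, and it assumes the file sits next to the webapp/ package:

# run.py -- hypothetical local entry point for the Flask app defined above.
from webapp import app

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000, debug=True)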
139
AoM_Service/AoM_Configs/webapp/render.py
Executable file
@@ -0,0 +1,139 @@
import yaml
import os
import json
import traceback
import sys
from library.logger import AlertLogging

logger = AlertLogging('aom')
logger.start()


def render_config(config):
    """
    Reads in the config dict and renders to file. config usually from web interface
    Args:
        config: The config to use to generate the yaml file
    Returns:
        Tuple: (0, the yaml as a string) on success, or (1, the error message) on failure
    """
    try:
        # GET THE NAME OF THE FILE FROM THE CONFIG
        file_name = ''.join([config['alert_name'], '.yaml'])
        logger.debug("Filename: {}".format(file_name))
        # THIS SHOULD BE A PARAMETER PASSED IN
        file_path = os.path.join('alert_configs', file_name)
        logger.debug("Full path: {}".format(file_path))
        # SANITIZE THE CONFIG TO A NEW OBJECT
        # SET THE INTERVAL TO A LOWEST VALUE OF 30 SECONDS
        yaml_config = {'alerts': {},
                       'id': config['alert_name'],
                       'interval': 30 if int(config['interval']) < 30 else int(config['interval'])}
        # SPLIT THE ALERTS INTO A LIST
        if 'vo' in config:
            yaml_config['alerts']['vo'] = [x for x in config['vo_list'].split(',') if x]
        if 'email' in config:
            yaml_config['alerts']['email'] = [x for x in config['email_list'].split(',') if x]
        if 'slack' in config:
            yaml_config['alerts']['slack'] = [x for x in config['slack_list'].split(',') if x]
        # GET THRESHOLDS AS FLOATS
        # legacy single-threshold field names map onto the upper thresholds
        if 'critical_threshold' in config:
            if config['critical_threshold'] != "":
                yaml_config['critical_upper_threshold'] = float(config['critical_threshold'])
        if 'critical_upper_threshold' in config:
            if config['critical_upper_threshold'] != "":
                yaml_config['critical_upper_threshold'] = float(config['critical_upper_threshold'])
        if 'warning_threshold' in config:
            yaml_config['warning_upper_threshold'] = float(config['warning_threshold'])
        if 'warning_upper_threshold' in config:
            yaml_config['warning_upper_threshold'] = float(config['warning_upper_threshold'])
        if 'critical_lower_threshold' in config:
            if config['critical_lower_threshold'] != "":
                yaml_config['critical_lower_threshold'] = float(config['critical_lower_threshold'])
        if 'warning_lower_threshold' in config:
            yaml_config['warning_lower_threshold'] = float(config['warning_lower_threshold'])
        if 'occurrences' in config:
            yaml_config['occurrences_threshold'] = int(config['occurrences_threshold'])
        # PARSE THE QUERY OUT INTO A DICT OBJECT
        if config['prometheus_query']:
            yaml_config['query_type'] = 'prometheus'
            yaml_config['prometheus_url'] = config['prometheus_url']
            yaml_config['query'] = config['prometheus_query']
            yaml_config['start_time'] = config['start_time']
            yaml_config['end_time'] = config['end_time']
        else:
            yaml_config['query_type'] = 'kairosdb'
            yaml_config['query'] = json.loads(config['kairosdb_query'])
        # GET THE TAGS, COMMA SEPARATED
        tags = config['tags'].split(',')
        yaml_config['tags'] = [x for x in tags if x]
        # GET THE URL
        yaml_config['url'] = config['url']
        # WRITE TO FILE
        yaml_str = yaml.dump(yaml_config, default_flow_style=False, explicit_start=True)
        with open(file_path, 'w') as f:
            f.write(yaml_str)
        return 0, yaml_str
    except json.decoder.JSONDecodeError:
        return 1, "Query string is not valid json: {}".format(traceback.format_exc())

    except Exception as e:
        logger.error("Unable to render yaml config file to disk")
        _, _, ex_traceback = sys.exc_info()
        return 1, render_traceback(e, ex_traceback)


def render_yaml(alert_id):
    """
    Reads in a yaml file into the config that the web expects.
    Args:
        alert_id: the name of the config
    Returns:
        Dictionary
    """
    file_name = ''.join([alert_id, '.yaml'])
    file_path = os.path.join('alert_configs', file_name)
    config = yaml.safe_load(open(file_path, 'r').read())
    yaml_config = dict()
    yaml_config['alert_name'] = config['id']
    yaml_config['interval'] = config['interval']
    if 'critical_threshold' in config:
        yaml_config['critical_upper_threshold'] = config['critical_threshold']
    if 'critical_upper_threshold' in config:
        yaml_config['critical_upper_threshold'] = config['critical_upper_threshold']
    if 'critical_lower_threshold' in config:
        yaml_config['critical_lower_threshold'] = config['critical_lower_threshold']
    if 'warning_threshold' in config:
        yaml_config['warning_upper_threshold'] = config['warning_threshold']
    if 'warning_upper_threshold' in config:
        yaml_config['warning_upper_threshold'] = config['warning_upper_threshold']
    if 'warning_lower_threshold' in config:
        yaml_config['warning_lower_threshold'] = config['warning_lower_threshold']
    if 'occurrences_threshold' in config:
        yaml_config['occurrences_threshold'] = config['occurrences_threshold']
    yaml_config['url'] = config['url']
    if 'email' in config['alerts']:
        yaml_config['email'] = 'on'
        yaml_config['email_list'] = ','.join(config['alerts']['email'])
    if 'vo' in config['alerts']:
        yaml_config['vo'] = 'on'
        yaml_config['vo_list'] = ','.join(config['alerts']['vo'])
    if 'slack' in config['alerts']:
        yaml_config['slack'] = 'on'
        yaml_config['slack_list'] = ','.join(config['alerts']['slack'])
    if 'tags' in config:
        yaml_config['tags'] = ','.join(config['tags'])
    if config.get('query_type') == 'prometheus':
        yaml_config['prometheus_query'] = config['query']
        yaml_config['prometheus_url'] = config['prometheus_url']
        yaml_config['start_time'] = config['start_time']
        yaml_config['end_time'] = config['end_time']
    else:
        yaml_config['kairosdb_query'] = json.dumps(config['query'], sort_keys=True, indent=4, separators=(',', ': '))
    return yaml_config


def render_traceback(ex, ex_traceback):
    tb_lines = traceback.format_exception(ex.__class__, ex, ex_traceback)
    logger.exception("Exception")
    return '\n'.join(tb_lines)
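A minimal sketch of calling render_config() with form-style input; the field names follow the builder form in index.html below, and every value is a hypothetical placeholder:

from webapp.render import render_config

# Hypothetical form payload as the web UI would post it; fields the form
# leaves blank arrive as empty strings.
form = {
    "alert_name": "example_alert",
    "interval": "60",
    "email": "on",
    "email_list": "devops,netops",
    "critical_upper_threshold": "100.0",
    "prometheus_query": "",
    "kairosdb_query": '{"start_relative": {"value": "5", "unit": "minutes"}, "metrics": []}',
    "tags": "dc,fqdn",
    "url": "http://example-dashboard/",
}

status, rendered = render_config(form)
print(status)    # 0 on success, 1 on failure
print(rendered)  # the yaml written to alert_configs/example_alert.yaml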
14
AoM_Service/AoM_Configs/webapp/static/bootstrap-theme.min.css
vendored
Executable file
File diff suppressed because one or more lines are too long
14
AoM_Service/AoM_Configs/webapp/static/bootstrap.min.css
vendored
Executable file
File diff suppressed because one or more lines are too long
11
AoM_Service/AoM_Configs/webapp/static/bootstrap.min.js
vendored
Executable file
File diff suppressed because one or more lines are too long
29
AoM_Service/AoM_Configs/webapp/static/style.css
Executable file
@@ -0,0 +1,29 @@
body { font-family: sans-serif; background: #eee; }
a, h1, h2 { color: #377BA8; }
h1, h2 { font-family: 'Georgia', serif; margin: 0; }
h1 { border-bottom: 2px solid #eee; }
h2 { font-size: 1.2em; }

.page { margin: 2em auto; width: 45em; border: 5px solid #ccc;
        padding: 0.8em; background: white; }
.entries { list-style: none; margin: 0; padding: 0; }
.entries li { margin: 0.8em 1.2em; }
.entries li h2 { margin-left: -1em; }
.add-entry { font-size: 0.9em; border-bottom: 1px solid #ccc; }
.add-entry dl { font-weight: bold; }
.metanav { text-align: right; font-size: 0.8em; padding: 0.3em;
           margin-bottom: 1em; background: #fafafa; }
.flash { background: #CEE5F5; padding: 0.5em;
         border: 1px solid #AACBE2; }
.error { background: #F0D6D6; padding: 0.5em; }
/* .button { border-top: 2px solid #a3ceda;
          border-left: 2px solid #a3ceda;
          border-right: 2px solid #4f6267;
          border-bottom: 2px solid #4F6267;
          padding: 1px 20px !important;
          font-size: 14px !important;
          background-color: #CEE5F5;
          font-weight: bold;
          color: #2d525d; } */
.container { width: 500px; clear: both;}
28
AoM_Service/AoM_Configs/webapp/templates/debug.html
Executable file
@@ -0,0 +1,28 @@
{% extends "header.html" %}
{% block body %}
<h2>Form Elements</h2><br />
<table>
    {% for key, value in query.items() %}
    <tr>
        <th> {{ key }} </th>
        <td> {{ value }} </td>
    </tr>
    {% endfor %}
</table><br/>
<p>
    {{ query.alert_name }}
</p>
<h2>Rendered Config File</h2><br />
<p>{{ file_path }}</p>
<p>
    {% for line in file_contents %}
    <div>{{ line|safe }}</div>
    {% endfor %}
</p>
<br />
<form action="{{ url_for('re_build', alert_id=query.alert_name) }}" id="re_build" method="post">
    <p>
        <input type="submit" id="submit" class="btn btn-primary" value="Return to Form?">
    </p>
</form>
{% endblock %}
6
AoM_Service/AoM_Configs/webapp/templates/error.html
Executable file
@@ -0,0 +1,6 @@
{% extends "header.html" %}
{% block body %}
<h1>Error Rendering config:</h1>
<p>{{ message }}</p>
<p><a href="{{ url_for('index') }}">Return to Creation Page?</a></p>
{% endblock %}
67
AoM_Service/AoM_Configs/webapp/templates/header.html
Executable file
@@ -0,0 +1,67 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>Alerting On Metrics Yaml Builder</title>
    <link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='bootstrap.min.css') }}">
    <link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='style.css') }}">
    <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js"></script>
    <script src="{{ url_for('static', filename='bootstrap.min.js') }}"></script>
    <script type="text/javascript">
        function dynInput(cbox) {
            console.log(cbox);
            if (cbox.checked) {
                var input = document.createElement("input");
                input.type = "text";
                input.id = cbox.name + "_list";
                input.name = cbox.name + "_list";
                document.getElementById("insertinputs_" + cbox.name).appendChild(input);
            } else {
                document.getElementById(cbox.name + "_list").remove();
            }
        }

        function dynEnable(cbox) {
            console.log(cbox);
            var theId = "#" + cbox.name + "_list";
            console.log(theId);
            if (cbox.checked) {
                $(theId)[0].disabled = false;
            } else {
                $(theId)[0].disabled = true;
            }
        }

        function dynThreshold(cbox) {
            var theId = "#" + cbox.name + "_threshold";
            if (cbox.checked) {
                $(theId)[0].disabled = false;
            } else {
                $(theId)[0].disabled = true;
            }
        }

        // Global regexes so every space is replaced, not just the first one.
        function forceLower(strInput) {
            strInput.value = strInput.value.toLowerCase().replace(/ /g, "_");
        }

        function forceComma(strInput) {
            strInput.value = strInput.value.replace(/ /g, ",");
        }

        function forcePositive(strInput) {
            if (parseInt(strInput.value) <= 1) {
                strInput.value = 2;
            }
        }
    </script>
</head>
<body>

<div class="page">
    {% block body %}{% endblock %}
966
AoM_Service/AoM_Configs/webapp/templates/index.html
Executable file
@@ -0,0 +1,966 @@
{% extends "header.html" %}
{% block body %}
<form action="{{url_for('index')}}" id="builder" method="post" class="form-horizontal">

    <div class="row">
        <div class="col-sm-12">
            <h3 class="text-center">Alert Meta</h3>
        </div>
    </div>

    <!-- Alert Name -->
    <div class="form-group">
        <div class="col-sm-4">
            <label for="alert_name" class="control-label">Alert Name:</label>
        </div>
        <div class="col-sm-1">
            <!-- Button trigger modal -->
            <button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#alertidModal">info</button>

            <!-- Modal -->
            <div class="modal fade" id="alertidModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
                <div class="modal-dialog" role="document">
                    <div class="modal-content">
                        <div class="modal-header">
                            <h4 class="modal-title" id="myModalLabel">Alert Name</h4>
                        </div>
                        <div class="modal-body">
                            <p>The alert name acts as both the name of the .yaml file and the id for the alert. The alert name becomes part of what shows up in the title / subject when an alert is triggered</p>
                            <p>Picking an alert name that already exists will overwrite the .yaml configuration file, so be aware of what you choose</p>
                            <p>The alert name is also how this alert will show up in VictorOps, Slack and Email (depending on what options you choose for the alerting)</p>
                        </div>
                        <div class="modal-footer">
                            <button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
                        </div>
                    </div>
                </div>
            </div>
        </div>
        <div class="col-md-7">
            <input type="text" id="alert_name" class="form-control" name="alert_name" value="{{ alert_name }}" onkeyup="return forceLower(this);">
        </div>
    </div>

    <!-- Check Interval -->
    <div class="form-group">
        <div class="col-sm-4">
            <label class="control-label" for="interval">Check Interval: </label>
        </div>
        <div class="col-sm-1">
            <!-- Button trigger modal -->
            <button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#intervalModal">info</button>

            <!-- Modal -->
            <div class="modal fade" id="intervalModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
                <div class="modal-dialog" role="document">
                    <div class="modal-content">
                        <div class="modal-header">
                            <h4 class="modal-title" id="checkInterval">Check Interval</h4>
                        </div>
                        <div class="modal-body">
                            <p>The check interval is how often the check will run the query (in seconds) and measure the results</p>
                            <p>Anything less than 30 seconds will automatically be bumped up to 30 seconds. This is because metrics are collected every 30 seconds, so checking more often than this would just return the same values from the query, as nothing would have changed yet</p>
                        </div>
                        <div class="modal-footer">
                            <button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
                        </div>
                    </div>
                </div>
            </div>
        </div>

        <div class="col-md-7">
            <input type="number" id="interval" class="form-control" name="interval" value="{{ interval }}">
        </div>
    </div>

    <!-- Upper Critical Threshold -->
    <div class="form-group">
        <div class="col-sm-4">
            <label class="control-label" for="criticalUpperThreshold">Upper Critical Threshold: </label>
        </div>

        <div class="col-sm-1">
            <!-- Button trigger modal -->
            <button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#criticalUpperThresholdModal">info</button>

            <!-- Modal -->
            <div class="modal fade" id="criticalUpperThresholdModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
                <div class="modal-dialog" role="document">
                    <div class="modal-content">
                        <div class="modal-header">
                            <h4 class="modal-title" id="criticalUpperThresholdTitle">Critical Threshold</h4>
                        </div>
                        <div class="modal-body">
                            <p>A floating point or int value; when the results back from the query exceed this number, a critical alert will trigger.</p>
                            <p>Only critical alerts will also trigger email and Slack alerts (if set)</p>
                            <p>Your query needs to be simplified down to just one or two values per grouping (a start and end metric). The alerting system will look at all values per grouping and check if any of the values are over the threshold to send out an alert</p>
                        </div>
                        <div class="modal-footer">
                            <button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
                        </div>
                    </div>
                </div>
            </div>
        </div>

        <div class="col-sm-7">
            <input type="number" class="form-control" id="criticalUpperThreshold" name="critical_upper_threshold" value="{{ critical_upper_threshold }}" step="0.01" onkeypress="validate(event)">
        </div>
    </div>

    <!-- Lower Critical Threshold -->
    <div class="form-group">
        <div class="col-sm-4">
            <label class="control-label" for="criticalLowerThreshold">Lower Critical Threshold: </label>
        </div>

        <div class="col-sm-1">
            <!-- Button trigger modal -->
            <button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#criticalLowerThresholdModal">info</button>

            <!-- Modal -->
            <div class="modal fade" id="criticalLowerThresholdModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
                <div class="modal-dialog" role="document">
                    <div class="modal-content">
                        <div class="modal-header">
                            <h4 class="modal-title" id="criticalLowerThresholdTitle">Lower Critical Threshold</h4>
                        </div>
                        <div class="modal-body">
                            <p>A floating point or int value; when the results back from the query drop below this number, a critical alert will trigger.</p>
                            <p>Only critical alerts will also trigger email and Slack alerts (if set)</p>
                            <p>Your query needs to be simplified down to just one or two values per grouping (a start and end metric). The alerting system will look at all values per grouping and check if any of the values are past the threshold to send out an alert</p>
                        </div>
                        <div class="modal-footer">
                            <button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
                        </div>
                    </div>
                </div>
            </div>
        </div>

        <div class="col-sm-7">
            <input type="number" class="form-control" id="criticalLowerThreshold" name="critical_lower_threshold" value="{{ critical_lower_threshold }}" step="0.01" onkeypress="validate(event)">
        </div>
    </div>

    <!-- Upper Warning Threshold -->
    <div class="form-group">
        <div class="col-sm-4">
            <label class="control-label" for="warningUpperThreshold">Upper Warning Threshold: </label>
        </div>

        <div class="col-sm-1">
            <!-- Button trigger modal -->
            <button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#warningUpperThresholdModal">info</button>

            <!-- Modal -->
            <div class="modal fade" id="warningUpperThresholdModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
                <div class="modal-dialog" role="document">
                    <div class="modal-content">
                        <div class="modal-header">
                            <h4 class="modal-title" id="warningUpperThresholdTitle">Upper Warning Threshold</h4>
                        </div>
                        <div class="modal-body">
                            <p>A floating point or int value; when the results back from the query exceed this number, a warning alert will trigger.</p>
                            <p>Warnings will not trigger email or Slack alerts (if set)</p>
                            <p>Your query needs to be simplified down to just one or two values per grouping (a start and end metric). The alerting system will look at all values per grouping and check if any of the values are over the threshold to send out an alert</p>
                        </div>
                        <div class="modal-footer">
                            <button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
                        </div>
                    </div>
                </div>
            </div>
        </div>
        <div class="col-md-7">
            <div class="input-group">
                <span class="input-group-addon">
                    {% if warning_upper_threshold %}
                    {% set warning_upper_checked='checked' %}
                    {% else %}
                    {% set warning_upper_disabled='disabled' %}
                    {% endif %}
                    <input type="checkbox" name="warning_upper" id="warning_upper" aria-label="..." onclick="dynThreshold(this);" {{ warning_upper_checked }}>
                </span>
                <input type="number" name="warning_upper_threshold" class="form-control" id="warning_upper_threshold" value="{{ warning_upper_threshold }}" aria-label="..." step="0.01" {{ warning_upper_disabled }}>
            </div>
        </div>
    </div>

    <!-- Lower Warning Threshold -->
    <div class="form-group">
        <div class="col-sm-4">
            <label class="control-label" for="warningLowerThreshold">Lower Warning Threshold: </label>
        </div>

        <div class="col-sm-1">
            <!-- Button trigger modal -->
            <button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#warningLowerThresholdModal">info</button>

            <!-- Modal -->
            <div class="modal fade" id="warningLowerThresholdModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
                <div class="modal-dialog" role="document">
                    <div class="modal-content">
                        <div class="modal-header">
                            <h4 class="modal-title" id="warningLowerThresholdTitle">Lower Warning Threshold</h4>
                        </div>
                        <div class="modal-body">
                            <p>A floating point or int value; when the results back from the query drop below this number, a warning alert will trigger.</p>
                            <p>Warnings will not trigger email or Slack alerts (if set)</p>
                            <p>Your query needs to be simplified down to just one or two values per grouping (a start and end metric). The alerting system will look at all values per grouping and check if any of the values are past the threshold to send out an alert</p>
                        </div>
                        <div class="modal-footer">
                            <button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
                        </div>
                    </div>
                </div>
            </div>
        </div>
        <div class="col-md-7">
            <div class="input-group">
                <span class="input-group-addon">
                    {% if warning_lower_threshold %}
                    {% set warning_lower_checked='checked' %}
                    {% else %}
                    {% set warning_lower_disabled='disabled' %}
                    {% endif %}
                    <input type="checkbox" name="warning_lower" id="warning_lower" aria-label="..." onclick="dynThreshold(this);" {{ warning_lower_checked }}>
                </span>
                <input type="number" name="warning_lower_threshold" class="form-control" id="warning_lower_threshold" value="{{ warning_lower_threshold }}" aria-label="..." step="0.01" {{ warning_lower_disabled }}>
            </div>
        </div>
    </div>

    <!-- Occurrences -->
    <div class="form-group">
        <div class="col-sm-4">
            <label class="control-label" for="occurrences_threshold">Frequency: </label>
        </div>

        <div class="col-sm-1">
            <!-- Button trigger modal -->
            <button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#occurrencesModal">info</button>
            <!-- Modal -->
            <div class="modal fade" id="occurrencesModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
                <div class="modal-dialog" role="document">
                    <div class="modal-content">

                        <div class="modal-header">
                            <h4 class="modal-title" id="occurrencesTitle">Frequency</h4>
                        </div>
                        <div class="modal-body">
                            <p>The occurrences value, when set, determines how many times the alert has to exceed the threshold in order for an alert to trigger.</p>
                            <p>This is particularly useful for metrics that can be spiky and resolve quickly; using occurrences allows you to only be alerted when a spike is no longer spiking but maintaining that rate over a period of time</p>
                            <p>This is compared once every interval, so if your alert is set to 5 minutes with an occurrences value of 3, the threshold would have to be exceeded for 15 minutes before any alerts are sent out.</p>
                            <p>The occurrences value is optional; if not enabled, the service assumes that 1 query exceeding the threshold is enough to trigger alerts. In this way, setting an occurrences value of 1 and not enabling it do the same thing.</p>
                        </div>

                        <div class="modal-footer">
                            <button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
                        </div>
                    </div>
                </div>
            </div>
        </div>

        <div class="col-md-7">
            <div class="input-group">
                <span class="input-group-addon">
                    {% if occurrences_threshold and occurrences_threshold is number and occurrences_threshold > 1 %}
                    {% set occurrences_checked='checked' %}
                    {% else %}
                    {% set occurrences_disabled='disabled' %}
                    {% endif %}
                    <input type="checkbox" name="occurrences" id="occurrences" aria-label="..." onclick="dynThreshold(this);" {{ occurrences_checked }}>
                </span>
                <input type="number" name="occurrences_threshold" class="form-control" id="occurrences_threshold" value="{{ occurrences_threshold }}" aria-label="..." step="1" min="2" {{ occurrences_disabled }} onkeyup="return forcePositive(this);">
            </div>
        </div>
    </div>

    <!-- Tags -->
    <div class="form-group">
        <div class="col-sm-4">
            <label class="control-label" for="tags">Tags:</label>
        </div>

        <div class="col-sm-1">
            <!-- Button trigger modal -->
            <button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#tagsModal">info</button>

            <!-- Modal -->
            <div class="modal fade" id="tagsModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
                <div class="modal-dialog" role="document">
                    <div class="modal-content">
                        <div class="modal-header">
                            <h4 class="modal-title" id="tagsTitle">Tags</h4>
                        </div>
                        <div class="modal-body">
                            <p>A comma separated list of tags to include in the alert subject</p>
                            <p>In the event of an alert, the tags will be used to look up distinctive information and include it as part of the alert</p>
                            <p>For example, including the dc tag in an alert means that if an alert occurs, the alerting system will look up the dc value from the returned query and include it as part of the alert subject</p>
                            <p>These are the same tag values used to build KairosDB queries</p>
                        </div>

                        <div class="modal-footer">
                            <button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
                        </div>
                    </div>
                </div>
            </div>
        </div>

        <div class="col-md-7">
            <input type="text" name="tags" id="tags" class="form-control" value="{{ tags }}" onkeyup="return forceComma(this);">
        </div>
    </div>


    <div class="row">
        <div class="col-sm-12">
            <h3 class="text-center">Notifications</h3>
        </div>
    </div>

    <!-- VictorOps Alerts -->
    <div class="form-group">
        <div class="col-sm-4">
            <label class="control-label" for="vo">VictorOps Alert:</label>
        </div>

        <div class="col-sm-1">
            <!-- Button trigger modal -->
            <button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#voModal">info</button>

            <!-- Modal -->
            <div class="modal fade" id="voModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
                <div class="modal-dialog" role="document">
                    <div class="modal-content">
                        <div class="modal-header">
                            <h4 class="modal-title" id="voTitle">VictorOps Alert List</h4>
                        </div>
                        <div class="modal-body">
                            <p>A comma separated list of VictorOps routing keys</p>
                            <p>In the event of an alert, the ids listed here will receive a VictorOps alert</p>
                            <p>If the checkbox isn't selected, the values listed will be ignored when generating the .yaml config</p>
                        </div>

                        <div class="modal-footer">
                            <button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
                        </div>
                    </div>
                </div>
            </div>
        </div>
        <div class="col-md-7">
            <div class="input-group">
                <span class="input-group-addon">
                    {% if vo=="on" %}
                    {% set vo_checked='checked' %}
                    {% else %}
                    {% set vo_disabled='disabled' %}
                    {% endif %}
                    <input type="checkbox" name="vo" id="vo" aria-label="..." onclick="dynEnable(this);" {{ vo_checked }}>
                </span>
                <input type="text" class="form-control" name="vo_list" id="vo_list" aria-label="..." value="{{ vo_list }}" onkeyup="return forceComma(this);" {{ vo_disabled }}>
            </div>
        </div>
    </div>

    <!-- Email Alerts -->
    <div class="form-group">
        <div class="col-sm-4">
            <label class="control-label" for="email">Email Alert:</label>
        </div>

        <div class="col-sm-1">
            <!-- Button trigger modal -->
            <button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#emailModal">info</button>

            <!-- Modal -->
            <div class="modal fade" id="emailModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
                <div class="modal-dialog" role="document">
                    <div class="modal-content">
                        <div class="modal-header">
                            <h4 class="modal-title" id="emailTitle">Email Alert List</h4>
                        </div>
                        <div class="modal-body">
                            <p>A comma separated list of email names to send alerts to</p>
                            <p>In the event of an alert, the names listed here will receive an email alert</p>
                            <p>The alerting system appends an @qualtrics.com to the names listed here, so there is no need to include the @domain, as it's assumed all alerting emails go to a qualtrics address</p>
                            <p>Also, the SMTP server can only send to @qualtrics addresses anyway</p>
                            <p>For example, sending an email to both netops and devops on an alert would be <b>devops,netops</b> in the text box.</p>
                            <p>If the checkbox isn't selected, the values listed will be ignored when generating the .yaml config</p>
                        </div>

                        <div class="modal-footer">
                            <button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
                        </div>
                    </div>
                </div>
            </div>
        </div>
        <div class="col-md-7">
            <div class="input-group">
                <span class="input-group-addon">
                    {% if email=="on" %}
                    {% set email_checked='checked' %}
                    {% else %}
                    {% set email_disabled='disabled' %}
                    {% endif %}
                    <input type="checkbox" name="email" id="email" aria-label="..." onclick="dynEnable(this);" {{ email_checked }}>
                </span>
                <input type="text" name="email_list" class="form-control" id="email_list" value="{{ email_list }}" aria-label="..." onkeyup="return forceComma(this);" {{ email_disabled }}>
            </div>
        </div>
    </div>

    <!-- Slack Alert List -->
    <div class="form-group">
        <div class="col-sm-4">
            <label class="control-label" for="slack">Slack Alert:</label>
        </div>

        <div class="col-sm-1">
            <!-- Button trigger modal -->
            <button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#slackModal">info</button>

            <!-- Modal -->
            <div class="modal fade" id="slackModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
                <div class="modal-dialog" role="document">
                    <div class="modal-content">
                        <div class="modal-header">
                            <h4 class="modal-title" id="slackTitle">Slack Alert List</h4>
                        </div>
                        <div class="modal-body">
                            <p>A comma separated list of Slack names to send alerts to</p>
                            <p>In the event of an alert, the names listed here will receive a Slack alert from a slackbot</p>
                            <p>You must include a @ for direct message alerts and # for channel alerts</p>
                            <p>For example, if the DevOps team wanted to get an alert in Slack, the value in the text box would be <b>#devops</b>. If I wanted to also include a direct message as well, then the value would be <b>#devops,@codyc</b></p>
                            <p>Don't troll people with your metric alerts bombing people's Slack, it's unkind</p>
                            <p>If the checkbox isn't selected, the values listed will be ignored when generating the .yaml config</p>
                        </div>

                        <div class="modal-footer">
                            <button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
                        </div>
                    </div>
                </div>
            </div>
        </div>
        <div class="col-md-7">
            <div class="input-group">
                <span class="input-group-addon">
                    {% if slack=="on" %}
                    {% set slack_checked='checked' %}
                    {% else %}
                    {% set slack_disabled='disabled' %}
                    {% endif %}
                    <input type="checkbox" name="slack" id="slack" aria-label="..." onclick="dynEnable(this);" {{ slack_checked }}>
                </span>
                <span id="insertinputs_slack"></span>
                <input type="text" name="slack_list" class="form-control" id="slack_list" value="{{ slack_list }}" aria-label="..." onkeyup="return forceComma(this);" {{ slack_disabled }}>
            </div>
        </div>
    </div>


    <div class="row">
        <div class="col-sm-12">
            <h3 class="text-center">Dashboard</h3>
        </div>
    </div>

    <!-- Dashboard URL -->
    <div class="form-group">
        <div class="col-sm-4">
            <label class="control-label" for="url">Dashboard URL:</label>
        </div>

        <div class="col-sm-1">
            <!-- Button trigger modal -->
            <button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#dashboardModal">info</button>

            <!-- Modal -->
            <div class="modal fade" id="dashboardModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
                <div class="modal-dialog" role="document">
                    <div class="modal-content">
                        <div class="modal-header">
                            <h4 class="modal-title" id="dashboardTitle">Dashboard URL</h4>
                        </div>
                        <div class="modal-body">
                            <p>Most queries are built based on some dashboard already built in grafana</p>
                            <p>By including the URL to that dashboard, the oncall engineer receiving the alert will be able to click the link in the alert and get a better picture of what this alert is and how it relates to the datacenter</p>
                        </div>

                        <div class="modal-footer">
                            <button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
                        </div>
                    </div>
                </div>
            </div>
        </div>

        <div class="col-sm-7">
            <input type="text" name="url" id="url" class="form-control" value="{{ url }}">
        </div>
    </div>


    <div class="row">
        <div class="col-sm-12">
            <h3 class="text-center">Kairosdb Query</h3>
        </div>
    </div>

    <!-- KairosDB Query -->
    <div class="form-group">
        <div class="col-sm-4">
            <label class="control-label" for="kairosdb_query">KairosDB Query:</label>
        </div>

        <div class="col-sm-1">
            <!-- Button trigger modal -->
            <button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#queryModal">info</button>

            <!-- Modal -->
            <div class="modal fade" id="queryModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
                <div class="modal-dialog" role="document">
                    <div class="modal-content">
                        <div class="modal-header">
                            <h4 class="modal-title" id="queryTitle">KairosDB Query</h4>
                        </div>
                        <div class="modal-body">
                            <p>Paste in your KairosDB query that you have already worked out.</p>
                            <p>You can generate your query by going to the <a href="http://kairosdb-metrics.service.eng.consul:8080/" target="_blank">KairosDB UI in eng</a></p>
                            <p>When generating your metric you will want to get the return values down to just 1 or 2 results per grouping. This can be done by sending the query to the MAX or MIN aggregators (depending on your logic needs) as the last aggregator in the query</p>
                            <p>You will also want to include a time offset; typically 5 minutes is used for the start (as in from 5 minutes ago to now). Setting the MAX aggregator to this value is typical</p>
                            <p>Once you have generated your query and it's returning the results you expect, click the <b>Show Query</b> button on the KairosDB UI and copy the results into this field</p>
                        </div>

                        <div class="modal-footer">
                            <button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
                        </div>
                    </div>
                </div>
            </div>
        </div>

        <div class="col-sm-7">
            <textarea name="kairosdb_query" id="kairosdb_query" class="form-control" rows="12" cols="50">{{ kairosdb_query }}</textarea>
        </div>
    </div>

    <div class="row">
        <div class="col-sm-12">
            <h3 class="text-center">Prometheus Query</h3>
        </div>
    </div>

    <!-- Prometheus URL -->
    <div class="form-group">
        <div class="col-sm-4">
            <label for="prometheus_url" class="control-label">Prometheus URL:</label>
        </div>
        <div class="col-sm-1">
            <!-- Button trigger modal -->
            <button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#prometheusurlModal">info</button>

            <!-- Modal -->
            <div class="modal fade" id="prometheusurlModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
                <div class="modal-dialog" role="document">
                    <div class="modal-content">
                        <div class="modal-header">
                            <h4 class="modal-title" id="myModalLabel">Prometheus URL</h4>
                        </div>
                        <div class="modal-body">
                            <p>URL for the prometheus server</p>
                            <p>Shared, production Prometheus URLs are currently:
                                <ul>
                                    <li>http://big-trickster.service.eng.consul:9090</li>
                                </ul>
                            </p>
                        </div>

                        <div class="modal-footer">
                            <button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
                        </div>
                    </div>
                </div>
            </div>
        </div>
        <div class="col-md-7">
            <input type="text" id="prometheus_url" class="form-control" name="prometheus_url" value="{{ prometheus_url }}" onkeyup="return forceLower(this);">
        </div>
    </div>

    <!-- Prometheus Query -->
    <div class="form-group">
        <div class="col-sm-4">
            <label class="control-label" for="prometheus_query">Prometheus Query:</label>
        </div>

        <div class="col-sm-1">
            <!-- Button trigger modal -->
            <button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#prometheusQueryModal">info</button>

            <!-- Modal -->
            <div class="modal fade" id="prometheusQueryModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
                <div class="modal-dialog" role="document">
                    <div class="modal-content">
                        <div class="modal-header">
                            <h4 class="modal-title" id="queryTitle">Prometheus Query</h4>
                        </div>
                        <div class="modal-body">
                            <p>Paste in your Prometheus query that you have already worked out.</p>
                            <p>You can generate your query by going to the url of your prometheus endpoint. Eng Vis plans on adding a smart router for this in the future so all instances will be exposed via a single smart proxy, but for now you'll need to know the name.</p>
                            <p><a href="http://big-trickster.service.eng.consul:9090/graph" target="_blank">Prometheus Host Metrics UI in eng</a></p>
                            <p><a href="http://big-trickster.service.eng.consul:9090/graph" target="_blank">Prometheus StatsD and other Metrics UI in eng</a></p>
                            <p>When creating a query, keep in mind a single returned value is going to be the most useful, so queries like "topk(1,yourmetrics)" are good choices. However, if your query has multiple return values AOM will use the last value.</p>
                            <p>So if you use a step/duration of 60 and a timespan of 300 between start and end, you'll get back 5 values and the last will be used.</p>
                            <p><a href="https://prometheus.io/docs/prometheus/latest/querying/functions/" target="_blank">Prometheus Functions</a></p>
                            <p><a href="https://prometheus.io/docs/prometheus/latest/querying/operators/" target="_blank">Prometheus Operators</a></p>
                        </div>

                        <div class="modal-footer">
                            <button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
                        </div>
                    </div>
                </div>
            </div>
        </div>

        <div class="col-sm-7">
            <textarea name="prometheus_query" id="prometheus_query" class="form-control" rows="12" cols="50">{{ prometheus_query }}</textarea>
        </div>
    </div>

    <!-- Start Time -->
    <div class="form-group">
        <div class="col-sm-4">
            <label class="control-label" for="start_time">Start Time: </label>
        </div>
        <div class="col-sm-1">
            <!-- Button trigger modal -->
            <button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#startTimeModal">info</button>

            <!-- Modal -->
            <div class="modal fade" id="startTimeModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
                <div class="modal-dialog" role="document">
                    <div class="modal-content">
                        <div class="modal-header">
                            <h4 class="modal-title" id="startTime">Start Time</h4>
                        </div>
                        <div class="modal-body">
                            <p>This should be a relative time in seconds like '-600' for 10m; defaults to '-300'</p>
                        </div>

                        <div class="modal-footer">
                            <button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
                        </div>
                    </div>
                </div>
            </div>
        </div>

        <div class="col-md-7">
            <input type="text" id="start_time" class="form-control" name="start_time" value="{{ start_time }}">
        </div>
    </div>

    <!-- End Time -->
    <div class="form-group">
        <div class="col-sm-4">
            <label class="control-label" for="end_time">End Time: </label>
        </div>
        <div class="col-sm-1">
            <!-- Button trigger modal -->
            <button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#endTimeModal">info</button>

            <!-- Modal -->
            <div class="modal fade" id="endTimeModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
                <div class="modal-dialog" role="document">
                    <div class="modal-content">
                        <div class="modal-header">
                            <h4 class="modal-title" id="endTime">End Time</h4>
                        </div>
                        <div class="modal-body">
                            <p>This can be 'now' (default) or some relative offset like '-30' in seconds</p>
                        </div>

                        <div class="modal-footer">
                            <button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
                        </div>
                    </div>
                </div>
            </div>
        </div>

        <div class="col-md-7">
            <input type="text" id="end_time" class="form-control" name="end_time" value="{{ end_time }}">
        </div>
    </div>

    <div class="row">
        <div class="col-sm-12">
            <h3 class="text-center">Actions</h3>
        </div>
    </div>

    <!-- Load Config File -->
    <div class="form-group">
        <div class="col-sm-4">
            <label class="control-label" for="loadFile">Load Config From File:</label>
        </div>

        <div class="col-sm-1">
            <!-- Button trigger modal -->
            <button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#loadModal">info</button>

            <!-- Modal -->
            <div class="modal fade" id="loadModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
                <div class="modal-dialog" role="document">
                    <div class="modal-content">
                        <div class="modal-header">
                            <h4 class="modal-title" id="loadTitle">Load Config from file</h4>
                        </div>
                        <div class="modal-body">
                            <p>Load a config already generated to file into the UI</p>

                            <p>This is handy when you need to make minor changes to a query, add additional alerting values, or change thresholds. Or if you are just terrified of yaml.</p>
                            <p>Hit the drop down to see a list of all alert configs (the names generated from the values used in the Alert Name field). Hit Go and the config will load into all the fields</p>
                        </div>

                        <div class="modal-footer">
                            <button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
                        </div>
                    </div>
                </div>
            </div>
        </div>

        <div class="col-md-7">
            <div class="input-group">
                <select name="loadFile" id="loadFile" class="form-control">
                    <option value="" selected></option>
                    {% for f in alert_list %}
                    <option value="{{ f }}">{{ f }}</option>
                    {% endfor %}
                </select>
                <span class="input-group-btn">
                    <input type="submit" name="generate" id="submitFiles" class="btn btn-primary" value="Go">
                </span>

            </div>
        </div>
    </div>

    <!-- Submit Form -->
    <div class="form-group">
        <div class="col-sm-4">
            <label class="control-label" for="submit">Generate YAML:</label>
        </div>
        <div class="col-sm-1">
            <!-- Button trigger modal -->
            <button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#generateModal">info</button>

            <!-- Modal -->
            <div class="modal fade" id="generateModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
                <div class="modal-dialog" role="document">
                    <div class="modal-content">
                        <div class="modal-header">
                            <h4 class="modal-title" id="generateTitle">Generate Alert Config</h4>
                        </div>
                        <div class="modal-body">
                            <p>When you are ready to take the values in the form and generate an alert config .yaml file,
||||||
|
hit the button</p>
|
||||||
|
<p>This will generate a .yaml file based on the alert name. So for example if one
|
||||||
|
was to
|
||||||
|
have the value <b>mcp_errors_per_dc</b> as an alert name, the resulting file
|
||||||
|
would be
|
||||||
|
<b>mcp_errors_per_dc.yaml</b></p>
|
||||||
|
<p>This <b>will</b> overwrite a .yaml file if the alert name is the same as an
|
||||||
|
already
|
||||||
|
existing file</p>
|
||||||
|
<p>If there are any errors generating the config, the resulting page will include
|
||||||
|
the error
|
||||||
|
message and give you the ability to return back to this page with your form
|
||||||
|
saved</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="modal-footer">
|
||||||
|
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="col-md-7">
|
||||||
|
<input type="submit" id="submit" name='generate' class='btn btn-primary' value="generate"
|
||||||
|
class="button">
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</form>
|
||||||
|
{% endblock %}
|
||||||
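A minimal sketch of the "last value wins" behaviour the query modal above describes, using Prometheus's documented range-query HTTP API; the server URL and metric name are placeholders, not the real service values:

import time
import requests

PROM_URL = "http://prometheus.example:9090"  # placeholder URL

def last_value(query, start_offset=-300, step=60):
    """Fetch a range of samples and return only the newest one, as the modal says AOM does."""
    end = time.time()
    start = end + start_offset  # e.g. -300 -> five minutes ago
    resp = requests.get(
        PROM_URL + "/api/v1/query_range",
        params={"query": query, "start": start, "end": end, "step": step},
        timeout=30,
    )
    resp.raise_for_status()
    series = resp.json()["data"]["result"][0]  # first (ideally only) series
    # a 300s span with a 60s step yields 5 samples; thresholds are compared
    # against the final sample only
    return float(series["values"][-1][1])

print(last_value("topk(1, yourmetrics)"))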
4
AoM_Service/AoM_Configs/webapp/templates/layout.html
Executable file
@@ -0,0 +1,4 @@
{% extends "header.html" %}
{% block body %}
<h2>Complete all values in the form below</h2>
{% endblock %}
69
AoM_Service/AoM_Configs/webapp/views.py
Executable file
@@ -0,0 +1,69 @@
# views.py

import glob
import json
import os

import yaml
from flask import session

from library.logger import AlertLogging
from webapp import app, render_template, request, render

logger = AlertLogging('aom')
logger.start()
logger.start_log_file("logs/aom_service.log")


@app.route('/', methods=['GET', 'POST'])
def index():
    logger.debug("Request Method: {}".format(request.method))
    if request.method == 'GET':
        # GET BLOB OF FILES
        service_config = yaml.safe_load(open('service.yaml', 'r').read())
        alert_list = sorted([os.path.splitext(os.path.basename(x))[0] for x in
                             glob.glob(service_config['alert_folder'] + "/*.yaml")])
        if 'yaml_config' in session:
            return render_template('index.html', **json.loads(session['yaml_config']), alert_list=alert_list)
        else:
            return render_template('index.html', alert_list=alert_list)
    elif request.method == 'POST':
        logger.info("Got a form")
        if 'go' in request.form['generate'].lower():
            return re_build(request.form['loadFile'])
        yaml_config = dict()
        ret = ''
        try:
            for field_name, value in request.form.items():
                yaml_config[field_name] = value
            code, ret = render.render_config(yaml_config)
            assert code == 0
            return render_template('debug.html', query=yaml_config,
                                   file_path='alert_configs/{}.yaml'.format(yaml_config['alert_name']),
                                   file_contents=ret.split('\n'))
        except AssertionError:
            session['yaml_config'] = json.dumps(yaml_config)
            return render_template('error.html', message="Failed to render to file: {}".format(ret))
        except Exception as e:
            return render_template('error.html', message=str(e))


@app.route('/build/<alert_id>', methods=['POST'])
def re_build(alert_id):
    # READ IN CONFIG FROM ID
    config = render.render_yaml(alert_id)
    service_config = yaml.safe_load(open('service.yaml', 'r').read())
    alert_list = sorted([os.path.splitext(os.path.basename(x))[0] for x in
                         glob.glob(service_config['alert_folder'] + "/*.yaml")])
    return render_template('index.html', **config, alert_list=alert_list)


@app.route("/debug/")
def toggle_debug():
    if logger.debug_handler:
        logger.stop_debug()
        logger.info("Debug Stopped")
    else:
        logger.start_debug()
        logger.debug("Debug Started")
    return index()
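A quick way to poke the builder webapp once it is running; port 8080 is the default from args.py, and localhost is an assumption:

import requests

base = "http://localhost:8080"
print(requests.get(base + "/").status_code)        # form page, expect 200
print(requests.get(base + "/debug/").status_code)  # toggles debug logging, returns the form again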
3
AoM_Service/AoM_Configs/webapp_requirements.txt
Executable file
@@ -0,0 +1,3 @@
requests
pyaml
flask
8
AoM_Service/README.md
Executable file
@@ -0,0 +1,8 @@
# IMPORTANT NOTICE:

Alert configurations have been moved to
[AlertOnMetrics](https://gitlab-app.eng.qops.net/engvis/AlertOnMetricsConfigs).

This gives the project more flexibility. Merge requests will
be automatically validated, merged and deployed if they pass the
validation stage.
20
AoM_Service/alert_configs/engine.yaml
Executable file
@@ -0,0 +1,20 @@
---
id: sleeper_agents_milleniumfalcon_engine_failing
service: core
alerts:
  slack:
    - '#breel_testing_alerts'
  vo:
    - gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
suppressed_occurrences_threshold: 24
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_engine_failing) by (dc)
tags:
  - dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
service_dependencies: ['fuel']
18
AoM_Service/alert_configs/fuel.yaml
Executable file
@@ -0,0 +1,18 @@
---
id: sleeper_agents_milleniumfalcon_fuellevel_low
service: fuel
alerts:
  slack:
    - '#breel_testing_alerts'
  vo:
    - gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_fuellevel_low) by (dc)
tags:
  - dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
20
AoM_Service/alert_configs/lightspeed.yaml
Executable file
@@ -0,0 +1,20 @@
---
id: sleeper_agents_milleniumfalcon_lightspeed_unavailable
service: captain
alerts:
  slack:
    - '#breel_testing_alerts'
  vo:
    - gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
suppressed_occurrences_threshold: 48
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_lightspeed_unavailable) by (dc)
tags:
  - dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
service_dependencies: ['core']
20
AoM_Service/alert_configs/shields.yaml
Executable file
@@ -0,0 +1,20 @@
---
id: sleeper_agents_milleniumfalcon_shields_unavailable
service: core
alerts:
  slack:
    - '#breel_testing_alerts'
  vo:
    - gobs-mm
critical_upper_threshold: 1.0
interval: 5
suppressed_occurrences_threshold: 54
start_time: '-60'
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_shields_unavailable) by (dc)
tags:
  - dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
service_dependencies: ['fuel']
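For reference, a minimal sketch of reading one of the configs above the way the service does and applying its critical_upper_threshold to a sample value; the sample value is invented and the >= comparison is an assumption (the real comparison lives in serviceapp):

import yaml

with open('alert_configs/engine.yaml') as f:
    cfg = yaml.safe_load(f)

value = 2.0  # pretend this is the last sample from the Prometheus query
# >= is an assumed comparison; it is only here to show the field in use
if 'critical_upper_threshold' in cfg and value >= cfg['critical_upper_threshold']:
    print("CRITICAL: {} breached {}".format(cfg['id'], cfg['critical_upper_threshold']))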
81
AoM_Service/aom_service.py
Executable file
@@ -0,0 +1,81 @@
#!/usr/bin/python3
""" Alert On Metrics Project"""

import logging
import multiprocessing
import json
import base64
import os
import subprocess
from time import time, sleep
import requests
import yaml
from sanic import Sanic, response
from library.args import get_service_args
from library.config import glob_the_configs
from library.logger import AlertLogging
from library.service import Service

LOG = AlertLogging('aom')
LOG.start()
LOG.start_log_file("logs/aom_service.log")

APP = Sanic()
SERVICE_JOB = multiprocessing.Value('i', 0)
NUM_JOBS = multiprocessing.Value('i', 0)
LEADERSHIP = multiprocessing.Value('i', 0)
LEADER_STATUS = None
LEADER_TIME = None
CONSUL_URL = None
LEADER_OVERRIDE = None
HOSTNAME = None
SERVICE_CONFIG = None


@APP.route("/")
async def index(_):
    """
    Return total number of jobs
    """
    return response.json({"job_count": NUM_JOBS.value})


@APP.route('/healthcheck')
async def health(request):
    """
    Sanic healthcheck so that consul and friends see this as a service
    Returns:
        json object of status: ok
    """
    LOG.debug("healthcheck")
    service_process = multiprocessing.Process(
        target=start_service,
        args=(LOG, SERVICE_CONFIG['alert_reload_interval']),
        name="service", daemon=False)
    # TRY TO START SERVICE, IF LEADER AND NOT RUNNING
    if SERVICE_JOB.value == 0:
        LOG.info("Starting alerts background job")
        SERVICE_JOB.value += 1
        service_process.start()
    return response.json({"status": "ok"}, 200)


def start_service(log, reload_interval):
    s = Service(log, reload_interval, HOSTNAME, SERVICE_CONFIG)
    s.start()


if __name__ == "__main__":
    # GET ARGS AND START LOGGING
    ARGS = get_service_args()
    logging.setLoggerClass(AlertLogging)
    LOG.info("Starting Service")
    # GET SERVICE CONFIG
    LEADER_OVERRIDE = ARGS['override']
    HOSTNAME = ARGS['hostname']
    SERVICE_CONFIG = yaml.safe_load(open('service.yaml', 'r').read())
    if ARGS['alert_configs'] is not None:
        SERVICE_CONFIG['alert_folder'] = ARGS['alert_configs']
    if ARGS['alert_routing_lookup'] is not None:
        SERVICE_CONFIG['alert_routing_lookup'] = ARGS['alert_routing_lookup']
    # SET CONSUL URL FOR LEADER CHECK
    CONSUL_URL = SERVICE_CONFIG['consul_url']
    # START THE MAIN SERVICE
    APP.run(host="0.0.0.0", port=ARGS['port'])
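With the service up, the two routes above can be exercised like so; localhost and the default port from args.py are assumptions:

import requests

base = "http://localhost:8080"
print(requests.get(base + "/healthcheck").json())  # {"status": "ok"}; also kicks off the background job
print(requests.get(base + "/").json())             # {"job_count": <number of alert processes>}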
121
AoM_Service/aom_test.py
Executable file
@@ -0,0 +1,121 @@
import json
import time

import requests
import yaml

service_config = yaml.safe_load(open('service.yaml', 'r').read())
kairos_url = service_config['kairosdb_url'] + "api/v1/datapoints/"
kairos_query = kairos_url + "query"
metrics_list = []
status1 = "RECOVERY"
status2 = "WARNING"
status3 = "CRITICAL"

json_string1 = """{"name": "aom_test_metric","datapoints": """
json_string2 = ""","tags": {"host": "aom_host","data_center": "AOM"},"ttl": 500}"""

# WRITE ALERT CONFIG FILE

alert_file = {'alerts': {'sensu': {'slack': 'aom_test_channel'}},
              'critical_lower_threshold': 100,
              'critical_upper_threshold': 5000,
              'id': 'test_metric',
              'interval': 30,
              'occurrences_threshold': 1,
              'query': {'cache_time': 0,
                        'end_relative': {'unit': 'seconds', 'value': '30'},
                        'metrics': [{'name': 'aom_test_metric', 'tags': {}}],
                        'start_relative': {'unit': 'seconds', 'value': '60'}},
              'tags': {},
              'url': 'AOM_TESTING',
              'warning_lower_threshold': 1000,
              'warning_upper_threshold': 2000}

query_intro = """{
    "metrics": [
        {
            "tags": {
                "alert": [
                    "test_metric"
                ]
            },
            "name": "telegraf.aom_"""

query_outro = """_value",
            "aggregators": [
                {
                    "name": "sum",
                    "align_sampling": true,
                    "sampling": {
                        "value": "9",
                        "unit": "minutes"
                    },
                    "align_start_time": false
                }
            ]
        }
    ],
    "cache_time": 0,
    "start_relative": {
        "value": "8",
        "unit": "minutes"
    }
}"""


def main():
    # noinspection PyBroadException
    try:
        with open('alert_configs/test.yaml', 'w') as yaml_file:
            yaml.dump(alert_file, yaml_file, default_flow_style=False)
    except Exception:
        print("Error writing alert config file")
        return False

    now = int(time.time() * 1000)
    metrics_list.append([now, 1501])
    now += 32000
    metrics_list.append([now, 202])
    now += 32000
    metrics_list.append([now, 23])
    now += 32000
    metrics_list.append([now, 1504])
    now += 32000
    metrics_list.append([now, 2005])
    now += 32000
    metrics_list.append([now, 5006])
    now += 32000
    metrics_list.append([now, 1507])
    full_string = json_string1 + str(metrics_list) + json_string2
    try:
        ret = requests.post(kairos_url, data=json.dumps(json.loads(full_string)), timeout=200)
        assert ret.status_code == 204, "Wrong status code received from KairosDB"
    except AssertionError as e:
        print("Error: {}".format(str(e)))
    except Exception as e:
        print("Problem talking to KairosDB: {}".format(str(e)))
        return False
    print("Metrics sent to KairosDB. Check alerts in the #aom_test_channel in Slack")
    time.sleep(360)

    try:
        ret = requests.post(kairos_query, data=json.dumps(json.loads(query_intro + status1 + query_outro)), timeout=200)
        print("Recovery {}".format(dict(ret.json())['queries'][0]['results'][0]['values'][0][1]))
        assert dict(ret.json())['queries'][0]['results'][0]['values'][0][1] == 2, "Wrong RECOVERY result"
        ret = requests.post(kairos_query, data=json.dumps(json.loads(query_intro + status2 + query_outro)), timeout=200)
        print("Warning {}".format(dict(ret.json())['queries'][0]['results'][0]['values'][0][1]))
        assert dict(ret.json())['queries'][0]['results'][0]['values'][0][1] == 2, "Wrong WARNING result"
        ret = requests.post(kairos_query, data=json.dumps(json.loads(query_intro + status3 + query_outro)), timeout=200)
        print("Critical {}".format(dict(ret.json())['queries'][0]['results'][0]['values'][0][1]))
        assert dict(ret.json())['queries'][0]['results'][0]['values'][0][1] == 4, "Wrong CRITICAL result"
    except AssertionError as e:
        print("Error: {}".format(str(e)))
    except Exception as e:
        print("Problem getting results from KairosDB: {}".format(str(e)))
        return False
    return True


if __name__ == '__main__':
    main()
0
AoM_Service/library/__init__.py
Executable file
66
AoM_Service/library/alert_config.py
Executable file
@@ -0,0 +1,66 @@
class Alert_Config():
    def __init__(self, yaml_config) :
        if not 'alert_tags' in yaml_config :
            yaml_config['alert_tags'] = {}
        self.id = str(yaml_config['id'])
        self.yaml_config = yaml_config
        self.tags = {}
        self.state = {}

    def type(self) :
        if 'type' in self.yaml_config :
            return self.yaml_config['type']
        return 'kairos'

    def get_tags(self) :
        # renamed from tags(): the attribute self.tags shadowed a method of
        # the same name, and callers in serviceapp/alert.py use get_tags()
        if 'tags' in self.yaml_config :
            return self.yaml_config['tags']
        return []

    def occurrences(self) :
        if 'occurrences_threshold' in self.yaml_config :
            return self.yaml_config['occurrences_threshold']
        return 1

    def url(self) :
        if 'url' in self.yaml_config :
            return self.yaml_config['url']
        from os import environ
        return environ['AOM_GRAFANA_URL'] + self.id

    def get_level(self, key) :
        if not key in self.state :
            self.state[key] = None
        return self.state[key]

    def set_level(self, key, value) :
        self.state[key] = value

    def get_for_tags(self, key) :
        if not key in self.tags :
            self.tags[key] = 0
        return self.tags[key]

    def set_for_tags(self, key, value) :
        if not key in self.tags :
            self.tags[key] = 0
        self.tags[key] = value

    def init_for_tags(self, key) :
        # the loop originally tested and set `key`; using the loop variable
        # is the assumed intent
        for k in [key, key+"_count"] :
            if not k in self.tags :
                self.set_for_tags(k, 0)
        self.set_for_tags(key+"_noresult", 0)

    def get_threshold(self, isUpper, isWarning) :
        # self was missing from the original signature
        if isUpper and isWarning :
            return self.try_get_yaml_config('warning_upper_threshold')
        if isUpper and not isWarning :
            return self.try_get_yaml_config('critical_upper_threshold')
        elif not isUpper and isWarning :
            return self.try_get_yaml_config('warning_lower_threshold')
        elif not isUpper and not isWarning :
            return self.try_get_yaml_config('critical_lower_threshold')

    def try_get_yaml_config(self, key) :
        return self.yaml_config[key] if key in self.yaml_config else None, key in self.yaml_config
36
AoM_Service/library/alert_config_list.py
Executable file
@@ -0,0 +1,36 @@
from alert_config import Alert_Config


class Alert_Config_List() :
    def __init__(self, alert_configs=None) :
        self.hash = {}
        if alert_configs :
            self.add(alert_configs)

    def __getitem__(self, k) :
        return self.hash[k]

    def __len__(self) :
        return len(self.hash)

    def add(self, alert_config) :
        if isinstance(alert_config, Alert_Config):
            self.hash[alert_config.id] = alert_config
        elif isinstance(alert_config, list) :
            for a in alert_config :
                self.add(a)
        elif isinstance(alert_config, Alert_Config_List) :
            for k in alert_config.hash :
                self.add(alert_config.hash[k])
        else :
            raise Exception("unexpected type added to Alert_Config_List")

    def compare(self, other) :
        if not other :
            other = Alert_Config_List()
        self_keys = self.hash.keys()
        other_keys = other.hash.keys()
        added = other_keys - self_keys
        removed = self_keys - other_keys
        intersection = [i for i in self_keys if i in other_keys]
        modified = [ i for i in intersection if self[i] != other[i] ]
        return set(added), set(removed), set(modified)
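compare() diffs two snapshots of the config folder; a small usage sketch with invented ids:

from alert_config import Alert_Config
from alert_config_list import Alert_Config_List

old = Alert_Config_List([Alert_Config({'id': 'a'}), Alert_Config({'id': 'b'})])
new = Alert_Config_List([Alert_Config({'id': 'b'}), Alert_Config({'id': 'c'})])

added, removed, modified = old.compare(new)
print(added, removed)  # {'c'} {'a'}
# note: Alert_Config defines no __eq__, so "modified" falls back to identity
# comparison and unchanged ids loaded into fresh objects show up as modified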
163
AoM_Service/library/args.py
Executable file
@@ -0,0 +1,163 @@
# Contains the arg parser options.
"""Contains the arg parser options."""


import argparse
import sys


def get_builder_args():
    """
    Gets the arguments passed in to the aom_builder main call

    :return: parser object
    """
    parser = argparse.ArgumentParser(
        description="Generates a valid yaml file "
                    "for alerting on metrics. If you are "
                    "familiar with the yaml structure for an "
                    "alert you don't have to use this builder,"
                    " it's just convenient")
    parser.add_argument('-q', '--query', help="The Kairosdb query string to "
                                              "use")
    parser.add_argument(
        '-i', '--interval', type=int, default=60, help="The "
        "interval that the check will run at. This value is in seconds")
    parser.add_argument('-t', '--threshold', '--upperthreshold', help="The "
                        "upper threshold is the value that when reached will "
                        "cause an alert depending on the threshold logic. "
                        "Use in conjunction with lower threshold to define a "
                        "normal band.")
    parser.add_argument(
        '-b',
        '--lowerthreshold',
        help="The lower threshold is the value that when reached will cause an "
             "alert depending on the threshold logic. "
             "Use in conjunction with upper threshold to define a normal band.")
    parser.add_argument(
        '-m',
        '--measure',
        choices=[
            'gt',
            'lt',
            'eq'],
        help="The measure to use to compare the "
             "threshold to the values of the alerts")
    parser.add_argument(
        '-a',
        '--alert_config',
        help='A valid Yaml representation of your alerting block')
    parser.add_argument(
        '-l',
        '--log_level',
        type=int,
        default=0,
        help="The log level for the aom_builder run. "
             "[0=Error, 1=Info, 2=Debug]")
    parser.add_argument(
        '-p',
        '--port',
        type=int,
        default=8080,
        help="The port to run the webapp on")

    return args_to_dict(parser)


def get_tester_service_args():
    """
    Gets arguments passed into aom_tester.py
    Returns: parser object
    """
    parser = argparse.ArgumentParser(
        description="Parameters to start the alerting on metrics dummy tester "
                    "service")
    parser.add_argument(
        '-l',
        '--log_level',
        type=int,
        default=0,
        help="The log level for the aom_service app "
             "[0=Error, 1=Info, 2=Debug]")
    parser.add_argument(
        '-a',
        '--alert_configs',
        default=None,
        help="If provided will override the folder location read from the "
             "config with the value passed in. Is helpful for testing and "
             "troubleshooting alerts")
    parser.add_argument(
        '--hostname',
        help="If provided, will override the actual hostname check with this "
             "value")
    parser.add_argument(
        '-p',
        '--port',
        type=int,
        default=8080,
        help="The port to run the webapp on")
    return args_to_dict(parser)


def get_service_args():
    """
    Gets arguments passed into aom_service.py
    Returns: parser object
    """
    parser = argparse.ArgumentParser(
        description="Parameters to start the alerting on metrics service")
    parser.add_argument(
        '-l',
        '--log_level',
        type=int,
        default=0,
        help="The log level for the aom_service app "
             "[0=Error, 1=Info, 2=Debug]")
    parser.add_argument(
        '-a',
        '--alert_configs',
        default=None,
        help="If provided will override the folder location read from the "
             "config with the value passed in. Is helpful for testing and "
             "troubleshooting alerts")
    parser.add_argument(
        '--alert_routing_lookup',
        default=None,
        help="If provided will override the folder used to fetch the alerts "
             "lookup configuration.")
    parser.add_argument(
        '-o',
        '--override',
        action='store_true',
        help="Overrides the check leader election value")
    parser.add_argument(
        '--hostname',
        help="If provided, will override the actual hostname check with this "
             "value")
    parser.add_argument(
        '-p',
        '--port',
        type=int,
        default=8080,
        help="The port to run the webapp on")
    return args_to_dict(parser)


def args_to_dict(parsed_args):
    """
    Converts the argument parser object to a dict
    Args:
        parsed_args: Arg parser object
    Returns:
        Dictionary of arguments
    """
    try:
        arg_list = parsed_args.parse_args()
        # RETURN A DICT OF ARGUMENTS
        arg_dict = dict()
        for val in vars(arg_list):
            arg_dict[val] = getattr(arg_list, val)
        return arg_dict
    except argparse.ArgumentError:
        parsed_args.print_help()
        sys.exit(1)
AoM_Service/library/config.py
Executable file
226
AoM_Service/library/config.py
Executable file
@@ -0,0 +1,226 @@
|
# config.py
"""Functions for loading alert configuration files"""
import glob
import os
import json
import hashlib
import yaml
import requests
from serviceapp import service

# import logging
# logger = logging.getLogger(__name__)


def md5(fname):
    """Calculates md5 hash of a file's contents"""
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


def get_healthy_nodes_and_index(consul_url, hostname, logger):
    """Find AOM healthy nodes on consul"""
    # initialised up front so a Consul timeout cannot leave them undefined
    node_list = []
    healthy_nodes = []
    host_index = -1
    try:
        # getting all registered nodes from consul
        r = requests.get(
            consul_url +
            '/v1/catalog/service/alert-on-metrics',
            timeout=60)
        assert r.status_code == 200, "Failed to get back a 200 from consul catalog"

        value = json.loads(r.text)
        for elem in value:
            node_list.append(elem.get('Node'))

        # Retrieving healthy nodes
        for node in node_list:
            r2 = requests.get(
                consul_url +
                '/v1/health/node/' +
                node,
                timeout=60)
            # the original re-checked r here; r2 is the response for this node
            assert r2.status_code == 200, "Failed to get back a 200 from consul health"
            healthcheck_list = json.loads(r2.text)
            for check in healthcheck_list:
                if (check.get('CheckID') == 'check_healthcheck_alert-on-metrics_alert-on-metrics' and
                        check.get('Status') == 'passing'):
                    healthy_nodes.append(node)

        try:
            healthy_nodes.sort()
            host_index = healthy_nodes.index(hostname)
        except ValueError:
            logger.error("Host is not healthy")
    except TimeoutError:
        logger.error("Timed out connecting to Consul")
    return host_index, len(healthy_nodes)


def distribute_configs(
        filename,
        host_index,
        module,
        logger):
    """Uses md5 of alert config to split the files among healthy servers"""
    if module == 0:
        logger.error("No healthy nodes for the service")
        return False
    if host_index == -1:
        logger.error("Host is unhealthy")
        return False
    if int(md5(filename), 16) % module == host_index:
        return True
    return False


def is_valid(alert_config, logger):
    """Checks if alert has all required fields"""
    try:
        assert alert_config['alerts'], "No Alerts configured, this is a dead config"
        assert alert_config['query'], "No Query, this is a dead config"
        assert alert_config['interval'] >= 30, "Intervals less than 30 are invalid"
        assert alert_config['id'], "Alert ID is empty, this is a dead config"
        if alert_config.get('query_type') == 'prometheus':
            assert isinstance(
                alert_config['query'], str), "Invalid Prometheus query"
        else:
            assert isinstance(
                alert_config['query'], dict), "Kairosdb Query string cannot be validated as proper JSON"
            defined_tags = set(alert_config['query']['metrics'][0]['tags'].keys()).union(
                {'', 'dc', 'fqdn'})
            # IF THERE IS AGGREGATION WE HAVE TO ADD THESE TAGS
            if 'group_by' in alert_config['query']['metrics'][0]:
                defined_tags.update(
                    set(alert_config['query']['metrics'][0]['group_by'][0]['tags']))
            # for undefined_tag in set(alert_config['tags']).difference(defined_tags):
            #     print("WARNING! {} tag is not defined on the query. Please make sure it does exist to "\
            #           "prevent empty results".format(undefined_tag))
        # OUR MINIMUM THRESHOLD NEED
        assert 'critical_lower_threshold' in alert_config or 'critical_upper_threshold' in alert_config or \
            'warning_lower_threshold' in alert_config or 'warning_upper_threshold' in alert_config, \
            "Config must have at least one threshold set."

        # JUST MAKE SURE YOU ARE NOT DOING SOMETHING STUPID WITH WARNING COMING
        # AFTER CRITICAL
        if 'warning_lower_threshold' in alert_config and 'critical_lower_threshold' in alert_config:
            assert alert_config['critical_lower_threshold'] < alert_config['warning_lower_threshold'], \
                "Lower Critical must be less than Lower Warning"
        if 'warning_upper_threshold' in alert_config and 'critical_upper_threshold' in alert_config:
            assert alert_config['critical_upper_threshold'] > alert_config['warning_upper_threshold'], \
                "Upper Critical must be greater than Upper Warning"

        if 'lookup' in alert_config['alerts']:
            assert 'default' in alert_config['alerts']['lookup'], 'No default alert configured for the lookup configuration'
            assert 'lookup_file' in alert_config['alerts']['lookup'] or 'lookups' in alert_config['alerts'][
                'lookup'], 'No lookup configured either in the alert configuration or in a separated file'
            assert 'tags' in alert_config['alerts']['lookup'], 'No tags configured for the lookup configuration'
            assert all(
                isinstance(
                    tag, str) for tag in alert_config['alerts']['lookup']['tags']), 'Tags must be valid string'

        # if 'occurrences_threshold' in alert_config:
        #     assert alert_config['occurrences_threshold'] >= 1, \
        #         "Having an occurrences value less than 2 is assumed and pointless to specify"
    except Exception as e:
        logger.warning("Invalid config file: {}".format(str(e)))
        return False
    return True


def is_valid_alert_routing_lookup(alert_routing_lookup, alert, logger):
    """Check if routing lookup is properly configured"""
    try:
        assert alert_routing_lookup, "No lookup values configured, the configuration is empty."
        for alert_routing in alert_routing_lookup:
            assert 'alert' in alert_routing, "No alert defined for this configuration."
            assert 'tags' in alert_routing, "No tags value defined for this configuration."
            for tag in alert_routing['tags']:
                assert tag in alert['alerts']['lookup']['tags'], "The tag {} is not part of the configuration".format(
                    tag)
            assert all(isinstance(tag, str)
                       for tag in alert_routing['tags']), "Tags must be valid string"
    except AssertionError as e:
        logger.warning("Invalid alert routing config file: {}".format(str(e)))
        return False
    return True


# noinspection PyBroadException
def glob_the_configs(
        config_path,
        lookup_config_path,
        consul_url,
        hostname,
        logger):
    """
    Args:
        config_path (string): relative path to the configs
        lookup_config_path (string): relative path to the routing lookup configs
        consul_url (string): url to consul service
        hostname (string): hostname used to find this node's shard
        logger:
    Returns:
        List of configs
    """
    invalid_configs = 0
    alert_list = []
    host_index, module = get_healthy_nodes_and_index(
        consul_url, hostname, logger)
    for config_file in glob.glob(config_path + "/**/*.yaml", recursive=True):
        logger.debug("Found {} config".format(config_file))
        # LOAD CONFIG
        if distribute_configs(
                config_file,
                host_index,
                module,
                logger):
            try:
                alert = yaml.safe_load(open(config_file, 'rb').read())
                if is_valid(alert, logger):
                    if 'lookup' in alert['alerts']:
                        alert_routing_lookup = []
                        is_valid_lookup = True
                        if 'lookup_file' in alert['alerts']['lookup']:
                            lookup_path = "{}/{}".format(
                                lookup_config_path, alert['alerts']['lookup']['lookup_file'])
                            if os.path.isfile(lookup_path):
                                alert_routing_lookup = yaml.safe_load(
                                    open(lookup_path, 'rb').read())
                            else:
                                is_valid_lookup = False
                        else:
                            alert_routing_lookup = alert['alerts']['lookup']['lookups']

                        is_valid_lookup = is_valid_lookup and is_valid_alert_routing_lookup(
                            alert_routing_lookup, alert, logger)

                        if is_valid_lookup:
                            alerts_per_tags = {}
                            for alert_configuration in alert_routing_lookup:
                                key = []
                                for tag in alert['alerts']['lookup']['tags']:
                                    key.append(
                                        alert_configuration['tags'].get(tag))
                                alerts_per_tags[tuple(
                                    key)] = alert_configuration['alert']
                            alert['alert_routing_lookup'] = alerts_per_tags
                        else:
                            invalid_configs += 1
                            continue
                    alert_list.append(alert)
                else:
                    invalid_configs += 1
            except BaseException as e:
                logger.error("Error parsing {} config: {}".format(config_file, e))
    logger.info("Invalid configs: {}".format(invalid_configs))
    service.send_stat(
        'invalid_configs',
        invalid_configs,
        dict(),
        statprefix='aom')
    logger.info("Loaded {} configs".format(len(alert_list)))
    return alert_list
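distribute_configs is simple md5 sharding: every node globs the same files, but only keeps the ones whose hash lands on its index. A standalone sketch of the arithmetic; file names are invented, and for brevity it hashes the names where the service hashes file contents:

import hashlib

files = ["engine.yaml", "fuel.yaml", "lightspeed.yaml", "shields.yaml"]
healthy_nodes = 3  # "module" in config.py is the modulus

for host_index in range(healthy_nodes):
    mine = [f for f in files
            if int(hashlib.md5(f.encode()).hexdigest(), 16) % healthy_nodes == host_index]
    print(host_index, mine)
# every file lands on exactly one node; when the healthy-node count changes,
# the shards are recomputed on the next reload pass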
10
AoM_Service/library/job.py
Executable file
@@ -0,0 +1,10 @@
import subprocess


class Job() :
    def __init__(self, id, p):
        self.id = id
        self.p = p

    def kill(self) :
        subprocess.call(["/bin/kill", "-9", "{}".format(self.p.pid)])
        self.p.join()
29
AoM_Service/library/job_list.py
Executable file
@@ -0,0 +1,29 @@
from job import Job


class Job_List() :
    def __init__(self) :
        self.jobs = {}

    def __getitem__(self, k) :
        return self.jobs[k]

    def __setitem__(self, k, v) :
        self.jobs[k] = v

    def __len__(self) :
        return len(self.jobs)

    def add(self, job) :
        if isinstance(job, Job) :
            self[job.id] = job
        elif isinstance(job, Job_List) :
            for j in job.jobs :
                self.add(job[j])
        else :
            raise Exception("unexpected type added to Job_List")

    def kill(self, id) :
        if not id in self.jobs :
            return
        self[id].kill()
        del(self.jobs[id])
122
AoM_Service/library/logger.py
Executable file
@@ -0,0 +1,122 @@
# logger.py
""" Logging configuration """


import logging
import logging.handlers
import os

logging.getLogger('requests').setLevel(logging.ERROR)
logging.getLogger('urllib3').setLevel(logging.ERROR)
logging.getLogger('werkzeug').setLevel(logging.ERROR)


class SingleLevelFilter(logging.Filter):
    def __init__(self, passlevel, reject):
        """
        Initializer (constructor) of the SingleLevelFilter
        @param passlevel (int) - the int value of the level of the log
        @param reject (bool) - if true, passes only records whose level is
            not equal to the passlevel
        @return SingleLevelFilter object
        @note Sets some object parameters
        """
        self.passlevel = passlevel
        self.reject = reject

    def filter(self, record):
        """
        Returns True/False depending on parameters
        @param record (LogRecord) - the record that the filter is applied to
        @return bool - True/False depending on what self.reject is set to and
            what record.levelno and self.passlevel are set to
        @note This causes either only logging of the exact same level to get
            logged, or only logging other than the same level to get logged
        """
        if self.reject:
            return record.levelno != self.passlevel
        return record.levelno == self.passlevel


class AlertLogging(logging.Logger):
    """
    Class Object to handle the logging of the alert on metrics service.
    Starts at Error level and can flip on (and add) an additional log file and
    Debug logger as needed.
    """

    def __init__(self, name):
        """
        Inits the formatters and logger
        """
        self.name = name
        # handlers default to None so the debug/log-file toggles can test
        # them before they have ever been started
        self.log_handler = None
        self.debug_handler = None
        self.debug_formatter = logging.Formatter(
            "%(asctime)s - [%(levelname)s] - [%(module)s:%(lineno)d] - "
            "%(message)s", "%m-%d %H:%M:%S")

        self.standard_formatter = logging.Formatter(
            "%(asctime)s - [%(levelname)s] - %(message)s", "%m-%d %H:%M:%S")
        logging.getLogger()
        logging.Logger.__init__(self, name, logging.DEBUG)
        logging.setLoggerClass(AlertLogging)

    def start(self):
        """
        Attaches a console handler at INFO level
        Returns:
            self, so construction and start can be chained
        """
        info_handler = logging.StreamHandler()
        info_handler.setLevel(logging.INFO)
        info_handler.setFormatter(self.standard_formatter)
        self.addHandler(info_handler)
        return self

    def start_log_file(self, file_path, mode='a'):
        """
        Creates a separate log file handler
        Args:
            file_path: path to the log file
            mode: the type of mode to open the file handler with
        Returns:

        """
        self.log_path = file_path
        work_folder = os.path.dirname(file_path)
        if work_folder and not os.path.exists(work_folder):
            os.makedirs(work_folder)
        self.log_handler = logging.FileHandler(file_path, mode)
        self.log_handler.setLevel(logging.WARNING)
        self.log_handler.setFormatter(self.debug_formatter)
        self.addHandler(self.log_handler)

    def stop_log_file(self):
        """
        Closes Log file and sets the handler to None
        Returns:

        """
        self.log_handler.close()
        self.removeHandler(self.log_handler)
        self.log_handler = None

    def start_debug(self):
        """
        Attaches a console handler that passes only DEBUG records
        Returns:

        """
        self.debug_handler = logging.StreamHandler()
        self.debug_handler.setLevel(logging.DEBUG)
        self.debug_handler.addFilter(SingleLevelFilter(logging.DEBUG, False))
        self.debug_handler.setFormatter(self.debug_formatter)
        self.addHandler(self.debug_handler)

    def stop_debug(self):
        """
        Stops the debug handler
        Returns:

        """
        self.removeHandler(self.debug_handler)
        self.debug_handler = None
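Typical lifecycle, matching how aom_service.py and views.py use this class:

from library.logger import AlertLogging

log = AlertLogging('aom').start()           # console INFO handler
log.start_log_file("logs/aom_service.log")  # WARNING and above to file
log.info("visible on console")
log.start_debug()                           # extra DEBUG-only console handler
log.debug("visible while debug is toggled on")
log.stop_debug()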
14
AoM_Service/library/process.py
Executable file
@@ -0,0 +1,14 @@
import multiprocessing


class Process(multiprocessing.Process) :
    def __init__(self, alert_config, config, logger, production_mode) :
        multiprocessing.Process.__init__(
            self,
            target=self.get_target(),
            args=(alert_config, config, logger, production_mode),
            name=alert_config.id,
            daemon=True,
        )

    def get_target(self) :
        raise Exception("abstract method not implemented")
14
AoM_Service/library/process_factory.py
Executable file
@@ -0,0 +1,14 @@
import process_prometheus
import process_kairos


class Process_Factory() :
    def __init__(self, config, logger, production) :
        self.config = config
        self.logger = logger
        self.production = production

    def build(self, alert_config) :
        if alert_config.type() == "prometheus" :
            return process_prometheus.Process_Prometheus(alert_config, self.config, self.logger, self.production)
        else:
            return process_kairos.Process_Kairos(alert_config, self.config, self.logger, self.production)
6
AoM_Service/library/process_kairos.py
Executable file
@@ -0,0 +1,6 @@
import process
from serviceapp import service


class Process_Kairos(process.Process) :
    def get_target(self) :
        return service.check_kairosdb_alert
6
AoM_Service/library/process_prometheus.py
Executable file
@@ -0,0 +1,6 @@
import process
from serviceapp import service


class Process_Prometheus(process.Process) :
    def get_target(self) :
        return service.check_prometheus_alert
80
AoM_Service/library/service.py
Executable file
@@ -0,0 +1,80 @@
import os
from alert_config_list import Alert_Config_List
from alert_config import Alert_Config
from job_list import Job_List
from job import Job
from process_factory import Process_Factory
from time import sleep
from config import glob_the_configs
from serviceapp import service


class Service() :
    def __init__(self, logger, reload_interval, hostname, config):
        self.alert_config_list = Alert_Config_List()
        self.job_list = Job_List()
        self.logger = logger
        self.info = self.logger.info
        self.error = self.logger.error
        self.reload_interval = reload_interval
        self.box_hostname = os.environ['HOSTNAME'] if hostname is None else hostname
        self.production = not "TEST" in os.environ
        self.config = config

    def start(self) :
        self.info("Waiting 15s for Consul service to pass")
        sleep(15)
        while self.is_running() :
            new_alert_config_list = self.get_new_alert_config_list()
            self.purge_stale(new_alert_config_list)
            self.create_upserted(new_alert_config_list)
            self.alert_config_list = new_alert_config_list
            total_jobs = len(self.job_list)
            self.info("Total running jobs: {}".format(total_jobs))
            service.send_stat('total_jobs', total_jobs, dict(), statprefix='aom')
            sleep(self.reload_interval)
        self.info("Exiting alerts")
        self.purge_stale(Alert_Config_List())

    def is_running(self) :
        return True

    def get_new_alert_config_list(self) :
        try :
            yaml_configs = self.parse_alert_config_files()
            alert_configs = [Alert_Config(i) for i in yaml_configs]
            return Alert_Config_List(alert_configs)
        except Exception as e :
            self.error("Failed to load config files: {}".format(e))
            # return an empty list object, not [], so compare() keeps working
            # on the next pass
            return Alert_Config_List()

    def parse_alert_config_files(self) :
        path = self.config['alert_folder']
        routing = self.config['alert_routing_config']
        consul = 'http://consul.service.consul:8500'
        return glob_the_configs(path, routing, consul, self.box_hostname, self.logger)

    def purge_stale(self, new_alert_config_list) :
        _, removed_ids, modified_ids = self.alert_config_list.compare(new_alert_config_list)
        stale_ids = removed_ids.union(modified_ids)
        for stale_id in stale_ids :
            self.job_list.kill(stale_id)
        service.send_stat('removed_jobs', len(removed_ids), dict(), statprefix='aom')
        self.info("Removed alert_configs: {}".format(removed_ids))

    def create_upserted(self, new_alert_config_list) :
        added_ids, _, modified_ids = self.alert_config_list.compare(new_alert_config_list)
        upserted_ids = added_ids.union(modified_ids)
        for id in upserted_ids :
            p = self.spawn_process(new_alert_config_list[id])
            j = Job(id, p)
            self.job_list.add(j)
        service.send_stat('new_jobs', len(added_ids), dict(), statprefix='aom')
        service.send_stat('modified_jobs', len(modified_ids), dict(), statprefix='aom')
        self.info("Added alert_configs: {}".format(added_ids))
        self.info("Modified alert_configs: {}".format(modified_ids))

    def spawn_process(self, alert_config) :
        process_factory = Process_Factory(self.config, self.logger, self.production)
        process = process_factory.build(alert_config)
        process.start()
        return process
0
AoM_Service/library/serviceapp/__init__.py
Executable file
189
AoM_Service/library/serviceapp/alert.py
Executable file
@@ -0,0 +1,189 @@
|
|||||||
|
from thresholds import Thresholds
|
||||||
|
|
||||||
|
class Alert() :
|
||||||
|
def __init__(self, alert_config, logger, tags, result, min_value, max_value) :
|
||||||
|
self.occurrences_breached = False
|
||||||
|
self.new_level_breached = False
|
||||||
|
self.info = logger.info
|
||||||
|
self.debug = logger.debug
|
||||||
|
self.warning = logger.warning
|
||||||
|
self.error = logger.error
|
||||||
|
self.alert_config = alert_config
|
||||||
|
self.thresholds = Thresholds(alert_config)
|
||||||
|
self.tags = ""
|
||||||
|
self.result = result
|
||||||
|
self.set_tags(tags)
|
||||||
|
self.alert_config.init_for_tags(alert_config.get_tags())
|
||||||
|
self.set_firing(min_value, max_value)
|
||||||
|
if availability :
|
||||||
|
self.info("Sending availability stat 1")
|
||||||
|
        self.send_metrics(self.name(), 0 if self.level() == "CRITICAL" else 1, self.result, 'service_level')

    def name(self) :
        return "Metric: {} for {}".format(self.alert_config.id, self.get_tags())

    def body(self) :
        body = ""
        if not self.get_firing() :
            body = self.get_not_firing_body()
        else :
            body = self.get_is_firing_body()
        self.debug("Alert {}->[{}]->{}, Occurrences={} of {}".format(
            self.name(),
            self.get_tags(),
            self.level(),
            self.get_occurrences(),
            self.alert_config.occurrences(),
        ))
        self.send_metrics(self.name(), self.level_code(), self.level())
        # TODO: decide whether the md5 suffix belongs here or in the caller
        return body, md5(self.get_tags().encode('utf-8')).hexdigest()[:10]

    def level(self) :
        if not self.get_firing() :
            return "RECOVERY"
        if [t for t in self.thresholds.get_thresholds_matching(level=Thresholds.CRITICAL)] :
            return "CRITICAL"
        if [t for t in self.thresholds.get_thresholds_matching(level=Thresholds.WARNING)] :
            return "WARNING"

    def level_code(self) :
        # 0 = RECOVERY/OK, 1 = WARNING, 2 = CRITICAL (matches the sensu_status
        # mapping in serviceapp/service.py)
        level = self.level()
        if level == "RECOVERY" :
            return 0
        elif level == "WARNING" :
            return 1
        elif level == "CRITICAL" :
            return 2

    def get_not_firing_body(self) :
        body = ""
        body += self.get_not_firing_body_threshold()
        body += self.get_not_firing_body_occurrences()
        if not body :
            # Nothing to report; reset the per-tag occurrence count
            self.alert_config.set_for_tags(self.get_tags()+"_count", 0)
            return ""
        return "GOOD: " + body

    def get_not_firing_body_threshold(self) :
        if self.result is None :
            return ""
        body = ""
        v, ok = self.alert_config.get_threshold(isUpper=True, isWarning=True)
        if not ok :
            v, ok = self.alert_config.get_threshold(isUpper=True, isWarning=False)
        if ok :
            body += self.form("<", v)
        v, ok = self.alert_config.get_threshold(isUpper=False, isWarning=True)
        if not ok :
            v, ok = self.alert_config.get_threshold(isUpper=False, isWarning=False)
        if ok :
            body += self.form(">", v)
        return body

    def get_not_firing_body_occurrences(self) :
        if not self.get_occurrences() :
            return ""
        body = ""
        if self.result is not None :
            self.send_metrics(self.name(), 1, self.level())
        else :
            body += "{} RECOVERY due to no results found from query. Recommend you manually validate recovery\n{}".format(self.name(), self.alert_config.url())
        self.set_occurrences(force=0)
        return body

    def get_is_firing_body(self) :
        body = ""
        if self.thresholds.get_breached(end=Thresholds.UPPER) :
            body += self.form(">", self.upper_firing)
        if self.thresholds.get_breached(end=Thresholds.LOWER) :
            body += self.form("<", self.lower_firing)
        if not self.occurrences_breached :
            self.debug("Value {} of {} for tag {} has occurred {} time(s) < threshold of {}".format(
                self.value,
                self.name(),
                self.get_tags(),
                self.get_occurrences(),
                self.alert_config.occurrences(),
            ))
            return ""
        return body

    def form(self, operator, static) :
        return "{}\n{:.2f} {}= {}\n{}".format(
            self.name(),
            self.value,
            operator,
            static,
            self.alert_config.url(),
        )

    def set_tags(self, tags) :
        if tags :
            self.tags = tags
        elif self.result :
            import itertools
            result_tags = [ self.result['tags'][x] for x in self.alert_config.get_tags() ]
            chain = itertools.chain(*result_tags)
            sorted_list = sorted(list(chain))
            self.tags = ", ".join(sorted_list)
        if not self.tags :
            self.tags = "instance"

    def get_tags(self) :
        return self.tags

    def set_firing(self, min_value, max_value) :
        self.thresholds = Thresholds(self.alert_config)
        self.thresholds.set_breached(min_value, max_value)
        self.set_occurrences()
        self.set_new_level_breached()
        self.send_metrics()
        self.send_threshold_metrics()

    def get_firing(self) :
        return self.thresholds.get_breached() and self.occurrences_breached

    def get_occurrences(self) :
        tags = self.get_tags()
        return self.alert_config.get_for_tags(tags)

    def set_occurrences(self, force=None) :
        previous_occurrences = self.get_occurrences()
        if self.thresholds.get_breached() :
            new_occurrences = previous_occurrences+1
            self.alert_config.set_for_tags(self.get_tags(), new_occurrences)
            self.occurrences_breached = self.alert_config.occurrences() <= new_occurrences
        if force is not None :
            # force=0 is used to clear state after a recovery
            self.alert_config.set_for_tags(self.get_tags(), force)
            self.alert_config.set_for_tags(self.get_tags()+"_count", force)

    def send_metrics(self, *args, **kwargs) :
        print("send_metrics not impl")

    def set_new_level_breached(self) :
        key = self.get_tags()
        level = self.level()
        previous_level = self.alert_config.get_level(key)
        self.new_level_breached = level != previous_level
        self.alert_config.set_level(key, level)
        self.info("testInfo: {} {}".format(
            "NEW" if self.new_level_breached else "EXISTING",
            self.level(),
        ))

    def get_new_level_breached(self) :
        return self.new_level_breached

    def send_threshold_metrics(self) :
        # TODO
        self.send_metrics(self.alert_config.id, self.value)
        for level in [Thresholds.WARNING, Thresholds.CRITICAL] :
            for end in [Thresholds.UPPER, Thresholds.LOWER] :
                v, ok = self.alert_config.get_threshold(isUpper=end == Thresholds.UPPER, isWarning=level == Thresholds.WARNING)
                if ok :
                    key = "{}_{}_threshold".format(
                        "upper" if end == Thresholds.UPPER else "lower",
                        "warning" if level == Thresholds.WARNING else "critical",
                    )
                    self.send_stat(key, v, {'id':self.name()})
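
# Usage sketch (hedged, illustrative only): how a caller might consume an
# Alert. The constructor signature matches the one exercised in test_alert.py;
# `alert_config` is whatever config object the factory was built with.
#
#     al = Alert(alert_config, logger, tags, result, min_value, max_value)
#     subject = al.name()        # "Metric: <id> for <tags>"
#     text, suffix = al.body()   # alert text plus a 10-char md5 of the tags,
#                                # suitable for a per-tag Sensu check name
#     code = al.level_code()     # 0 = RECOVERY, 1 = WARNING, 2 = CRITICAL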
13
AoM_Service/library/serviceapp/alert_factory.py
Executable file
@@ -0,0 +1,13 @@
from alert import Alert


class Alert_Factory() :
    def __init__(self, alert_config, logger) :
        self.alert_config = alert_config
        self.logger = logger
        self.info = logger.info
        self.warning = logger.warning
        self.debug = logger.debug
        self.error = logger.error

    def build(self, minvalue, maxvalue, result, tags, availability, alert_tags) :
        return Alert(self.alert_config, tags, result, minvalue, maxvalue)
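
# Usage sketch (hedged): mirrors the construction in test_alert_factory.py.
#
#     factory = Alert_Factory(alert_config, logger)
#     al = factory.build(minvalue=0, maxvalue=5, result=None,
#                        tags="tagA, tagB", availability=False, alert_tags=None)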
83
AoM_Service/library/serviceapp/prom_api.py
Executable file
@@ -0,0 +1,83 @@
from datetime import datetime, timedelta
from urllib.parse import urljoin

import requests


class PromAPI:
    def __init__(self, endpoint='http://127.0.0.1:9090/'):
        """
        :param endpoint: address of the Prometheus server, e.g. 'http://host:9090/'
        """
        self.endpoint = endpoint

    @staticmethod
    def _to_timestamp(input_):
        """
        Convert the input to an ISO 8601 timestamp string for Prometheus
        :param input_: a datetime, the string 'now', a unix timestamp
            (int/float/str), or a negative number of seconds relative to now
        :return: an ISO 8601 string (callers append 'Z')
        """
        if type(input_) == datetime:
            return input_.isoformat('T')
        if input_ == 'now':
            return datetime.utcnow().isoformat('T')
        if type(input_) is str:
            input_ = float(input_)
        if type(input_) in [int, float]:
            if input_ > 0:
                # an absolute unix timestamp
                return datetime.utcfromtimestamp(input_).isoformat('T')
            if input_ == 0:  # return now
                return datetime.utcnow().isoformat('T')
            if input_ < 0:
                # seconds relative to now
                return (datetime.utcnow() + timedelta(seconds=input_)).isoformat('T')

    def query(self, query='prometheus_build_info'):
        return self._get(
            uri='/api/v1/query',
            params=dict(
                query=query
            )
        )

    def query_range(self, query='prometheus_build_info', start=-60, end='now', duration=60):
        """Get series data over a time range"""
        params = {
            'query': query
        }
        if end is not None:
            params['end'] = self._to_timestamp(end) + 'Z'
        if start:
            params['start'] = self._to_timestamp(start) + 'Z'
        if duration:
            params['step'] = duration
        return self._get(
            uri='/api/v1/query_range',
            params=params
        )

    def series(self, match='prometheus_build_info', start=-86400, end='now'):
        """Get the series matching a label selector over a time range"""
        params = {
            'match[]': match
        }
        if end is not None:
            params['end'] = self._to_timestamp(end) + 'Z'
        if start:
            params['start'] = self._to_timestamp(start) + 'Z'
        return self._get(
            uri='/api/v1/series',
            params=params
        )

    def _get(self, uri, params, method='GET'):
        url = urljoin(self.endpoint, uri)
        assert method == 'GET'
        result = requests.get(
            url=url,
            params=params
        )
        return result.json()
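
# Usage sketch (hedged): exercises the client against a local Prometheus. The
# endpoint and the 'up' metric are assumptions; substitute your own server and
# query. Guarded so importing this module stays side-effect free.
if __name__ == '__main__':
    api = PromAPI(endpoint='http://127.0.0.1:9090/')
    # Instant query: the latest value of every 'up' series
    print(api.query(query='up'))
    # Range query: the last 10 minutes at a 60-second step; start/end accept
    # datetimes, unix timestamps, or negative second offsets (see _to_timestamp)
    print(api.query_range(query='up', start=-600, end='now', duration=60))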
949
AoM_Service/library/serviceapp/service.py
Executable file
@@ -0,0 +1,949 @@
""" Alert On Metrics functions"""
|
||||||
|
|
||||||
|
import copy
|
||||||
|
import itertools
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import smtplib
|
||||||
|
from email.mime.text import MIMEText
|
||||||
|
from socket import gaierror
|
||||||
|
from time import sleep
|
||||||
|
from hashlib import md5
|
||||||
|
import requests
|
||||||
|
from statsd import StatsClient
|
||||||
|
from serviceapp.prom_api import PromAPI
|
||||||
|
|
||||||
|
alert_status = [
|
||||||
|
'RECOVERY',
|
||||||
|
'WARNING',
|
||||||
|
'WARNING',
|
||||||
|
'CRITICAL',
|
||||||
|
'CRITICAL',
|
||||||
|
'CRITICAL']
|
||||||
|
|
||||||
|
|
||||||
|
def build_alert_message(alert, minvalue, maxvalue, result, logger,
                        availability, tag=None, alert_tags=None):
    """
    Build the alert message

    Args:
        alert: the alert object that includes a tag definition
        minvalue: the min value to test against the threshold
        maxvalue: the max value to test against the threshold
        result: the response back from kairosdb
        logger (log object): does the logging
        availability: send availability stat 1
        tag: if passed in, will use this value for the tag instead of
            getting it from the result object
        alert_tags: the tags corresponding to the result, used if an
            alert has to be triggered and a custom routing per tag is configured

    Returns:
        Alert message string
    """
    # DEFAULT TO MAX VALUE AS THE VALUE WE WILL ALERT ON. LOGIC BELOW
    # MAY CHANGE THIS.
    # value = maxvalue

    # # HANDLE THE CASE WHERE SOMEONE HAS NOT SPECIFIED ANY TAGS IN THEIR QUERY
    # # (USUALLY A GLOBAL ALL-DC QUERY)
    # if tag is None and result is not None:
    #     tag = ', '.join(sorted(list(itertools.chain(
    #         *[result['tags'][x] for x in alert['tags']]))))
    # tag_count = tag + "_count"
    # WE WILL USE THIS ONE LATER FOR TRACKING OCCURRENCES OF KAIROSDB NOT
    # RETURNING RESULTS
    # tag_noresult = tag + "_noresult"
    # if not tag:
    #     tag = 'instance'
    #     logger.debug("No tag specified for alert {}".format(alert['id']))
    # INSTEAD OF TRYING TO HANDLE LOGIC WHERE THESE ARE NOT IN THE OBJECT, PUT
    # THEM IN AS SOON AS THEY ARE CREATED SO THAT ON FIRST RUN AN ALERT HAS ALL
    # THE ALERT['alert_tags'][TAG] AND ALERT['alert_tags'][TAG_COUNT] NEEDED
    # if 'alert_tags' not in alert:
    #     alert['alert_tags'] = {}
    # if tag not in alert['alert_tags']:
    #     alert['alert_tags'][tag] = 0
    # if tag_count not in alert['alert_tags']:
    #     alert['alert_tags'][tag_count] = 0
    # IF WE HIT THIS FUNCTION THEN WE ALWAYS SET (OR RESET) THIS NORESULT
    # COUNTER TO 0 IE. IF WE ARE HERE IT IMPLIES WE HAVE A RESULT FROM
    # KAIROSDB OR WE ARE AT THE END OF A LONG PERIOD OF NORESULTS WHERE WE ARE
    # CLEARING EVERYTHING OUT ANYWAY
    # alert['alert_tags'][tag_noresult] = 0

    # # FIRST FIND OUT WHAT THRESHOLDS ARE SET AND HAVE BEEN BREACHED
    # upper_critical_threshold = None
    # upper_warning_threshold = None
    # lower_warning_threshold = None
    # lower_critical_threshold = None
    # upper_threshold = None
    # lower_threshold = None
    # is_warning_alarm = False
    # is_critical_alarm = False

    # # UPPER
    # upper_threshold_exists = False
    # upper_warning_threshold_breached = False
    # upper_critical_threshold_breached = False
    # if 'warning_upper_threshold' in alert:
    #     upper_threshold_exists = True
    #     upper_warning_threshold = alert['warning_upper_threshold']
    #     upper_threshold = upper_warning_threshold
    #     if maxvalue >= upper_warning_threshold:
    #         upper_warning_threshold_breached = True
    #         is_warning_alarm = True
    # if 'critical_upper_threshold' in alert:
    #     upper_critical_threshold = alert['critical_upper_threshold']
    #     if not upper_threshold_exists:
    #         upper_threshold = upper_critical_threshold
    #         upper_threshold_exists = True
    #     # IF CONFIG HAS A CRITICAL THRESHOLD SET AND WE PASS THAT THEN THAT IS
    #     # OUR THRESHOLD FOR ALERTING
    #     if maxvalue >= alert['critical_upper_threshold']:
    #         upper_threshold = upper_critical_threshold
    #         upper_critical_threshold_breached = True
    #         is_critical_alarm = True
    # upper_threshold_breached = (upper_warning_threshold_breached
    #                             or upper_critical_threshold_breached)

    # # LOWER
    # lower_threshold_exists = False
    # lower_warning_threshold_breached = False
    # lower_critical_threshold_breached = False
    # if 'warning_lower_threshold' in alert:
    #     lower_threshold_exists = True
    #     lower_warning_threshold = alert['warning_lower_threshold']
    #     lower_threshold = lower_warning_threshold
    #     if minvalue <= lower_warning_threshold:
    #         lower_warning_threshold_breached = True
    #         is_warning_alarm = True
    # if 'critical_lower_threshold' in alert:
    #     lower_critical_threshold = alert['critical_lower_threshold']
    #     if not lower_threshold_exists:
    #         lower_threshold = lower_critical_threshold
    #         lower_threshold_exists = True
    #     # IF CONFIG HAS A CRITICAL THRESHOLD SET AND WE PASS THAT THEN THAT IS
    #     # OUR THRESHOLD FOR ALERTING
    #     if minvalue <= lower_critical_threshold:
    #         lower_threshold = lower_critical_threshold
    #         lower_critical_threshold_breached = True
    #         is_critical_alarm = True
    # lower_threshold_breached = (lower_warning_threshold_breached or
    #                             lower_critical_threshold_breached)

    # # THIS HAS TO MEAN THERE IS A PROBLEM WITH THE ALERT CONFIG
    # if lower_threshold is None and upper_threshold is None:
    #     logger.debug(
    #         "ERROR: alert {} does not have any thresholds set on {}".format(
    #             alert['id'], tag))

    # # ON TO OCCURRENCES
    # if 'occurrences_threshold' in alert:
    #     occurrences_threshold = alert['occurrences_threshold']
    # else:
    #     occurrences_threshold = 1

    # alert_entity = "Metric: {} for {}".format(alert['id'], tag)

    # if 'url' not in alert:
    #     alert['url'] = os.environ['AOM_GRAFANA_URL'] + str(alert['id'])

    # ====================
    # PREPARE ALERT BODY STRING AND SET THE VALUE WE WILL USE TO ALERT WITH
    # ====================
    # alert_body = ''
    # if upper_threshold_breached:
    #     alert_body = "{}\n{:.2f} >= {}\n{}".format(
    #         alert_entity, value, upper_threshold, alert['url'])
    # if lower_threshold_breached:
    #     value = minvalue
    #     alert_body = "{}\n{:.2f} <= {}\n{}".format(
    #         alert_entity, value, lower_threshold, alert['url'])

    # SEND SOME STATS OUT AT THIS POINT AS WE KNOW WHERE WE ARE NOW. SEND THE
    # THRESHOLDS TOO SO THEY CAN BE GRAPHED
    ### BREEL TODO ###
    # if result is not None:
    #     send_metrics(alert, value, result)
    # if 'critical_upper_threshold' in alert:
    #     send_stat('upper_critical_threshold', upper_critical_threshold,
    #               {'id': alert['id']})
    # if 'warning_upper_threshold' in alert:
    #     send_stat('upper_warning_threshold', upper_warning_threshold,
    #               {'id': alert['id']})
    # if 'critical_lower_threshold' in alert:
    #     send_stat('lower_critical_threshold', lower_critical_threshold,
    #               {'id': alert['id']})
    # if 'warning_lower_threshold' in alert:
    #     send_stat('lower_warning_threshold', lower_warning_threshold,
    #               {'id': alert['id']})

    # ====================
    # APPLY OUR LOGIC TO MAKE SOME DECISIONS
    # ====================
    # current_alert_status = alert_status[0]
    # if not lower_threshold_breached and not upper_threshold_breached:
    #     # if result is not None:
    #     #     if lower_threshold_exists and not upper_threshold_exists:
    #     #         alert_body = "{}\n{:.2f} > {}\n{}".format(
    #     #             alert_entity, value, lower_threshold, alert['url'])
    #     #         logger.debug("GOOD: alert {} is higher than lower threshold {} "
    #     #                      "for value {} on tag {}".format(
    #     #                          alert['id'], lower_threshold, value, tag))
    #     #     if upper_threshold_exists and not lower_threshold_exists:
    #     #         alert_body = "{}\n{:.2f} < {}\n{}".format(
    #     #             alert_entity, value, upper_threshold, alert['url'])
    #     #         logger.debug("GOOD: alert {} is below the upper threshold {} "
    #     #                      "for value {} on tag {}".format(
    #     #                          alert['id'], upper_threshold, value, tag))
    #     #     if upper_threshold_exists and lower_threshold_exists:
    #     #         alert_body = "{}\n{} < {:.2f} < {}\n{}".format(
    #     #             alert_entity, lower_threshold, value, upper_threshold,
    #     #             alert['url'])
    #     #         logger.debug("GOOD: alert {} is between thresholds {} and {} "
    #     #                      "for value {} on tag {}".format(
    #     #                          alert['id'], upper_threshold, lower_threshold,
    #     #                          value, tag))
    #     # CHECK AND SEE IF TAG LOGIC IS SET, IE. WE WERE PREVIOUSLY IN ALARM
    #     # STATE
    #     # if alert['alert_tags'][tag] > 0:
    #     #     if result is not None:
    #     #         send_metrics(alert, 1, result, current_alert_status)
    #     #         logger.info(
    #     #             "TestInfo: RECOVERY: Clearing values for [{}] - {}".format(
    #     #                 alert['id'], tag))
    #     #     if result is None:
    #     #         alert_body = ("{} RECOVERY due to no results found from "
    #     #                       "KairosDB query. Recommend you manually validate "
    #     #                       "recovery.\n{}").format(
    #     #                           alert_entity, alert['url'])
    #     #     alert['alert_tags'][tag] = 0
    #     #     alert['alert_tags'][tag_count] = 0
    #     #     if availability:
    #     #         logger.info("Sending availability stat 1")
    #     #         send_metrics(alert, 1, result, 'service_level')
    #     # else:
    #     #     # WE RETURN NONE IF NO ALERT (EITHER RECOVERY OR WARNING OR
    #     #     # CRITICAL) NEEDS TO BE FIRED
    #     #     alert['alert_tags'][tag_count] = 0
    #     #     if availability:
    #     #         logger.info("Sending availability stat 1")
    #     #         send_metrics(alert, 1, result, 'service_level')
    #     #     return None
    # else:
    ### BREEL WORKING HERE ###
    # ====================
    # SET KEY / VALUE FOR TAG ON ALERT
    # 0 == No Alert
    # 1 == Warning
    # 2 == Existing Warning Alert
    # 3 == New Critical
    # 4+ == Existing Critical Alert
    # ====================
    # CHECK IF TAG_COUNT HAS BEEN SET, IF NOT SET IT, IF SO INCREMENT IT
    # alert['alert_tags'][tag_count] += 1

    # ALERT WON'T FIRE UNLESS THE TAG_COUNT IS MORE THAN THE OCCURRENCES,
    # THAT BEING EITHER 1 OR WHATEVER WAS SET. ALERT HAS EXCEEDED
    # OCCURRENCES SO RETURN IT
    # TODO this doesn't belong in Alert.py
    # if alert['alert_tags'][tag_count] >= occurrences_threshold:
    #     # >= 4 MEANS THIS IS A KNOWN CRITICAL, SO NO-OP
    #     if alert['alert_tags'][tag] < 4:
    #         if is_warning_alarm and not is_critical_alarm:
    #             # THIS HANDLES GOING STRAIGHT FROM NORMAL TO WARNING LEVEL
    #             if alert['alert_tags'][tag] == 0:
    #                 # NEW WARNING
    #                 alert['alert_tags'][tag] = 1
    #                 logger.info("TestInfo: WARNING (NEW): {} - {}".format(
    #                     alert['id'], tag))
    #             else:
    #                 # EXISTING WARNING
    #                 alert['alert_tags'][tag] = 2
    #                 logger.info("TestInfo: WARNING (EXISTING): {} - {}".format(
    #                     alert['id'], tag))
    #         if is_critical_alarm:
    #             # THIS HANDLES GOING FROM WARNING LEVEL TO CRITICAL LEVEL
    #             if (alert['alert_tags'][tag] == 1 or
    #                     alert['alert_tags'][tag] == 2):
    #                 alert['alert_tags'][tag] = 3
    #                 logger.info("TestInfo: CRITICAL (WAS WARNING): {} - {}".format(
    #                     alert['id'], tag))
    #             else:
    #                 # THIS HANDLES GOING STRAIGHT FROM NORMAL TO CRITICAL
    #                 # LEVEL
    #                 if alert['alert_tags'][tag] < 3:
    #                     # NEW CRITICAL
    #                     alert['alert_tags'][tag] = 3
    #                     logger.info("TestInfo: CRITICAL (NEW): {} - {}".format(
    #                         alert['id'], tag))
    #                 else:
    #                     # EXISTING CRITICAL
    #                     alert['alert_tags'][tag] = 4
    #                     logger.info("TestInfo: CRITICAL (EXISTING): {} - {}".format(
    #                         alert['id'], tag))
    #     # RECORD THE FACT THAT SOMETHING IS STILL IN ALARM STATE IN METRICS
    #     # EVEN IF NOT ACTIVELY ALERTING ON IT
    #     # if is_critical_alarm:
    #     #     current_alert_status = alert_status[3]
    #     #     send_metrics(alert, 2, result, current_alert_status)
    #     #     if availability:
    #     #         logger.info("Sending availability stat 0")
    #     #         send_metrics(alert, 0, result, 'service_level')
    #     # if is_warning_alarm and not is_critical_alarm:
    #     #     current_alert_status = alert_status[1]
    #     #     send_metrics(alert, 1, result, current_alert_status)
    #     #     if availability:
    #     #         logger.info("Sending availability stat 1")
    #     #         send_metrics(alert, 1, result, 'service_level')
    #     logger.debug("{} alert for value {} of {} for tag {} has occurred "
    #                  "{} times. Threshold is >= {} times.".format(
    #                      current_alert_status,
    #                      value,
    #                      alert['id'],
    #                      tag,
    #                      alert['alert_tags'][tag_count],
    #                      occurrences_threshold))
    # else:
    #     # WE RETURN NONE IF NO ALERT (EITHER RECOVERY OR WARNING OR
    #     # CRITICAL) NEEDS TO BE FIRED
    #     logger.debug("Value {} of {} for tag {} has occurred {} time(s) < "
    #                  "threshold of {}".format(
    #                      value,
    #                      alert['id'],
    #                      tag,
    #                      alert['alert_tags'][tag_count],
    #                      occurrences_threshold))
    #     if availability:
    #         logger.info("Sending availability stat")
    #         send_metrics(alert, 1, result, 'service_level')
    #     return None

    # logger.debug(
    #     "Alert {}->[{}]->{}, Occurrences={}".format(
    #         alert['id'], tag, current_alert_status,
    #         alert['alert_tags'][tag_count]))
    # return alert_entity, alert_body, alert['alert_tags'][tag], alert_tags, md5(tag.encode('utf-8')).hexdigest()[:10]
def check_kairosdb_alert(
        alert_config,
        service_config,
        logger,
        production_mode=True):
    """
    Args:
        alert_config (dict): Config of the alert to run
        service_config (dict): Holds things like urls, tokens and other things
        logger (log object): does the logging
    Returns:
        None
    """
    availability = False
    # SLEEP A RANDOM TIME BETWEEN 0 AND INTERVAL SO THAT ALL ALERTS DON'T
    # START AT THE SAME TIME
    wait_time = random.randint(0, alert_config['interval'])
    logger.info(
        "ALERT_CONFIG: {}\tsleep: {}".format(
            alert_config['id'],
            wait_time))
    sleep(wait_time)
    # For metrics with availability set to true, we default the interval to 5
    # mins due to Grafana limitations
    if 'availability' in alert_config and alert_config['availability']:
        availability = True
    # ====================
    # EACH CHECK JUST LOOPS
    # ====================
    ret = None
    while True:
        try:
            send_stat("check_run", 1, {'id': alert_config['id']})
            # BUILD URL FOR KAIROSDB METRICS AND QUERY FOR RESULTS
            query_url = service_config['kairosdb_url'] + "api/v1/datapoints/query"
            ret = requests.post(
                query_url,
                data=json.dumps(
                    alert_config['query']),
                timeout=service_config['timeout'])
            assert ret.status_code == 200

            # GOT DATA BACK, NOW TO COMPARE IT TO THE THRESHOLD
            results = ret.json()['queries'][0]['results']
            logger.debug(
                "Got back {} results for alert {}".format(
                    len(results), alert_config['id']))
            log_alert_results(results, alert_config, logger)
            alert_list = []

            # LOOP THROUGH ALL THE RESULTS
            for r in results:
                alert_tags = (get_alert_tags(alert_config, r)
                              if has_custom_alert_routing(alert_config) else None)

                # OUR QUERY RETURNED SOME VALUES - FIND MIN AND MAX VALUES
                # THEREIN AND EXAMINE FOR FAILURE
                if r['values']:
                    minvalue = min([x[1] for x in r['values']])
                    maxvalue = max([x[1] for x in r['values']])
                    # SEND VALUES TO BUILD_ALERT_MESSAGE, WHICH RETURNS NONE OR
                    # AN OBJECT
                    alert_list.append(
                        build_alert_message(
                            alert_config,
                            minvalue,
                            maxvalue,
                            r,
                            logger,
                            availability,
                            alert_tags=alert_tags))

                # THIS MEANS OUR KAIROS QUERY RETURNED NOTHING. COULD BE NETWORK
                # ISSUES. WE WILL TOLERATE THIS FOR X OCCURRENCES. (X=10)
                # AFTER X OCCURRENCES OF KAIROS NOT RETURNING DATA WE WILL CLEAR
                # AOM'S BRAIN FOR THIS ALERT ID AND TAG COMBINATION TO AVOID A
                # LATER OCCURRENCE CAUSING A PREMATURE ALERT.
                # A NO-OP IF NO HISTORY.
                elif 'alert_tags' in alert_config:
                    for key in alert_config['alert_tags']:
                        if ('count' not in key and 'noresult' not in key and
                                alert_config['alert_tags'][key] > 0):
                            key_noresult = key + "_noresult"
                            key_count = key + "_count"
                            if alert_config['alert_tags'][key_noresult] > 10:
                                logger.info("{} occurrences of no results back "
                                            "for {}, clear out counts for tag '{}'".format(
                                                alert_config['alert_tags'][key_noresult],
                                                alert_config['id'], key))
                                alert_list.append(
                                    build_alert_message(
                                        alert_config,
                                        0,
                                        0,
                                        None,
                                        logger,
                                        availability,
                                        key,
                                        alert_tags=alert_tags))
                                alert_config['alert_tags'][key] = 0
                                alert_config['alert_tags'][key_count] = 0
                                alert_config['alert_tags'][key_noresult] = 0
                            else:
                                alert_config['alert_tags'][key_noresult] += 1
                                logger.info("{} occurrences of no results back "
                                            "for {}, tag '{}'".format(
                                                alert_config['alert_tags'][key_noresult],
                                                alert_config['id'], key))

            # SEND ALL ALERTS FOUND TO THE ALERT HANDLERS THAT ARE NOT NONE
            for alert in [x for x in alert_list if x is not None]:
                if production_mode:
                    send_alerts(
                        alert,
                        copy.deepcopy(alert_config),
                        service_config['victorops_url'],
                        service_config['slack_url'],
                        service_config['slack_token'],
                        service_config['smtp_server'],
                        service_config['sensu_endpoint'],
                        service_config['uchiwa_url'],
                        logger)
                else:
                    logger.info(
                        "Sending alert for: {}".format(
                            alert_config.get('id')))

        # HANDLE THE UNEXPECTED
        except TimeoutError:
            logger.error("Query [{}] took too long to run".format(
                alert_config['id']))
        except AssertionError:
            logger.error(
                "KairosDB query failed: {}\n"
                "HTTP status code:\t{}\n"
                "Error Message:\t{}\nQuery:\n"
                "{}".format(
                    ret.url,
                    ret.status_code,
                    ret.text,
                    alert_config['query']))
        except gaierror:
            logger.error(
                "Unable to connect to smtp server: {}".format(
                    service_config['smtp_server']))
        except Exception as e:
            logger.error(
                "Unhandled exception {} on alert: {}".format(
                    str(e), alert_config['id']))
        finally:
            sleep(alert_config['interval'])
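
def _example_kairosdb_query(kairosdb_url, timeout=30):
    """Sketch (hedged, not called anywhere): the shape of the request the loop
    above issues. The metric name and tag filter are placeholders; real alerts
    supply alert_config['query'] verbatim from their config files."""
    example_query = {
        "start_relative": {"value": 5, "unit": "minutes"},
        "metrics": [{
            "name": "some.metric",                    # placeholder metric name
            "tags": {"dc": ["dc1"]},                  # placeholder tag filter
            "aggregators": [{"name": "avg",
                             "sampling": {"value": 1, "unit": "minutes"}}],
        }],
    }
    ret = requests.post(kairosdb_url + "api/v1/datapoints/query",
                        data=json.dumps(example_query), timeout=timeout)
    # Each result carries 'values' ([[timestamp, value], ...]) and 'tags'
    return ret.json()['queries'][0]['results']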
def check_prometheus_alert(
        alert_config,
        service_config,
        logger,
        production_mode=True):
    """
    Args:
        alert_config (dict): Config of the alert to run
        service_config (dict): Holds things like urls, tokens and other things
        logger (log object): does the logging
    Returns:
        None
    """
    # SLEEP A RANDOM TIME BETWEEN 0 AND INTERVAL SO THAT ALL ALERTS DON'T
    # START AT THE SAME TIME
    wait_time = random.randint(0, alert_config['interval'])
    logger.info(
        "ALERT_CONFIG: {}\tsleep: {}".format(
            alert_config['id'],
            wait_time))
    sleep(wait_time)
    # For metrics with availability set to true, we default the interval to 5
    # mins due to Grafana limitations
    availability = bool(alert_config.get('availability'))

    # ====================
    # EACH CHECK JUST LOOPS
    # ====================
    ret = None
    while True:
        try:
            send_stat("check_run", 1, {'id': alert_config['id']})
            prom_api = PromAPI(endpoint=alert_config['prometheus_url'])
            ret = prom_api.query_range(
                query=alert_config['query'],
                start=alert_config['start_time'],
                end=alert_config['end_time'],
                duration=alert_config['interval'])

            assert ret['status'] == 'success'

            # GOT DATA BACK, NOW TO COMPARE IT TO THE THRESHOLD
            results = ret['data']['result']
            logger.debug(
                "Got back {} results for alert {}".format(
                    len(results), alert_config['id']))
            log_alert_results(results, alert_config, logger)
            alert_list = []

            # LOOP THROUGH ALL THE RESULTS
            for r in results:
                alert_tags = (get_alert_tags(alert_config, r) if
                              has_custom_alert_routing(alert_config) else None)

                # REARRANGE RESULT TO MORE CLOSELY MATCH KAIROSDB RESULT
                r['tags'] = {key: [value]
                             for (key, value) in r['metric'].items()}

                # OUR QUERY RETURNED SOME VALUES - FIND MIN AND MAX VALUES
                # THEREIN AND EXAMINE FOR FAILURE
                if r['values']:
                    # Prometheus returns values as strings; convert before
                    # comparing so '9' does not sort above '10'
                    raw_values = [float(value) for _, value in r['values']]
                    min_value = min(raw_values)
                    max_value = max(raw_values)
                    # SEND VALUES TO BUILD_ALERT_MESSAGE, WHICH RETURNS NONE OR
                    # AN OBJECT
                    alert_list.append(
                        build_alert_message(
                            alert_config,
                            min_value,
                            max_value,
                            r,
                            logger,
                            availability,
                            alert_tags=alert_tags))

                # THIS MEANS OUR QUERY RETURNED NOTHING. COULD BE NETWORK ISSUES
                # WE WILL TOLERATE THIS FOR X OCCURRENCES. (X=10)
                # AFTER X OCCURRENCES OF NOT RETURNING DATA WE WILL CLEAR AOM'S
                # BRAIN FOR THIS ALERT ID AND TAG COMBINATION TO AVOID A LATER
                # OCCURRENCE CAUSING A PREMATURE ALERT. A NO-OP IF NO HISTORY.
                elif 'alert_tags' in alert_config:
                    for key in alert_config['alert_tags']:
                        if ('count' not in key and 'noresult' not in key and
                                alert_config['alert_tags'][key] > 0):
                            key_noresult = key + "_noresult"
                            key_count = key + "_count"
                            if alert_config['alert_tags'][key_noresult] > 10:
                                logger.info("{} occurrences of no results back "
                                            "for {}, clear out counts for tag '{}'".format(
                                                alert_config['alert_tags'][key_noresult],
                                                alert_config['id'], key))
                                alert_list.append(
                                    build_alert_message(
                                        alert_config,
                                        0,
                                        0,
                                        None,
                                        logger,
                                        availability,
                                        key,
                                        alert_tags=alert_tags))
                                alert_config['alert_tags'][key] = 0
                                alert_config['alert_tags'][key_count] = 0
                                alert_config['alert_tags'][key_noresult] = 0
                            else:
                                alert_config['alert_tags'][key_noresult] += 1
                                logger.info("{} occurrences of no results back "
                                            "for {}, tag '{}'".format(
                                                alert_config['alert_tags'][key_noresult],
                                                alert_config['id'], key))

            # SEND ALL ALERTS FOUND TO THE ALERT HANDLERS THAT ARE NOT NONE
            for alert in [x for x in alert_list if x is not None]:
                if production_mode:
                    send_alerts(
                        alert,
                        copy.deepcopy(alert_config),
                        service_config['victorops_url'],
                        service_config['slack_url'],
                        service_config['slack_token'],
                        service_config['smtp_server'],
                        service_config['sensu_endpoint'],
                        service_config['uchiwa_url'],
                        logger)
                else:
                    logger.info(
                        "Sending alert {}".format(
                            alert_config.get('id')))

        # HANDLE THE UNEXPECTED
        except TimeoutError:
            logger.error(
                "Query [{}] took too long to run".format(
                    alert_config['id']))
        except AssertionError:
            logger.error(
                "Prometheus query failed:\n"
                "Status:\t{}\n"
                "Error Type:\t{}\n"
                "Error Message:\t{}\n"
                "Query:\n{}".format(
                    ret['status'],
                    ret['errorType'],
                    ret['error'],
                    alert_config['query']))
        except gaierror:
            logger.error(
                "Unable to connect to smtp server: {}".format(
                    service_config['smtp_server']))
        except Exception as e:
            logger.error(
                "Unhandled exception {} on alert: {}".format(
                    str(e), alert_config['id']))
        finally:
            sleep(alert_config['interval'])
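
def _example_prometheus_result_reshape():
    """Sketch (hedged, not called anywhere): demonstrates the reshape done in
    the loop above, which maps a Prometheus result's flat 'metric' labels into
    the {tag: [value]} layout that KairosDB results use."""
    r = {'metric': {'dc': 'dc1', 'host': 'web-01'},        # sample labels
         'values': [[1500000000, '0.5'], [1500000060, '0.9']]}
    r['tags'] = {key: [value] for (key, value) in r['metric'].items()}
    assert r['tags'] == {'dc': ['dc1'], 'host': ['web-01']}
    raw_values = [float(value) for _, value in r['values']]
    return min(raw_values), max(raw_values)                # (0.5, 0.9)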
# LOG ALERT RESULTS SO WE CAN DEBUG IF NEEDED
def log_alert_results(results, alert_config, logger):
    """
    Logs the results, broken out by tag provided in the alert_config, to the
    logger for debugging

    Args:
        results: the results object returned from the call to kairosdb (just
            the 'results' list)
        alert_config: config object of the alert
        logger (log object): does the logging
    Returns:
        None, logs to logger
    """
    for v in results:
        logger.debug("{} - Result: {}".format(alert_config['id'], v))
def send_alerts(
        alert,
        alert_config,
        victorops_url,
        slack_url,
        slack_token,
        smtp_server,
        sensu_endpoint,
        uchiwa_url,
        logger):
    """
    Sends out the alerts to VO, Email, and/or Slack

    Args:
        alert: the alert tuple:
            alert[0] == subject, alert[1] == body, alert[2] == status code,
            alert[3] == alert_tags, alert[4] == md5sum
        alert_config: the alert configuration object
        victorops_url: url to victorops
        slack_url: url to slack api calls
        slack_token: the token for the alert
        smtp_server: the server to send mail messages to
        sensu_endpoint: url of the Sensu results API
        uchiwa_url: url of the Uchiwa dashboard
        logger (log object): does the logging
    Returns: None
    """
    # GOING TO USE THIS FOR TAGGING SOME METRICS ABOUT WHAT ALERT CHANNEL WAS
    # USED
    tag_dict = dict()
    tag_dict['alert'] = alert_config['id']

    is_custom_alert_routing = has_custom_alert_routing(alert_config)
    if is_custom_alert_routing:
        alert_routing = alert_config.get('alert_routing_lookup', {})
        alert_config['alerts'] = alert_routing.get(
            alert[3], alert_config['alerts']['lookup']['default'])

    # once we move all alerts into sensu, we don't need to do this
    if 'filters' in alert_config:
        logger.info(
            "alert_status : {}, alert_config: {}".format(
                alert[2], alert_config))
        if 'slack_subdue' in alert_config['filters'] and alert[2] in (
                1, 2) and alert_config['filters']['slack_subdue']:
            # unless the alert is critical we don't send it
            logger.info("Removed slack, alert_config: {}".format(alert_config))
            alert_config['alerts'].pop('slack', None)
        if ('victorops_subdue' in alert_config['filters'] and
                alert[2] in (1, 2) and
                alert_config['filters']['victorops_subdue']):
            # unless the alert is critical we don't send it
            alert_config['alerts'].pop('vo', None)
            logger.info("Removed vo, alert_config: {}".format(alert_config))

    # ====================
    # VICTOROPS HANDLING
    # ====================
    if 'vo' in alert_config['alerts']:
        for notify in alert_config['alerts']['vo']:
            payload = dict(entity_id=alert[0],
                           message_type=alert_status[alert[2]],
                           state_message=alert[1])
            r = None
            try:
                r = requests.post(
                    victorops_url + notify,
                    data=json.dumps(payload),
                    headers={
                        "Content-Type": "application/json"})
                assert r.status_code == 200
                # Record a VO alert sent event
                tag_dict['alert_channel_type'] = "VictorOps"
                tag_dict['who'] = "vo:{}".format(notify)
                send_stat("alert_channel", 1, tag_dict)
                # logger.info("TestInfo: {} alert for {}".format(alert_status[alert[2]], alert[0]))
            except AssertionError:
                logger.error(
                    "Post to VO failed for {}\n{}:\t{}".format(
                        alert_config['id'], r.status_code, r.text))
            except Exception as e:
                logger.error("Unhandled exception for alert_id:{} "
                             "when posting to VO: {}".format(
                                 alert_config['id'], str(e)))

    # ====================
    # EMAIL HANDLING
    # ====================
    if 'email' in alert_config['alerts'] and (
            alert[2] == 0 or alert[2] == 1 or alert[2] == 3):
        msg = MIMEText(alert[1])
        msg['Subject'] = '{} Status: {}'.format(
            alert[0], alert_status[alert[2]])
        msg['From'] = 'aom@qualtrics.com'
        msg['To'] = ','.join(
            [x + "@qualtrics.com" for x in alert_config['alerts']['email']])
        try:
            s = smtplib.SMTP(smtp_server)
            s.send_message(msg)
            s.quit()
            # Record an Email alert sent event
            tag_dict['alert_channel_type'] = "Email"
            tag_dict['who'] = "email:{}".format(msg['To'])
            send_stat("alert_channel", 1, tag_dict)
            # logger.info("TestInfo: {} alert for {}".format(alert_status[alert[2]], alert[0]))
        except Exception as e:
            logger.error(
                "Unhandled exception when sending mail for {} to {}\n{}".format(
                    alert_config['id'], smtp_server, str(e)))

    # ====================
    # SENSU HANDLING
    # ====================
    if 'sensu' in alert_config['alerts']:
        # Dictionary with static values for Sensu
        sensu_dict = {
            'source': 'AOM',
            'refresh': 3600,
            'occurrences': 1,
            'name': alert_config['id']+'__'+alert[4]}
        # if alert[3]:
        #     logger.info(alert)
        #     sensu_dict['name'] = '_'.join(
        #         [alert_config['id']] + sorted(list(alert[3])))
        if 'refresh' in alert_config:
            sensu_dict['refresh'] = alert_config['refresh']
        sensu_dict['interval'] = alert_config['interval']
        sensu_dict['handlers'] = []
        sensu_dict['dashboard'] = alert_config['url']
        if 'dependencies' in alert_config['alerts']['sensu'].keys():
            sensu_dict['dependencies'] = (alert_config['alerts']
                                          ['sensu']['dependencies'])
        if 'victorops' in alert_config['alerts']['sensu'].keys():
            sensu_dict['handlers'].append("victorops")
            sensu_dict['routing_key'] = (alert_config['alerts']
                                         ['sensu']['victorops'])
        # # Leave this here until we have email support in Sensu
        # if 'email' in alert_config['alerts']['sensu'].keys():
        #     sensu_dict['handlers'].append("email")
        #     # verify this option
        #     sensu_dict['email'] = alert_config['alerts']['sensu']['email']
        if 'slack' in alert_config['alerts']['sensu'].keys():
            sensu_dict['handlers'].append("slack")
            sensu_dict['slack_channel'] = (
                alert_config['alerts']['sensu']['slack'])
            # Format alert message
            sensu_dict['dashboard'] = (
                "<{}|here> , Uchiwa: <{}?check={}|here> ".format(
                    alert_config['url'], uchiwa_url, alert_config['id']))
        if 'jira' in alert_config['alerts']['sensu'].keys():
            sensu_dict['handlers'].append("jira")
            sensu_dict.update(alert_config['alerts']['sensu']['jira'])
        if 'filters' in alert_config:
            sensu_dict['filters'] = alert_config['filters']
        # 0 = OK, 1 = WARNING, 2 = CRITICAL
        sensu_status = {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2}
        sensu_dict['status'] = sensu_status[alert[2]]
        sensu_dict['output'] = alert[1]

        r = None
        try:
            user = os.environ['API_USER']
            passwd = os.environ['API_PASS']
            r = requests.post(
                sensu_endpoint,
                json.dumps(sensu_dict),
                auth=(
                    user,
                    passwd))
            assert r.status_code == 202
        except AssertionError:
            logger.error(
                "Post to Sensu failed {}\n{}:\t{}".format(
                    alert_config['id'],
                    r.status_code,
                    r.text))
        except Exception as e:
            logger.error("Unhandled exception for alert_id:{} "
                         "when posting to Sensu: {}".format(
                             alert_config['id'], str(e)))

    # ====================
    # SLACK HANDLING - all Slack alerts will go through Sensu
    # ====================
    if 'slack' in alert_config['alerts'] and (
            alert[2] == 0 or alert[2] == 1 or alert[2] == 3):
        refresh = alert_config.get('refresh', 3600)
        dashboard = alert_config.get('url', '')
        sensu_status = {0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 2}
        sensu_dict2 = {'handlers': ['slack'],
                       'interval': alert_config['interval'],
                       'source': 'AOM',
                       'refresh': refresh,
                       'occurrences': 1,
                       'name': alert_config['id']+'__'+alert[4],
                       'dashboard': dashboard,
                       'status': sensu_status[alert[2]],
                       'output': alert[1]}
        if is_custom_alert_routing:
            sensu_dict2['name'] = '_'.join(
                [alert_config['id']] + list(alert[3]))
        sensu_dict2['dashboard'] = (
            "<{}|here> , Uchiwa: <{}?check={}|here> ".format(
                alert_config['url'], uchiwa_url, alert_config['id']))
        for channel in alert_config['alerts']['slack']:
            sensu_dict2['slack_channel'] = channel
            r = None
            try:
                user = os.environ['API_USER']
                passwd = os.environ['API_PASS']
                r = requests.post(
                    sensu_endpoint,
                    json.dumps(sensu_dict2),
                    auth=(
                        user,
                        passwd))
                assert r.status_code == 202
            except AssertionError:
                logger.error(
                    "Post to Sensu failed {}\n{}:\t{}".format(
                        alert_config['id'], r.status_code, r.text))
            except Exception as e:
                logger.error("Unhandled exception for alert_id:{} when posting "
                             "to Sensu: {}".format(alert_config['id'], str(e)))

        # payload = dict(token=slack_token, channel=channel,
        #                text="{} Status: {}".format(alert[1], alert_status[alert[2]]))
        # r = None
        # try:
        #     r = requests.post(slack_url, data=payload)
        #     assert r.status_code == 200
        #     # Record a Slack alert sent event
        #     tag_dict['alert_channel_type'] = "Slack"
        #     tag_dict['who'] = "slack:{}".format(channel)
        #     send_stat("alert_channel", 1, tag_dict)
        #     # logger.info("TestInfo: {} alert for {}".format(alert_status[alert[2]], alert[0]))
        # except AssertionError:
        #     logger.error("Post to Slack failed for {}\n{}:\t{}".format(alert_config['id'], r.status_code, r.text))
        # except Exception as e:
        #     logger.error("Unhandled exception for alert_id:{} when posting to Slack: {}".format(alert_config['id'],
        #                                                                                         str(e)))
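
def _example_sensu_check_result():
    """Sketch (hedged, not called anywhere): a minimal check-result payload of
    the shape send_alerts() POSTs to the Sensu results endpoint. All field
    values are placeholders."""
    return {
        'source': 'AOM',
        'name': 'my_alert__0123456789',    # alert id + 10-char md5 of the tags
        'status': 2,                       # 0 = OK, 1 = WARNING, 2 = CRITICAL
        'output': 'Metric: my_alert for dc1\n42.00 >= 10\nhttp://grafana/d/1',
        'interval': 60,
        'refresh': 3600,
        'occurrences': 1,
        'handlers': ['victorops', 'slack'],
        'routing_key': 'my-team',          # consumed by the victorops handler
        'slack_channel': '#my-team-alerts',
    }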
def send_metrics(alert, value, result, gaugename='stats'):
    """
    Sends the results from the alert check to statsd

    Args:
        alert: the alert config object that holds the alert['tags'] value
        value: the value we want to send as a gauge
        result: the result object from making the call; the data in this
            object tags the metric
        gaugename: the name of the gauge metric we send
    Returns: None
    """
    # GROUP ALL THE ALERTS TOGETHER SO THAT PEEPS CAN FILTER OUT BY TAG THEIR
    # SPECIFIC ALERTS
    result_tags = list(itertools.chain(
        *[result['tags'][x] for x in alert['tags']]))
    tag_dict = dict()
    for x in range(len(alert['tags'])):
        tag_dict[alert['tags'][x]] = result_tags[x]
    tag_dict['alert'] = alert['id']

    # SEND THE METRIC
    send_stat(gaugename, value, tag_dict)


def send_stat(gaugename, value, tag_dict, statprefix='aom'):
    """Sends a stats value to statsd"""
    client = StatsClient('telegraf', 8125, statprefix)

    # SUBMIT STATS
    client.gauge(gaugename, value, tags=tag_dict)
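
def _example_send_stat():
    """Sketch (hedged, not called anywhere): how callers use send_stat(). Each
    gauge lands in statsd/telegraf as 'aom.<gaugename>' with the given tags;
    tag support assumes a telegraf-flavoured statsd client."""
    send_stat("check_run", 1, {'id': 'my_alert'})
    send_stat("upper_critical_threshold", 10, {'id': 'my_alert'})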
def has_custom_alert_routing(alert_config):
    """Checks if the alert has custom routing"""
    return 'lookup' in alert_config['alerts']


def get_alert_tags(alert_config, query_result):
    """Retrieves custom routing tags from a query result"""
    query_tags = []
    for tag in alert_config['alerts']['lookup']['tags']:
        if (alert_config.get('query_type') == 'prometheus' and
                'metric' in query_result and
                tag in query_result['metric']):
            query_tags.append(query_result['metric'][tag])
        elif ('tags' in query_result and tag in query_result['tags']
                and query_result['tags'][tag]):
            query_tags.append(query_result['tags'][tag][0])
    return tuple(query_tags)
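
def _example_custom_alert_routing():
    """Sketch (hedged, not called anywhere): an alert config using the 'lookup'
    routing that the two helpers above support. Results whose tag tuple matches
    a key in alert_routing_lookup get that key's channels; everything else
    falls back to the 'default' entry."""
    alert_config = {
        'alerts': {
            'lookup': {
                'tags': ['dc'],                         # tags that form the key
                'default': {'slack': ['#catch-all']},
            },
        },
    }
    query_result = {'tags': {'dc': ['dc1']}}            # KairosDB-shaped result
    assert has_custom_alert_routing(alert_config)
    assert get_alert_tags(alert_config, query_result) == ('dc1',)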
123
AoM_Service/library/serviceapp/test_alert.py
Executable file
@@ -0,0 +1,123 @@
import unittest


class Mock_Alert_Config() :
    def __init__(self) :
        self.cache = {}
        self.level = {}
        self.id = "id"

    def set_level(self, k, v) :
        self.level[k] = v

    def get_level(self, k) :
        if not k in self.level :
            return None
        return self.level[k]

    def init_for_tags(self, *args) :
        pass

    def occurrences(self) :
        return 1

    def get_threshold(self, isUpper, isWarning) :
        # keyword names match the calls made by alert.Alert
        if isWarning :
            return None, False
        if isUpper :
            return 10, True
        else :
            return 0, True

    def get_tags(self) :
        return "tagsC, tagsD".split(", ")

    def set_for_tags(self, key, value) :
        if not key in self.cache :
            self.cache[key] = 0
        self.cache[key] = value

    def get_for_tags(self, key) :
        if not key in self.cache :
            self.cache[key] = 0
        return self.cache[key]


class Mock_Result() :
    def __init__(self) :
        pass

    def __getitem__(self, key) :
        if key == "tags" :
            return self
        else :
            return key


class Mock_Logger() :
    def __init__(self) :
        for k in ["error", "warn", "debug", "info", "warning"] :
            setattr(self, k, self.log)

    def log(self, *args) :
        pass


class Test_Alert(unittest.TestCase) :
    def test_set_tags(self) :
        import alert
        ac = Mock_Alert_Config()
        res = Mock_Result()

        al = alert.Alert(ac, Mock_Logger(), None, None, -1, 11)
        self.assertEqual(al.get_tags(), "instance")

        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, -1, 11)
        self.assertEqual(al.get_tags(), "tagsA, tagsB")

        al.set_tags("a, b, c")
        self.assertEqual(al.get_tags(), "a, b, c")

        al.set_tags("a, b, c")
        self.assertEqual(al.get_tags(), "a, b, c")

    def test_firing(self) :
        import alert
        ac = Mock_Alert_Config()
        res = Mock_Result()
        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, -1, 11)
        self.assertTrue(al.get_firing())
        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, 1, 11)
        self.assertTrue(al.get_firing())
        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, -1, 9)
        self.assertTrue(al.get_firing())
        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, 1, 9)
        self.assertFalse(al.get_firing())

    def test_str(self) :
        import alert
        ac = Mock_Alert_Config()
        res = Mock_Result()
        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, 0, 10)

        self.assertEqual(al.name(), "Metric: id for tagsA, tagsB")
        body, _ = al.body()
        self.assertEqual(body, "")

    def test_occurrences(self) :
        import alert
        ac = Mock_Alert_Config()
        res = Mock_Result()
        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, 0, 10)
        self.assertEqual(False, al.occurrences_breached)
        al.set_occurrences()
        al.set_occurrences()
        al.set_occurrences()
        self.assertEqual(False, al.occurrences_breached)
        self.assertEqual(0, ac.get_for_tags(al.get_tags()))

        al = alert.Alert(ac, Mock_Logger(), "tagsA, tagsB", res, 0, 11)
        self.assertEqual(True, al.occurrences_breached)
        al.set_occurrences()
        al.set_occurrences()
        al.set_occurrences()
        self.assertEqual(True, al.occurrences_breached)
        self.assertEqual(4, ac.get_for_tags(al.get_tags()))


if __name__ == "__main__" :
    unittest.main()
33
AoM_Service/library/serviceapp/test_alert_factory.py
Executable file
@@ -0,0 +1,33 @@
import unittest
import alert_factory


class Mock_Alert() :
    def __init__(self, *args) :
        self.args = args


class Mock_Logger() :
    def __init__(self) :
        self.info = self.log
        self.warn = self.log
        self.warning = self.log
        self.error = self.log
        self.debug = self.log

    def log(self, *args, **kwargs) :
        print(args, kwargs)


class Test_Alert_Factory(unittest.TestCase) :
    def setUp(self) :
        self.was = alert_factory.Alert
        alert_factory.Alert = Mock_Alert

    def tearDown(self) :
        alert_factory.Alert = self.was

    def test(self) :
        af = alert_factory.Alert_Factory(None, Mock_Logger())
        alert = af.build(0, 5, None, "tagA, tagB", False, "tagC, tagD")
        self.assertTrue(type(alert) == Mock_Alert)


if __name__ == "__main__" :
    unittest.main()
8
AoM_Service/library/serviceapp/test_service.py
Executable file
@@ -0,0 +1,8 @@
import unittest


class Test_Service(unittest.TestCase) :
    def test(self) :
        raise Exception("not impl")


if __name__ == "__main__" :
    unittest.main()
14
AoM_Service/library/serviceapp/test_threshold.py
Executable file
@@ -0,0 +1,14 @@
import unittest


class Test_Threshold(unittest.TestCase) :
    def test(self) :
        import threshold
        tl = threshold.Threshold(5)

        self.assertFalse(tl.can_breach())

        self.assertFalse(tl.exceeds(7))
        self.assertFalse(tl.exceeds(3))


if __name__ == "__main__" :
    unittest.main()
14
AoM_Service/library/serviceapp/test_threshold_lower.py
Executable file
@@ -0,0 +1,14 @@
import unittest


class Test_Threshold_Lower(unittest.TestCase) :
    def test(self) :
        import threshold_lower
        tl = threshold_lower.Threshold_Lower(5)

        self.assertTrue(tl.can_breach())

        self.assertTrue(tl.exceeds(3))
        self.assertFalse(tl.exceeds(7))


if __name__ == "__main__" :
    unittest.main()
14
AoM_Service/library/serviceapp/test_threshold_upper.py
Executable file
@@ -0,0 +1,14 @@
import unittest


class Test_Threshold_Upper(unittest.TestCase) :
    def test(self) :
        import threshold_upper
        tl = threshold_upper.Threshold_Upper(5)

        self.assertTrue(tl.can_breach())

        self.assertTrue(tl.exceeds(7))
        self.assertFalse(tl.exceeds(3))


if __name__ == "__main__" :
    unittest.main()
157
AoM_Service/library/serviceapp/test_thresholds.py
Executable file
@@ -0,0 +1,157 @@
import unittest


class Mock_Alert_Config() :
    def __init__(self) :
        self.upCrit = 10
        self.lowCrit = 1

    def get_threshold(self, upper, warn) :
        if upper and warn :
            return None, False
        elif upper and not warn :
            return self.upCrit, True
        elif not upper and warn :
            return None, False
        else :
            return self.lowCrit, True


class Test_Thresholds(unittest.TestCase) :
    def test_breached_both(self) :
        import thresholds
        alert_config = Mock_Alert_Config()
        t = thresholds.Thresholds(alert_config)
        t.set_breached(alert_config.lowCrit-1, alert_config.upCrit+1)

        should_fire = [
            t.critical_breached(),
            t.lower_breached(),
            t.upper_breached(),

            t.level_breached(t.CRITICAL),
            t.end_breached(t.LOWER),
            t.end_breached(t.UPPER),

            t.get_breached(),
            t.get_breached(level=t.CRITICAL),
            t.get_breached(end=t.LOWER),
            t.get_breached(end=t.UPPER),
        ]
        for i in range(len(should_fire)) :
            self.assertTrue(should_fire[i], i)

        should_not_fire = [
            t.warning_breached(),

            t.level_breached(t.WARNING),

            t.get_breached(level=t.WARNING),
        ]
        for i in range(len(should_not_fire)) :
            self.assertFalse(should_not_fire[i], i)

    def test_breached_lower(self) :
        import thresholds
        alert_config = Mock_Alert_Config()
        t = thresholds.Thresholds(alert_config)
        t.set_breached(alert_config.lowCrit-1, alert_config.upCrit)

        should_fire = [
            t.critical_breached(),
            t.lower_breached(),

            t.level_breached(t.CRITICAL),
            t.end_breached(t.LOWER),

            t.get_breached(),
            t.get_breached(level=t.CRITICAL),
            t.get_breached(end=t.LOWER),
        ]
        for i in range(len(should_fire)) :
            self.assertTrue(should_fire[i], i)

        should_not_fire = [
            t.warning_breached(),
            t.upper_breached(),

            t.level_breached(t.WARNING),
            t.end_breached(t.UPPER),

            t.get_breached(level=t.WARNING),
            t.get_breached(end=t.UPPER),
        ]
        for i in range(len(should_not_fire)) :
            self.assertFalse(should_not_fire[i], i)

    def test_breached_upper(self) :
        import thresholds
        alert_config = Mock_Alert_Config()
        t = thresholds.Thresholds(alert_config)
        t.set_breached(alert_config.lowCrit, alert_config.upCrit+1)

        should_fire = [
            t.critical_breached(),
            t.upper_breached(),

            t.level_breached(t.CRITICAL),
            t.end_breached(t.UPPER),

            t.get_breached(),
            t.get_breached(level=t.CRITICAL),
            t.get_breached(end=t.UPPER),
        ]
        for i in range(len(should_fire)) :
            self.assertTrue(should_fire[i], i)

        for i in [
            t.warning_breached(),
            t.lower_breached(),

            t.level_breached(t.WARNING),
            t.end_breached(t.LOWER),

            t.get_breached(level=t.WARNING),
            t.get_breached(end=t.LOWER),
        ] :
            self.assertFalse(i)

    def test_breached_notset(self) :
        import thresholds
        alert_config = Mock_Alert_Config()
        t = thresholds.Thresholds(alert_config)

        for i in [
            t.warning_breached(),
            t.critical_breached(),
            t.upper_breached(),
            t.lower_breached(),

            t.level_breached(t.CRITICAL),
            t.level_breached(t.WARNING),
            t.end_breached(t.UPPER),
            t.end_breached(t.LOWER),

            t.get_breached(),
            t.get_breached(level=t.CRITICAL),
            t.get_breached(level=t.WARNING),
            t.get_breached(end=t.UPPER),
            t.get_breached(end=t.LOWER),
        ] :
            self.assertFalse(i)

    def test_get_matching(self) :
        import thresholds
        alert_config = Mock_Alert_Config()
        t = thresholds.Thresholds(alert_config)
        self.assertEqual(4, len([i for i in t.get_thresholds_matching()]))
        self.assertEqual(2, len([i for i in t.get_thresholds_matching(level=t.CRITICAL)]))
        self.assertEqual(2, len([i for i in t.get_thresholds_matching(level=t.WARNING)]))
        self.assertEqual(2, len([i for i in t.get_thresholds_matching(end=t.UPPER)]))
        self.assertEqual(2, len([i for i in t.get_thresholds_matching(end=t.LOWER)]))
        self.assertEqual(1, len([i for i in t.get_thresholds_matching(level=t.CRITICAL, end=t.LOWER)]))
|
||||||
|
self.assertEqual(1, len([i for i in t.get_thresholds_matching(end=t.CRITICAL, level=t.UPPER)]))
|
||||||
|
self.assertEqual(1, len([i for i in t.get_thresholds_matching(end=t.WARNING, level=t.LOWER)]))
|
||||||
|
self.assertEqual(1, len([i for i in t.get_thresholds_matching(end=t.WARNING, level=t.UPPER)]))
|
||||||
|
|
||||||
|
if __name__ == "__main__" :
|
||||||
|
unittest.main()
|
||||||
19
AoM_Service/library/serviceapp/threshold.py
Executable file
@@ -0,0 +1,19 @@
# Null-object base class: a threshold slot with no configured value can
# never breach, so callers need no None checks.
class Threshold() :
    def __init__(self, threshold) :
        self.threshold = threshold
        self.breached = False

    def can_breach(self) :
        return False

    def set_breached(self, value) :
        self.breached = self.exceeds(value)

    def get_breached(self) :
        return self.breached

    def exceeds(self, value) :
        return False

    def get_threshold(self) :
        return self.threshold
8
AoM_Service/library/serviceapp/threshold_lower.py
Executable file
@@ -0,0 +1,8 @@
from threshold import Threshold

class Threshold_Lower(Threshold) :
    def exceeds(self, value) :
        return self.threshold > value

    def can_breach(self) :
        return True
8
AoM_Service/library/serviceapp/threshold_upper.py
Executable file
@@ -0,0 +1,8 @@
from threshold import Threshold

class Threshold_Upper(Threshold) :
    def exceeds(self, value) :
        return self.threshold < value

    def can_breach(self) :
        return True
67
AoM_Service/library/serviceapp/thresholds.py
Executable file
@@ -0,0 +1,67 @@
from threshold_upper import Threshold_Upper
from threshold_lower import Threshold_Lower
from threshold import Threshold

class Thresholds() :
    WARNING = True
    CRITICAL = False
    UPPER = True
    LOWER = False

    def __init__(self, alert_config) :
        self.alert_config = alert_config
        self.thresholds = {}
        for level in [ Thresholds.WARNING, Thresholds.CRITICAL ] :
            self.thresholds[level] = {}
            for end in [ Thresholds.UPPER, Thresholds.LOWER ] :
                constructor = Threshold_Upper
                if end == Thresholds.LOWER :
                    constructor = Threshold_Lower
                self.thresholds[level][end] = self.create_threshold(end, level, constructor)

    def create_threshold(self, isUpper, isWarning, constructor) :
        value, has = self.alert_config.get_threshold(isUpper, isWarning)
        if not has :
            # fall back to the null-object base class for unconfigured slots
            constructor = Threshold
        return constructor(value)

    def warning_breached(self) :
        return self.level_breached(Thresholds.WARNING)

    def critical_breached(self) :
        return self.level_breached(Thresholds.CRITICAL)

    def upper_breached(self) :
        return self.end_breached(Thresholds.UPPER)

    def lower_breached(self) :
        return self.end_breached(Thresholds.LOWER)

    def level_breached(self, level) :
        return self.get_breached(level=level)

    def end_breached(self, end) :
        return self.get_breached(end=end)

    def can_breach(self) :
        # the original iterated self.thresholds.get_thresholds_matching(),
        # but self.thresholds is a dict; the generator lives on self
        can_breach = [t for t in self.get_thresholds_matching() if type(t) is not Threshold]
        return len(can_breach) > 0

    def get_breached(self, level=None, end=None) :
        for threshold in self.get_thresholds_matching(level=level, end=end) :
            if threshold.get_breached() :
                return True
        return False

    def set_breached(self, min_value, max_value) :
        for threshold in self.get_thresholds_matching(end=Thresholds.LOWER) :
            threshold.set_breached(min_value)
        for threshold in self.get_thresholds_matching(end=Thresholds.UPPER) :
            threshold.set_breached(max_value)

    def get_thresholds_matching(self, level=None, end=None) :
        for l in self.thresholds :
            if level is None or l == level :
                for e in self.thresholds[l] :
                    if end is None or e == end :
                        yield self.thresholds[l][e]
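For orientation, a minimal usage sketch of the Threshold family above: WARNING/CRITICAL and UPPER/LOWER are boolean keys into a 2x2 grid of threshold objects, and any slot the alert config does not define falls back to the base Threshold, which never breaches. Demo_Alert_Config is hypothetical; only the get_threshold(upper, warn) -> (value, has) contract is taken from the code, and the import assumes the serviceapp modules are on the path, as in the tests.

from thresholds import Thresholds

class Demo_Alert_Config() :
    # hypothetical config: only a critical upper threshold of 10 is set
    def get_threshold(self, upper, warn) :
        if upper and not warn :
            return 10, True
        return None, False

t = Thresholds(Demo_Alert_Config())
t.set_breached(3, 12)          # min_value=3, max_value=12; 12 exceeds 10
print(t.critical_breached())   # True
print(t.warning_breached())    # False: both warning slots are null objects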
14
AoM_Service/library/test_alert_config.py
Executable file
@@ -0,0 +1,14 @@
import unittest

class Test_Alert_Config(unittest.TestCase):
    def test(self) :
        from alert_config import Alert_Config
        try :
            Alert_Config(None)
        except Exception :
            pass
        else :
            # fail() must sit outside the try: its AssertionError would
            # otherwise be swallowed by the bare except
            self.fail("did not fail on nil yaml_config")
        self.assertEqual("a", Alert_Config({"id":"a"}).id)

if __name__ == "__main__" :
    unittest.main()
54
AoM_Service/library/test_alert_config_list.py
Executable file
@@ -0,0 +1,54 @@
import unittest
import alert_config_list

class Mock_Alert_Config() :
    def __init__(self, id) :
        self.id = id

class Test_Alert_Config_List(unittest.TestCase):
    def setUp(self) :
        self.was = alert_config_list.Alert_Config
        alert_config_list.Alert_Config = Mock_Alert_Config
        self.al = alert_config_list.Alert_Config_List()

    def tearDown(self) :
        alert_config_list.Alert_Config = self.was
        self.al = None

    def test_add(self) :
        self.al.add(Mock_Alert_Config("a"))
        self.assertEqual(len(self.al), 1)
        self.al.add([Mock_Alert_Config("a")])
        self.assertEqual(len(self.al), 1)

        self.al.add([Mock_Alert_Config("b")])
        self.assertEqual(len(self.al), 2)

        self.al.add(Mock_Alert_Config("c"))
        self.assertEqual(len(self.al), 3)

        other = alert_config_list.Alert_Config_List()
        other.add(Mock_Alert_Config("d"))
        self.al.add(other)
        self.assertEqual(len(self.al), 4)

    def test_compare(self) :
        self.al.add(Mock_Alert_Config("a"))
        self.al.add(Mock_Alert_Config("b"))
        self.al.add(Mock_Alert_Config("c"))

        new = alert_config_list.Alert_Config_List()
        new.add(Mock_Alert_Config("a"))
        new.add(Mock_Alert_Config("y"))
        new.add(Mock_Alert_Config("z"))

        added, removed, modified = self.al.compare(new)
        if "y" not in added or "z" not in added :
            self.fail("added is missing elements")
        if "b" not in removed or "c" not in removed :
            self.fail("removed is missing elements")
        if "a" not in modified :
            self.fail("modified is missing elements")

if __name__ == "__main__" :
    unittest.main()
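test_alert_config_list.py above, and test_job.py, test_job_list.py, test_process.py, and test_process_factory.py below, all share the same setUp/tearDown discipline: save a module-level name, rebind it to a mock, and restore it afterwards. A self-contained sketch of just that pattern, patching json.dumps purely for illustration (the real tests patch classes on their own modules):

import unittest
import json

def fake_dumps(obj, *args, **kwargs) :
    return "<mocked>"

class Test_Module_Patch(unittest.TestCase) :
    def setUp(self) :
        self.was = json.dumps        # save the real attribute
        json.dumps = fake_dumps      # rebind the module-level name to the fake

    def tearDown(self) :
        json.dumps = self.was        # always restore for later tests

    def test(self) :
        self.assertEqual(json.dumps({}), "<mocked>")

if __name__ == "__main__" :
    unittest.main()

unittest.mock.patch performs the same save/restore bookkeeping automatically; the manual version above is what these tests use.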
34
AoM_Service/library/test_job.py
Executable file
@@ -0,0 +1,34 @@
import unittest
import job

class Mock_Subprocess() :
    called = False
    joined = False
    pid = None

    def __init__(self) :
        pass

    def call(self, *args, **kwargs) :
        self.called = True

    def join(self, *args, **kwargs) :
        self.joined = True

class Test_Job(unittest.TestCase):
    def setUp(self) :
        self.was = job.subprocess
        self.subprocess = Mock_Subprocess()
        job.subprocess = self.subprocess

    def tearDown(self) :
        job.subprocess = self.was

    def test(self) :
        p = Mock_Subprocess()
        j = job.Job("id", p)
        j.kill()
        self.assertEqual(p.joined, True)
        self.assertEqual(self.subprocess.called, True)

if __name__ == "__main__" :
    unittest.main()
50
AoM_Service/library/test_job_list.py
Executable file
@@ -0,0 +1,50 @@
import unittest

import job_list

class Mock_Job() :
    def __init__(self, id, p) :
        self.id = id

    def kill(self) :
        return

class Test_Job_List(unittest.TestCase):
    def setUp(self) :
        self.was = job_list.Job
        job_list.Job = Mock_Job

    def tearDown(self) :
        job_list.Job = self.was

    def test_add(self) :
        jl = job_list.Job_List()
        self.assertEqual(len(jl), 0)

        try :
            jl.add(None)
        except Exception :
            pass
        else :
            # fail() must sit outside the try, or the bare except swallows it
            self.fail("can add nil to job_list")

        jl.add(Mock_Job("a", "a"))
        self.assertEqual(len(jl), 1)

        jl.add(Mock_Job("a", "a"))
        self.assertEqual(len(jl), 1)

        jl.add(Mock_Job("b", "b"))
        self.assertEqual(len(jl), 2)

        other = job_list.Job_List()
        other.add(Mock_Job("b", "b"))
        other.add(Mock_Job("c", "c"))

        jl.add(other)
        self.assertEqual(len(jl), 3)

        jl.kill("a")
        self.assertEqual(len(jl), 2)

if __name__ == "__main__" :
    unittest.main()
31
AoM_Service/library/test_process.py
Executable file
@@ -0,0 +1,31 @@
import unittest
import process

class Mock_Multiprocessing():
    def __init__(self, *args, **kwargs) :
        self.args = args
        self.kwargs = kwargs

    def get_target(self) :
        return None

class Mock_Alert_Config() :
    def __init__(self, id) :
        self.id = id

class Test_Process(unittest.TestCase):
    def setUp(self) :
        self.was = process.multiprocessing.Process
        process.multiprocessing.Process = Mock_Multiprocessing

    def tearDown(self) :
        process.multiprocessing.Process = self.was

    def test(self) :
        class MockProcess(process.Process) :
            def get_target(self) :
                return None
        # constructing without raising is the assertion here
        p = MockProcess(Mock_Alert_Config("a"), {}, None, True)

if __name__ == "__main__" :
    unittest.main()
36
AoM_Service/library/test_process_factory.py
Executable file
@@ -0,0 +1,36 @@
import unittest
import process_factory

class Mock_Process_Prometheus() :
    def __init__(self, *args, **kwargs) :
        pass

class Mock_Process_Kairos() :
    def __init__(self, *args, **kwargs) :
        pass

class Mock_Alert_Config() :
    def __init__(self, type) :
        self.t = type

    def type(self) :
        return self.t

class Test_Process_Factory(unittest.TestCase):
    def setUp(self) :
        self.was_prom = process_factory.process_prometheus.Process_Prometheus
        self.was_kai = process_factory.process_kairos.Process_Kairos
        process_factory.process_prometheus.Process_Prometheus = Mock_Process_Prometheus
        process_factory.process_kairos.Process_Kairos = Mock_Process_Kairos

    def tearDown(self) :
        process_factory.process_prometheus.Process_Prometheus = self.was_prom
        process_factory.process_kairos.Process_Kairos = self.was_kai

    def test(self) :
        factory = process_factory.Process_Factory(None, None, None)
        self.assertTrue(type(factory.build(Mock_Alert_Config("a"))) is Mock_Process_Kairos)
        self.assertTrue(type(factory.build(Mock_Alert_Config("prometheus"))) is Mock_Process_Prometheus)

if __name__ == "__main__" :
    unittest.main()
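Reduced to a sketch, the dispatch that test exercises looks like the following. The names and constructor signature here are assumptions inferred from the test, not the actual process_factory.py: the only behaviour the test pins down is that a config whose type() is "prometheus" yields a Prometheus process and anything else falls through to KairosDB.

class Sketch_Process_Factory() :
    # hypothetical reduction of Process_Factory, built from two process classes
    def __init__(self, prometheus_cls, kairos_cls) :
        self.prometheus_cls = prometheus_cls
        self.kairos_cls = kairos_cls

    def build(self, alert_config) :
        if alert_config.type() == "prometheus" :
            return self.prometheus_cls(alert_config)
        return self.kairos_cls(alert_config)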
15
AoM_Service/library/test_process_kairos.py
Executable file
@@ -0,0 +1,15 @@
import unittest

class Mock_Alert_Config() :
    def __init__(self, id) :
        self.id = id

class Test_Process_Kairos(unittest.TestCase):
    def test(self) :
        import process_kairos
        from serviceapp import service
        p = process_kairos.Process_Kairos(Mock_Alert_Config("a"), None, None, None)
        self.assertEqual(p.get_target(), service.check_kairosdb_alert)

if __name__ == "__main__" :
    unittest.main()
15
AoM_Service/library/test_process_prometheus.py
Executable file
@@ -0,0 +1,15 @@
import unittest

class Mock_Alert_Config() :
    def __init__(self, id) :
        self.id = id

class Test_Process_Prometheus(unittest.TestCase):
    def test(self) :
        import process_prometheus
        from serviceapp import service
        p = process_prometheus.Process_Prometheus(Mock_Alert_Config("a"), None, None, None)
        self.assertEqual(p.get_target(), service.check_prometheus_alert)

if __name__ == "__main__" :
    unittest.main()
100
AoM_Service/library/test_service.py
Executable file
@@ -0,0 +1,100 @@
import unittest
from serviceapp import service as serviceapp
import time
import config
import service

class Mock_ServiceApp_Service() :
    def __init__(self, *args, **kwargs) :
        self.args = args
        self.kwargs = kwargs

    def send_stat(self, *args, **kwargs) :
        return

class Mock_Logger() :
    def __init__(self) :
        self.lines = []
        self.info = self.log
        self.warn = self.log
        self.warning = self.log
        self.debug = self.log
        self.error = self.log

    def log(self, *args, **kwargs) :
        self.lines.append("{}, {}".format(args, kwargs))
        print(self.lines[-1])

def Mock_Sleep(t) :
    return

def Mock_Get_Healthy(*args, **kwargs) :
    return 0, 1

def Mock_Distribute_Configs(*args, **kwargs) :
    return True

def Mock_Is_Valid(*args, **kwargs) :
    return True

def ignore_warnings(test_func):
    import warnings
    def do_test(self, *args, **kwargs):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            test_func(self, *args, **kwargs)
    return do_test

class Test_Service(unittest.TestCase) :
    def setUp(self) :
        self.mock_serviceapp_service = Mock_ServiceApp_Service
        self.was_k = serviceapp.check_kairosdb_alert
        self.was_p = serviceapp.check_prometheus_alert
        self.was_sleep = time.sleep
        self.was_get_healthy = config.get_healthy_nodes_and_index
        self.was_distribute = config.distribute_configs
        self.was_is_valid = config.is_valid
        serviceapp.check_kairosdb_alert = self.mock_serviceapp_service
        serviceapp.check_prometheus_alert = self.mock_serviceapp_service
        config.get_healthy_nodes_and_index = Mock_Get_Healthy
        config.distribute_configs = Mock_Distribute_Configs
        config.is_valid = Mock_Is_Valid
        serviceapp.sleep = Mock_Sleep
        service.sleep = Mock_Sleep
        time.sleep = Mock_Sleep

    def tearDown(self) :
        # restore directly on the imported modules; the original rebound a
        # local "serviceapp" name here, which restored nothing
        serviceapp.check_kairosdb_alert = self.was_k
        serviceapp.check_prometheus_alert = self.was_p
        config.get_healthy_nodes_and_index = self.was_get_healthy
        config.distribute_configs = self.was_distribute
        config.is_valid = self.was_is_valid
        time.sleep = self.was_sleep
        serviceapp.sleep = self.was_sleep
        service.sleep = self.was_sleep

    @ignore_warnings
    def test(self) :
        import service
        logger = Mock_Logger()
        s = service.Service(logger, 100, "HOST", {
            "alert_folder": "./testdata",
            "alert_routing_config": {},
        })
        global first
        first = True
        def f() :
            global first
            is_first = first
            first = False
            return is_first
        def purge_stale(*args) :
            return
        s.is_running = f            # run exactly one loop iteration
        s.purge_stale = purge_stale
        s.start()

if __name__ == "__main__" :
    unittest.main()
20
AoM_Service/library/testdata/engine.yaml
vendored
Executable file
@@ -0,0 +1,20 @@
---
id: sleeper_agents_milleniumfalcon_engine_failing
service: core
alerts:
  slack:
    - '#breel_testing_alerts'
  vo:
    - gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
suppressed_occurrences_threshold: 24
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_engine_failing) by (dc)
tags:
  - dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
service_dependencies: ['fuel']
18
AoM_Service/library/testdata/fuel.yaml
vendored
Executable file
@@ -0,0 +1,18 @@
---
id: sleeper_agents_milleniumfalcon_fuellevel_low
service: fuel
alerts:
  slack:
    - '#breel_testing_alerts'
  vo:
    - gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_fuellevel_low) by (dc)
tags:
  - dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
20
AoM_Service/library/testdata/lightspeed.yaml
vendored
Executable file
@@ -0,0 +1,20 @@
---
id: sleeper_agents_milleniumfalcon_lightspeed_unavailable
service: captain
alerts:
  slack:
    - '#breel_testing_alerts'
  vo:
    - gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
suppressed_occurrences_threshold: 48
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_lightspeed_unavailable) by (dc)
tags:
  - dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
service_dependencies: ['core']
20
AoM_Service/library/testdata/shields.yaml
vendored
Executable file
@@ -0,0 +1,20 @@
---
id: sleeper_agents_milleniumfalcon_shields_unavailable
service: core
alerts:
  slack:
    - '#breel_testing_alerts'
  vo:
    - gobs-mm
critical_upper_threshold: 1.0
interval: 5
suppressed_occurrences_threshold: 54
start_time: '-60'
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_shields_unavailable) by (dc)
tags:
  - dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
service_dependencies: ['fuel']
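The four testdata configs above encode a small dependency graph through their service and service_dependencies fields: core depends on fuel, and captain depends on core. A sketch (not the service's own loader) of collecting that graph with PyYAML, which requirements.txt lists below:

import glob
import yaml   # PyYAML

deps = {}
for path in sorted(glob.glob("AoM_Service/library/testdata/*.yaml")) :
    with open(path) as f :
        cfg = yaml.safe_load(f)
    # engine.yaml and shields.yaml both declare service "core", so merge
    deps.setdefault(cfg["service"], set()).update(cfg.get("service_dependencies", []))

print(deps)   # {'captain': {'core'}, 'core': {'fuel'}, 'fuel': set()}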
42
AoM_Service/publish.sh
Executable file
@@ -0,0 +1,42 @@
#!/bin/bash

GIT_COMMIT=$(git rev-parse HEAD)

if [[ $GIT_COMMIT == "" ]]; then
    echo "--Missing required GIT_COMMIT var. Aborting..."
    exit 1
fi

# Setup useful vars
team="engvis"
app="alert-on-metrics-app"

registryV2="registry-app.eng.qops.net:5001"
pathV2="${registryV2}/${team}/${app}"
commitV2="${pathV2}:${GIT_COMMIT}"
latestV2="${pathV2}:latest"

# In case you use relative paths
DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
cd "$DIR"

echo "--Publishing $app $GIT_COMMIT"

echo "--Removing old images, so they don't accumulate"
docker rmi "$latestV2"

# Now fail if anything doesn't work
set -e

if [ -f "$app/build.sh" ]
then
    echo "--Running pre build steps"
    "$app/build.sh"
fi

docker build --pull=true --tag="$commitV2" --tag "$latestV2" .

echo "--Publishing app container"

docker push "$commitV2"
docker push "$latestV2"
48
AoM_Service/qvolution.sh
Executable file
@@ -0,0 +1,48 @@
function _get_and_save_secret() {
    function is_set() {
        local name="$1"
        eval "echo \$$name" | grep . > /dev/null
    }
    local name="$1"
    eval "$name=\${$name:-}"
    if ! is_set $name; then
        eval "$name=$(security find-generic-password -a $USER -s $name -w 2> /dev/null)"
        if ! is_set "$name"; then
            eval "read -s -p 'Enter $name: ' $name" >&2
            eval "security add-generic-password -a $USER -s $name -w \$$name" >&2
            echo "" >&2
        fi
    fi
    eval "echo \$$name"
}

function get_and_save_secret() {
    _get_and_save_secret "$@" | tail -n 1
}

SENSU_API_USER="$(get_and_save_secret SENSU_API_USER)"
SENSU_API_PASS="$(get_and_save_secret SENSU_API_PASS)"
SLACK_API_TOKEN="$(get_and_save_secret SLACK_API_TOKEN)"

echo SENSU_USER=$SENSU_API_USER >&2
echo SENSU_PASS=$SENSU_API_PASS >&2
echo SLACK_TOKEN=$SLACK_API_TOKEN >&2

git submodule update --remote
rm -rf alert_configs
cp -r AoM_Configs/alert_configs .
docker build -t aom:dev .

docker rm -f aom
docker run \
    -e SLACK_API_TOKEN=${SLACK_API_TOKEN} \
    -e API_USER=$SENSU_API_USER \
    -e API_PASS=$SENSU_API_PASS \
    --rm \
    -d \
    -p 8080:8080 \
    --add-host telegraf:10.4.13.53 \
    --name aom \
    --add-host consul.service.consul:127.0.0.1 \
    -h 127.0.0.1 \
    aom:dev &
until curl localhost:8080/healthcheck; do sleep 1; done
docker logs -f aom
7
AoM_Service/requirements.txt
Executable file
@@ -0,0 +1,7 @@
PyYAML
pip
setuptools
requests
pyaml
sanic
statsd-tags
63
AoM_Service/run.sh
Executable file
@@ -0,0 +1,63 @@
#!/bin/ash

(
    while true; do
        redis-server
        sleep 10
    done
) &
/usr/src/app/echo-server &
/usr/src/app/echo-server -p 443 &
/usr/src/app/consul &

# Default values
KAIROSDB_URL=${KAIROSDB_URL:-http://kairosdb-metrics.service.eng.consul:8080/}
SMTP_SERVER=${SMTP_SERVER:-internal-smtp1-app.eng.qops.net:2525}
#SENSU_URL=${SENSU_URL:-https://sensu-api.eng.qops.net:443/results}
#SLACK_TOKEN=${SLACK_TOKEN:-xoxb-76976722775-WY6vtKAk0SQEb8qcbFkLMV81}
#VICTOROPS_URL=${VICTOROPS_URL:-https://alert.victorops.com/integrations/generic/20131114/alert/07f108fe-9183-45c3-a888-19e1432806c5/}
#CONSUL_URL=${CONSUL_URL:-http://consul1-app.eng.qops.net:8500/v1/kv/service/alert-on-metrics/leader-lock}
#AOM_GRAFANA_URL=${AOM_GRAFANA_URL:-https://grafana.eng.qops.net/d/000000113/alert-on-metrics?refresh=1m&orgId=1&var-dc=All&var-fqdn=All&from=now-6h&to=now&var-id=}
#UCHIWA_URL=${UCHIWA_URL:-https://uchiwa-app.eng.qops.net/#/client/EngOps/AOM}

SLACK_TOKEN=${SLACK_TOKEN:-na}
VICTOROPS_URL=${VICTOROPS_URL:-http://localhost:41912/}
CONSUL_URL=${CONSUL_URL:-http://localhost:41912/}
AOM_GRAFANA_URL=${AOM_GRAFANA_URL:-http://localhost:41912/}
UCHIWA_URL=${UCHIWA_URL:-http://localhost:41912/}
SENSU_URL=${SENSU_URL:-http://localhost:41912}

export AOM_GRAFANA_URL

# Update config
sed -i "s#{{{KAIROSDB_URL}}}#${KAIROSDB_URL}#g" service.yaml
sed -i "s#{{{VICTOROPS_URL}}}#${VICTOROPS_URL}#g" service.yaml
sed -i "s#{{{SLACK_TOKEN}}}#${SLACK_TOKEN}#g" service.yaml
sed -i "s#{{{SMTP_SERVER}}}#${SMTP_SERVER}#g" service.yaml
sed -i "s#{{{CONSUL_URL}}}#${CONSUL_URL}#g" service.yaml
sed -i "s#{{{SENSU_URL}}}#${SENSU_URL}#g" service.yaml
# the uchiwa URL contains '#', so use ',' as the sed delimiter here
sed -i "s,{{{UCHIWA_URL}}},${UCHIWA_URL},g" service.yaml

# Starting service
if [ -n "${TEST}" ]; then
    sed -i '/alert_reload_interval:/ s/[0-9]\+/30/g' service.yaml
    python3 /usr/src/app/aom_service.py &
    sleep 17
    echo "Making current server leader"
    curl localhost:8080/override?enable=true
    echo "Starting the service"
    curl localhost:8080/healthcheck
    # no exec here: the original exec'd aom_test.py, which replaced the shell
    # and made the exit-status check below unreachable
    python3 /usr/src/app/aom_test.py
    if [ $? -ne 0 ]; then
        cat /usr/src/app/logs/aom_service.log
        echo "Test failed!"
        exit 1
    else
        cat /usr/src/app/logs/aom_service.log
        echo "Test succeeded. Exiting"
        exit 0
    fi
else
    python3 /usr/src/app/reporter/incoming/main.py &
    exec python3 /usr/src/app/aom_service.py
fi
27
AoM_Service/service.yaml
Executable file
@@ -0,0 +1,27 @@
#=======================#
# All them URLS and tokens
#=======================#
kairosdb_url: "{{{KAIROSDB_URL}}}"
victorops_url: "{{{VICTOROPS_URL}}}"
slack_url: "https://slack.com/api/chat.postMessage"
slack_token: "{{{SLACK_TOKEN}}}"
smtp_server: "{{{SMTP_SERVER}}}"
consul_url: "{{{CONSUL_URL}}}"
sensu_endpoint: "{{{SENSU_URL}}}"
uchiwa_url: "{{{UCHIWA_URL}}}"

#=======================#
# Logging Information
#=======================#
log_path: "logs/aom_service.log"

#=======================#
# alerts configurations
#=======================#
alert_folder: "alert_configs"
alert_routing_lookup: "alert_routing_lookup"
alert_reload_interval: 300

#=======================#
# request timeout value
#=======================#
timeout: 90
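The {{{...}}} markers above are plain-text placeholders that run.sh rewrites with sed before start-up; the service itself only ever sees the rendered file. A sketch (assumed, not the actual bootstrap code) of reading it once rendered:

import yaml

with open("service.yaml") as f :
    cfg = yaml.safe_load(f)

print(cfg["alert_folder"])            # "alert_configs"
print(cfg["alert_reload_interval"])   # 300; run.sh rewrites this to 30 under TEST
print(cfg["timeout"])                 # 90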
82
mock/consul/main.go
Executable file
@@ -0,0 +1,82 @@
package main

import (
    "flag"
    "fmt"
    "log"
    "net/http"
    "os"
    "strings"
)

func main() {
    // Use $PORT as the flag default. The original read the env var and then
    // let flag.StringVar overwrite it with "8500" unconditionally, so the
    // env var never took effect.
    p := os.Getenv("PORT")
    if p == "" {
        p = "8500"
    }
    flag.StringVar(&p, "p", p, "port to listen on")
    flag.Parse()
    http.Handle("/v1/catalog/service/alert-on-metrics", http.HandlerFunc(catalogService))
    http.Handle("/v1/health/node/127.0.0.1", http.HandlerFunc(healthNode))
    log.Println("Listening on", p)
    if err := http.ListenAndServe(":"+strings.TrimPrefix(p, ":"), nil); err != nil {
        panic(err)
    }
}

func healthNode(w http.ResponseWriter, r *http.Request) {
    fmt.Fprintln(w, `
[
  {
    "CheckID": "check_healthcheck_alert-on-metrics_alert-on-metrics",
    "CreateIndex": 727094265,
    "Definition": {},
    "ModifyIndex": 727094265,
    "Name": "Serf Health Status",
    "Node": "gobs2-nomad.b1-prv.qops.net",
    "Notes": "",
    "Output": "Agent alive and reachable",
    "ServiceID": "",
    "ServiceName": "",
    "ServiceTags": [],
    "Status": "passing"
  }
]
`)
}

func catalogService(w http.ResponseWriter, r *http.Request) {
    fmt.Fprintln(w, `
[
  {
    "Address": "127.0.0.1",
    "CreateIndex": 231035602,
    "Datacenter": "eng",
    "ID": "95dace59-f06b-d483-a06e-38288dc2019a",
    "ModifyIndex": 231035602,
    "Node": "127.0.0.1",
    "NodeMeta": {
      "consul-network-segment": ""
    },
    "ServiceAddress": "",
    "ServiceConnect": {},
    "ServiceEnableTagOverride": false,
    "ServiceID": "alert-on-metrics",
    "ServiceKind": "",
    "ServiceMeta": {},
    "ServiceName": "alert-on-metrics",
    "ServicePort": 8080,
    "ServiceProxy": {},
    "ServiceProxyDestination": "",
    "ServiceTags": [
      ""
    ],
    "ServiceWeights": {
      "Passing": 1,
      "Warning": 1
    },
    "TaggedAddresses": {
      "lan": "127.0.0.1",
      "wan": "127.0.0.1"
    }
  }
]
`)
}
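The mock serves canned Consul responses on the two endpoints the service queries, which keeps the leader-election and health paths testable offline. A quick sketch of exercising it with requests (assuming the mock is running on its default port 8500):

import requests

base = "http://localhost:8500"
catalog = requests.get(base + "/v1/catalog/service/alert-on-metrics").json()
health = requests.get(base + "/v1/health/node/127.0.0.1").json()

print(catalog[0]["ServicePort"])   # 8080
print(health[0]["Status"])         # "passing"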
2
sandbox/isFiringRedis/config
Executable file
@@ -0,0 +1,2 @@
save
appendonly no
BIN
sandbox/isFiringRedis/dump.rdb
Executable file
Binary file not shown.
12
sandbox/isFiringRedis/main.py
Executable file
@@ -0,0 +1,12 @@
def main(args) :
    import redis
    r = redis.Redis()
    k = "key"
    v = "value"
    # get() returns None when the key is absent (e.g. a fresh database),
    # so guard the decode instead of crashing with AttributeError
    before = r.get(k)
    print(before.decode() if before is not None else None)
    r.set(k, v)
    print(r.get(k).decode())

if __name__ == "__main__" :
    from sys import argv
    main(argv)
15
sleeper_agents_aom_engine/.gitignore
vendored
Executable file
@@ -0,0 +1,15 @@
# Created by .ignore support plugin (hsz.mobi)
### Vagrant template
.vagrant/
.idea/
build/results
logs/

*.pyc
.dockerignore
Dockerfile
build/builder
site-packages.tar.gz

alert_configs
AoM_Configs
Some files were not shown because too many files have changed in this diff.