This commit is contained in:
bel
2021-09-12 22:16:11 -06:00
commit ceeb6f0385
129 changed files with 9221 additions and 0 deletions

12
AoM_Service/AoM_Configs/.gitignore vendored Executable file
View File

@@ -0,0 +1,12 @@
# ignore alert configs starting with underscore -- we can create them while testing the webapp
# and not have to worry about them getting into the repo
alert_configs/_*.yaml
*.swp
.idea/
.vagrant/
__pycache__
logs/
venv/
.vscode/

View File

@@ -0,0 +1,68 @@
#!/usr/bin/env groovy
// Master-branch pipeline: publish the configs image and trigger the Rundeck
// deploy job. Notifies GitLab, email and Slack on completion.
pipeline {
    agent { label 'nomad-builder' }
    environment {
        DOCKER_HOST = 'tcp://127.0.0.1:2375'
        // Workspace path as seen from inside the Nomad allocation sandbox.
        WORKSPACE_PATH = "/var/lib/nomad/alloc/${NOMAD_ALLOC_ID}/${NOMAD_TASK_NAME}${WORKSPACE}"
    }
    stages {
        stage('Info') {
            steps {
                sh script: 'hostname'
                echo "WORKSPACE_PATH: $WORKSPACE_PATH"
            }
        }
        stage('Build') {
            steps {
                echo "No build required"
            }
        }
        stage('Test') {
            steps {
                echo "Test done already on merge request"
                //sh script: 'cd build; ./test_changed.sh "${WORKSPACE_PATH}"'
                // sh script: 'cd build; ./test_changed.sh'
            }
        }
        stage('Deploy') {
            steps {
                script {
                    if ("$GIT_BRANCH" == "origin/master") {
                        echo "Running publish script"
                        sh script: './publish.sh'
                        echo "Triggering Rundeck job"
                        // (dropped a redundant nested script{} wrapper -- we are
                        // already inside a script block)
                        step([$class: 'RundeckNotifier', includeRundeckLogs: true, jobId: 'c1f0dd4e-89a0-411b-afbb-455421a2ba34', nodeFilters: '', options: '', rundeckInstance: 'team-rundeck -- techops', shouldFailTheBuild: true, shouldWaitForRundeckJob: true, tags: '', tailLog: false])
                    }
                    else {
                        echo "No deploy step required."
                    }
                }
            }
        }
    }
    post {
        success {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test passed, update commit with green checkbox
            }
            // Notify Eng Viz of successful build
            // slackSend color: 'good', message: "Passed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
        failure {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test failed, update commit status with red x
                error("Build failed, check ${BUILD_URL} for details.")
            }
            // On failure send an email to Eng Vis.
            // FIX: use double-quoted GStrings so ${BUILD_URL}/${JOB_NAME}/
            // ${BUILD_NUMBER} actually interpolate (single quotes sent the
            // literal placeholder text), and fix the "or details" typo.
            mail body: "Please check ${BUILD_URL} for details.",
                subject: "Jenkins job ${JOB_NAME} build #${BUILD_NUMBER} failed",
                from: 'Jenkins',
                to: 'eng-visibility@qualtrics.com'
            // Finally send a warning message to Eng Vis slack channel
            slackSend color: 'warn', message: "Failed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
    }
}

View File

@@ -0,0 +1,56 @@
#!/usr/bin/env groovy
// Merge-request pipeline: validate changed alert configs (syntax + required
// fields). No publish/deploy happens here.
pipeline {
    agent { label 'nomad-builder' }
    environment {
        DOCKER_HOST = 'tcp://127.0.0.1:2375'
        // Workspace path as seen from inside the Nomad allocation sandbox.
        WORKSPACE_PATH = "/var/lib/nomad/alloc/${NOMAD_ALLOC_ID}/${NOMAD_TASK_NAME}${WORKSPACE}"
    }
    stages {
        stage('Info') {
            steps {
                sh script: 'hostname'
                echo "WORKSPACE_PATH: $WORKSPACE_PATH"
            }
        }
        stage('Build') {
            steps {
                echo "No build required"
            }
        }
        stage('Test') {
            steps {
                echo "Running test"
                sh script: './test_changed.sh'
                sh script: 'python validate_yaml.py'
            }
        }
        stage('Deploy') {
            steps {
                echo "No deploy step required for Merge Request"
            }
        }
    }
    post {
        success {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test passed, update commit with green checkbox
            }
            // Notify Eng Viz of successful build
            // slackSend color: 'good', message: "Passed Build: $BUILD_URL", channel: '#eng-invisibility'
        }
        failure {
            gitlabCommitStatus(name: "$JOB_NAME") {
                // Test failed, update commit status with red x
                error("Build failed, check ${BUILD_URL} for details.")
            }
            // On failure send an email to Eng Vis.
            // FIX: double-quoted GStrings so ${BUILD_URL}/${JOB_NAME}/
            // ${BUILD_NUMBER} interpolate (single quotes sent literal text),
            // and fix the "or details" typo.
            mail body: "Please check ${BUILD_URL} for details.",
                subject: "Jenkins job ${JOB_NAME} build #${BUILD_NUMBER} failed",
                from: 'Jenkins',
                to: 'eng-visibility@qualtrics.com'
            // Finally send a warning message to Eng Vis slack channel
            // slackSend color: 'warn', message: 'Failed Build: $BUILD_URL', channel: '#eng-invisibility'
        }
    }
}

View File

@@ -0,0 +1,16 @@
FROM registry-app.eng.qops.net:5001/imported/alpine:3.9
# MAINTAINER has been deprecated since Docker 1.13 -- use a LABEL instead.
LABEL maintainer="Engineering Visibility <eng-visibility@qualtrics.com>"
COPY webapp_requirements.txt /
COPY run_webapp.sh /
RUN apk add --no-cache python3 curl
# Build deps are installed into a named virtual package so they can be
# removed in the same layer, keeping the image small.
RUN apk add --no-cache --virtual .build-deps build-base python3-dev \
 && pip3 install --no-cache-dir --upgrade pip \
 && pip3 install --no-cache-dir --upgrade setuptools \
 && pip3 install --no-cache-dir --upgrade -r /webapp_requirements.txt \
 && apk del .build-deps \
 && rm -rf /var/cache/apk/*
CMD ["/run_webapp.sh"]

236
AoM_Service/AoM_Configs/README.md Executable file
View File

@@ -0,0 +1,236 @@
# README
This is the new repository for the Alert On Metrics project configurations.
Alert On Metrics (AOM) project allows one to setup alerts to trigger based on tracking a metric value as collected via [Metrics as a Service](https://odo.corp.qualtrics.com/wiki/index.php/Metrics_As_A_Service). You "track" your metric via a [KairosDB query](http://kairosdb-metrics.service.eng.consul:8080/) or [Prometheus query](http://big-trickster.service.eng.consul:9090/graph) so you are not limited to raw metrics - you can sample based on aggregators available in KairosDB to create new metrics views or use PromQL if you are using Prometheus. Typically people use min, max or count. All "tracked" metrics are rewritten to the metrics data store as a new metric *telgraf.aom_stats_value* but are tagged by Alert-On-Metrics to show their origin.
You can trigger an alert based on any combination of the following:
- An upper critical threshold based on the value of a metric increasing
- An upper warning threshold based on the value of a metric increasing
- A lower critical threshold based on the value of a metric decreasing
- A lower warning threshold based on the value of a metric decreasing
- Combine any lower and upper threshold to create a 'band'
---
## Sensu and alert subdue. NEW!
Some changes have been introduced into latest AOM versions. Now alerts
can be sent through Sensu (email not supported yet). Using Sensu also
allows you to create check dependencies (vo is now victorops for Sensu).
```
alerts:
sensu:
victorops:
'blackhole'
slack:
'#aom_test_channel'
dependencies:
- name_of_check1
- name_of_check2
```
Also filters option has been enabled. It works the same way as in
Hiera. If you only want to receive critical alerts through one channel
you can set "channel"_subdue to **true**.
Example:
```
filters:
slack_subdue: true
victorops_subdue: false
```
You can make use of anything that sensu api supports. Anything you add
to your configuration under sensu will be sent directly to the Sensu API.
---
## Availability metric.
If you want to track how long your check is on CRITICAL state along a
given period of time, you can enable this feature by setting this
option to true:
```
availability: true
```
This will start sending metrics constantly and recording the check
output. You can then visualize this metric within the following
[dashboard]
(https://grafana.eng.qops.net/d/5OsrZSdiz/aom-availability?orgId=1)
(or you can create your own).
To get a more accurate result don't set the refresh interval lower
than 60 seconds.
---
## Routing per tag value. NEW!
This feature allows you to configure a different alert routing using the values of tags in your metric. For instance, let's say you want to have a different alert policy for beta, gamma and prod:
* *beta*: I want to alert my `#my-project-dev` channel
* *gamma*: I want to alert my `#my-project-gamma` channel
* *prod*: I want to alert my `#my-project` channel and page the on-call on VictorOps
We can use the `dc` tag available in the metric query, define specific configuration for beta and gamma, and use a default one for all other values (prod in this case). Everything is configured inside the `alerts` object in the yaml configuration. Instead of directly adding the alert configuration, add a `lookup` key. Inside, you have to provide three values:
* `default`: the alert policy to apply by default if we can't find a configuration for a specific combination of tags. The format is the exact same as classic alerts (sensu, vo, slack, etc.).
* `tags`: the tags that will be used to lookup the alert routing configuration. You can use more than one tag.
* `lookups`: an array, where each element specifies a combination of tag values and the routing to apply in this case.
Here is the configuration of our example:
```yaml
alerts:
lookup:
default:
sensu:
slack: my-project
victorops: my-on-call-key
tags:
- dc
lookups:
-
alert:
sensu:
slack: my-project-dev
tags:
dc: b1-prv
-
alert:
sensu:
slack: my-project-gamma
tags:
dc: g1-iad
```
You can move the `lookups` part inside a separate file, so it can be reused across different AOM configurations. To do that, instead of a `lookups` key, provide a `lookup_file` with the filename, including the extension:
```yaml
alerts:
lookup:
default: ...
lookup_file: my_lookup_file.yaml
tags: ...
```
Save this file under the `alert_routing_lookup` folder. The syntax for the alert routing is the same as before, it is just in a different file:
```yaml
---
-
alert:
sensu:
slack: my-project-dev
tags:
dc: b1-prv
-
alert:
sensu:
slack: my-project-gamma
tags:
dc: g1-iad
```
---
## How do I register a new alert with AOM?
Alert configurations for AOM are just a Kairos DB or Prometheus query
specified in a yaml format and wrapped in some controlling
configuration that determines how frequently the query is executed,
thresholds, occurrences and where to route the alerts. We have built a
small UI that is packaged with the AOM gitlab project that will help
you generate a suitable yaml configuration. You can rehearse your
queries on the [KairosDB UI]
(http://kairosdb-metrics.service.eng.consul:8080/) or at any
Prometheus endpoint and take a look at other examples in the alert_configs/ folder for help.
Follow the instructions below to launch the yaml generator UI on your
local desktop and use it to generate a merge request (Docker is
necessary).
1. Clone the project
2. cd into the project's directory
3. Run the script ./generate_config.sh
4. Once up, navigate in a browser to **localhost:80/**
5. Fill out the form and click generate
6. Hit **Ctrl+C** when you have the alert configuration
7. Submit the merge request in a new branch
---
This process starts a local webserver that provides a convenient interface for generating the yaml you need.
Most of the fields have helpful info tips on what each value is and how it's used.
---
## Visualization tool [BETA]
Along with the project, a simple python script to show how your
metrics will look like and to help you setting the thresholds, is
provided. This tool requires the installation of python3 and some
additional python3 modules:
1. yaml
2. json
3. requests
4. numpy
5. matplotlib
These modules should be easy to install using 'pip' or 'homebrew'.
Usage:
```python3 show_config.py [X] alertname_without_yaml_extension```
Where X is an optional parameter to define the interval length you
want to display. It's a multiplier factor, set to 10 by default, that
will increase the start_relative (so you will see more datapoints).
The script should open a window showing the metrics along the defined
thresholds. If the query doesn't return any value, it will exit.
---
## How does my new alert get to production?
Once you submit a merge request, a Jenkins job will quickly validate your alert
files just checking it contains all required fields and proper syntax. Setting up
appropriate thresholds and alerting channels (VictorOps, email,
Slack) is user's responsibility.
If Jenkins returns a PASS result for the test, new alert files will be
merged into the master branch and a deploy job will be triggered (also
from Jenkins). AOM service will be actively looking for changes in the
alert_configs folder and will pick up any changes (by default every
300 seconds).
## Helpful Tidbits
__IMPORTANT:__ The alert id field must be unique, it might be useful running the
grep command within the alert_configs directory to ensure it's not
already defined.
Use the [UI](http://kairosdb-metrics.service.eng.consul:8080/) on the kairosdb box to help you generate / determine the proper query.
Remember, you want to get the query down to just one or 2 entries per *group-by* so that the service can quickly iterate over it.
Once the request has been merged you can check if your query is getting processed by [hitting the url](http://alert-on-metrics.service.eng.consul:8080/healthcheck?verbose=true)
You can also check out the [grafana dashboard](http://grafana-metrics.service.eng.consul:3000/dashboard/db/alert-on-metrics) that has the results of this service's queries and verify your alert metric is showing up regularly.
From KairosDB's doc: *You must specify either start_absolute or start_relative but not
both. Similarly, you may specify either end_absolute or end_relative
but not both. If either end time is not specified the current date and
time is assumed.* We suggest the usage of *end_relative* (greater than
1 minute) as this will make steadier graphs (if you draw a graph until
*Now*, some of the latest metrics could be missing so the end of the
graph will be lower than it should).
We do not recommend using *align_sampling* and *align_start_time*
(both false by default so can be skipped) as they might change the alignment of metrics
and change graphs over time (*If more than one are set, unexpected results will occur*).
If you have any doubt about KairosDB's query metrics you can take a look at their documentation [here](https://kairosdb.github.io/docs/build/html/restapi/QueryMetrics.html).
---
## The Gotchas
1. Alerts only fire when KairosDB returns a result. If your KairosDB metric query returns no results for X (currently 10) attempts any active alerts will clear with a message explaining that AOM could not get any further results from KairosDB so user must manually verify RECOVERY. Earlier versions of AOM had no flap protection like this built in. Long term we will move alerting to Sensu which has more advanced built in flap protection. You can reduce flapping of results by building your Kairos query well. Please talk to engineering visibility for help with this.
2. Metrics are only collected every 60 seconds, so setting an interval below that will automatically get bumped up to 60 seconds from the web based config generation. Match up the interval by how often the metric is collected and measured
3. The Email field only requires a list of names, and not the @qualtrics bit, as it will only send to qualtrics addresses using the internal-smtp1-app.eng.qops.net box
4. Email and Slack alerts fire once during an event. This way if an outage was occurring, you wouldn't get flooded with emails and slack alerts the entire time.
5. Email and Slack alerts can be helpful to share with the team so they are aware of what is happening.
6. Email and Slack alerts can be helpful when trying to figure out your alerts before you VO stuff

View File

@@ -0,0 +1,20 @@
---
# AOM alert config: fires when the Prometheus query below exceeds
# critical_upper_threshold, evaluated per `dc` group.
id: sleeper_agents_milleniumfalcon_engine_failing
service: core
alerts:
  slack:
    - '#breel_testing_alerts'
  vo:
    - gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
suppressed_occurrences_threshold: 24
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_engine_failing) by (dc)
tags:
  - dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
# Sensu check dependency (see README): suppress while 'fuel' is failing.
service_dependencies: ['fuel']

View File

@@ -0,0 +1,18 @@
---
# AOM alert config: fires when the Prometheus query below exceeds
# critical_upper_threshold, evaluated per `dc` group.
id: sleeper_agents_milleniumfalcon_fuellevel_low
service: fuel
alerts:
  slack:
    - '#breel_testing_alerts'
  vo:
    - gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_fuellevel_low) by (dc)
tags:
  - dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1

View File

@@ -0,0 +1,20 @@
---
# AOM alert config: fires when the Prometheus query below exceeds
# critical_upper_threshold, evaluated per `dc` group.
id: sleeper_agents_milleniumfalcon_lightspeed_unavailable
service: captain
alerts:
  slack:
    - '#breel_testing_alerts'
  vo:
    - gobs-mm
critical_upper_threshold: 1.0
interval: 5
start_time: '-60'
suppressed_occurrences_threshold: 48
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_lightspeed_unavailable) by (dc)
tags:
  - dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
# Sensu check dependency (see README): suppress while 'core' is failing.
service_dependencies: ['core']

View File

@@ -0,0 +1,20 @@
---
# AOM alert config: fires when the Prometheus query below exceeds
# critical_upper_threshold, evaluated per `dc` group.
id: sleeper_agents_milleniumfalcon_shields_unavailable
service: core
alerts:
  slack:
    - '#breel_testing_alerts'
  vo:
    - gobs-mm
critical_upper_threshold: 1.0
interval: 5
suppressed_occurrences_threshold: 54
start_time: '-60'
end_time: now
prometheus_url: http://big-trickster.service.eng.consul:9090
query_type: prometheus
query: max(sleeper_agents_milleniumfalcon_shields_unavailable) by (dc)
tags:
  - dc
url: https://grafana.eng.qops.net/d/000000390/geni?orgId=1
# Sensu check dependency (see README): suppress while 'fuel' is failing.
service_dependencies: ['fuel']

View File

@@ -0,0 +1,8 @@
---
# Alert-routing lookup entries (see README "Routing per tag value"):
# alerts whose metric tags match `tags` are routed to the given channels.
-
  alert:
    slack:
      - "public-api-deploy-tst"
  tags:
    canaryTest: transaction_import_distribution_1
    targetdc: fra1

View File

@@ -0,0 +1,365 @@
---
-
alert:
sensu:
slack: es-qe-alerts
tags:
brandId: aexpfeedback
-
alert:
sensu:
slack: emea-alerts
victorops: profserv-19
tags:
brandId: airbuswea
-
alert:
sensu:
slack: es-alaskaair
tags:
brandId: alaskaair
-
alert:
sensu:
slack: xmp-seattle-3
victorops: xmp-seattle-3
tags:
brandId: amdocs
-
alert:
sensu:
slack: xmp-sea-automations
tags:
brandId: americanairlines
-
alert:
sensu:
slack: xmp-sea-automations
tags:
brandId: anz
-
alert:
sensu:
slack: xmp-seattle-3
victorops: xmp-seattle-3
tags:
brandId: arris
-
alert:
sensu:
slack: emea-alerts
victorops: profserv-19
tags:
brandId: baincx
-
alert:
sensu:
slack: xmp-sea-automations
tags:
brandId: bmocx
-
alert:
sensu:
slack: es-qe-alerts
victorops: es-bmw-marriott
tags:
brandId: bmwgroupne
-
alert:
sensu:
slack: es-qe-alerts
victorops: es-bmw-marriott
tags:
brandId: bmwgroupnest3
-
alert:
sensu:
slack: es-qe-alerts
victorops: es-bmw-marriott
tags:
brandId: bmwjapan
-
alert:
sensu:
slack: es-qe-alerts
victorops: es-bmw-marriott
tags:
brandId: bmwjapanst3
-
alert:
sensu:
slack: es-qe-alerts
victorops: es-bmw-marriott
tags:
brandId: bmwna
-
alert:
sensu:
slack: es-qe-alerts
victorops: es-bmw-marriott
tags:
brandId: bmwnast3
-
alert:
sensu:
slack: es-qe-alerts
victorops: es-bmw-marriott
tags:
brandId: bmwvertriebsgmbh
-
alert:
sensu:
slack: es-qe-alerts
victorops: es-bmw-marriott
tags:
brandId: bmwvertriebsgmbhst3
-
alert:
sensu:
slack: caterpillar
victorops: profserv-14
tags:
brandId: catcustomerinsights
-
alert:
sensu:
slack: century-link
victorops: xmp-seattle-4
tags:
brandId: centurylink
-
alert:
sensu:
slack: xmp-seattle-4
victorops: xmp-seattle-4
tags:
brandId: ciscoengineering
-
alert:
sensu:
slack: es-alerts
victorops: profserv
tags:
brandId: clientdashboards
-
alert:
sensu:
slack: es-alerts
victorops: profserv
tags:
brandId: cms
-
alert:
sensu:
slack: TODO
tags:
brandId: cocacolaperform
-
alert:
sensu:
slack: dish
tags:
brandId: dishvoc
-
alert:
sensu:
slack: es-alerts
tags:
brandId: dowcorning
-
alert:
sensu:
slack: es-alerts
victorops: profserv
tags:
brandId: drtoddhall
-
alert:
sensu:
slack: es-gs-compare
victorops: xmp-seattle-3
tags:
brandId: goldmansachs
-
alert:
sensu:
slack: xmp-sea-automations
tags:
brandId: harvard
-
alert:
sensu:
slack: es-alerts
victorops: profserv
tags:
brandId: ibm
-
alert:
sensu:
slack: xmp-seattle-3
victorops: xmp-seattle-3
tags:
brandId: jcibuildings
-
alert:
sensu:
slack: xmp-seattle-3
victorops: xmp-seattle-3
tags:
brandId: johnsoncontrols2
-
alert:
sensu:
slack: es-alerts
victorops: profserv
tags:
brandId: kubota
-
alert:
sensu:
slack: liberty-mutual
tags:
brandId: libertymutualvoc
-
alert:
sensu:
slack: es-qe-alerts
victorops: es-bmw-marriott
tags:
brandId: marriottvacationclub
-
alert:
sensu:
slack: es-alerts
tags:
brandId: mastercard
-
alert:
sensu:
slack: xmp-seattle-4
victorops: xmp-seattle-4
tags:
brandId: nielsenapac
-
alert:
sensu:
slack: TODO
tags:
brandId: optumrx
-
alert:
sensu:
slack: xmp-seattle-4
victorops: xmp-seattle-4
tags:
brandId: nielsenscarborough
-
alert:
sensu:
slack: xmp-seattle-3
victorops: xmp-seattle-3
tags:
brandId: rogers
-
alert:
sensu:
slack: es-alerts
tags:
brandId: samsungeurope
-
alert:
sensu:
slack: emea-alerts
victorops: profserv-19
tags:
brandId: telenorreporting
-
alert:
sensu:
slack: es-alerts
victorops: profserv
tags:
brandId: thermoking
-
alert:
sensu:
slack: philips-es
tags:
brandId: tnsnipophilips
-
alert:
sensu:
slack: travelers_coord
victorops: profserv-14
tags:
brandId: travelers
-
alert:
sensu:
slack: xmp-sea-automations
tags:
brandId: uhcdr
-
alert:
sensu:
slack: xmp-sea-automations
tags:
brandId: uhcmr
-
alert:
sensu:
slack: xmp-sea-automations
tags:
brandId: uhcgm
-
alert:
sensu:
slack: TODO
tags:
brandId: uhg
-
alert:
sensu:
slack: xmp-sea-automations
tags:
brandId: uhg1
-
alert:
sensu:
slack: es-alerts
victorops: profserv
tags:
brandId: underarmour
-
alert:
sensu:
slack: es-alerts
victorops: profserv
tags:
brandId: unum
-
alert:
sensu:
slack: TODO
tags:
brandId: usaast3
-
alert:
sensu:
slack: xmp-sea-automations
tags:
brandId: usbank
-
alert:
sensu:
slack: es-alerts
victorops: profserv
tags:
brandId: uscd
-
alert:
sensu:
slack: xmp-seattle-3
victorops: xmp-seattle-3
tags:
brandId: walkersandbox

View File

@@ -0,0 +1,30 @@
#! /usr/bin/python3
# aom_builder.py
# point of the builder is to generate a valid yaml config that could be read in to the main app by
# asking for clarifying questions on what to check and how to alert on it
# this comes from 4 questions:
# When to query
# What to query for
# Whats an alert
# Who to Alert
from webapp import app
from library.logger import AlertLogging
from library.args import get_builder_args

# Module-level logger: console handler plus a dedicated builder log file.
# NOTE(review): this runs on import as well as direct execution.
log = AlertLogging('aom')
log.start()
log.start_log_file("logs/aom_builder.log")

if __name__ == "__main__":
    # GET ARGS AND START LOGGING
    args = get_builder_args()
    # logger.init("logs/aom_builder.log", args['log_level'])
    # aom_logger = logging.getLogger(__name__)
    log.info("Logger Initialized")
    # ENABLE SESSIONS TO KEEP YAML FILE STATE BETWEEN PAGES
    log.info("Starting webapp")
    # NOTE(review): debug=True enables Flask's interactive debugger/reloader.
    # Fine for the local generator UI, but must not be exposed publicly.
    app.run(host='localhost', port=args['port'], debug=True)

View File

@@ -0,0 +1,16 @@
#!/bin/bash
# Build and run the AOM config-generator webapp in Docker, tailing its logs.
# Ctrl+C stops the container and removes it so the name is free next run.

trap ctrl_c INT

function ctrl_c() {
    docker stop aom_web
    # FIX: replaced the fragile `docker ps -a | awk | grep | awk | xargs`
    # pipeline with a direct remove of the named container (best-effort).
    docker rm aom_web 2>/dev/null
}

docker build -f Dockerfile.webapp -t aom_web . && \
docker run -d -v "$(pwd)":/web -p 80:5000 --name aom_web aom_web && \
docker logs -f aom_web

View File

View File

@@ -0,0 +1,84 @@
# Contains the arg parser options.
import argparse
import sys
def get_builder_args():
    """
    Gets the arguments passed in to the aom_builder main call.

    :return: dict mapping argument names to parsed values
    """
    # FIX: added the missing spaces between the concatenated help/description
    # string fragments (e.g. "alertyou" -> "alert you") and corrected the
    # "Kariosdb" typo in the --query help text.
    parser = argparse.ArgumentParser(description="Generates a valid yaml file for alerting on metrics. "
                                                 "If you are familiar with the yaml structure for an alert "
                                                 "you don't have to use this builder, it's just convenient")
    parser.add_argument('-q', '--query', help="The KairosDB query string to use")
    parser.add_argument('-i', '--interval', type=int, default=60, help="The interval that the check will run. "
                                                                       "This value is in seconds")
    parser.add_argument('-t', '--threshold', '--upperthreshold', help="The upper threshold is the value that when reached will cause an alert "
                                                                      "depending on the threshold logic. "
                                                                      "Use in conjunction with lower threshold to define a normal band.")
    parser.add_argument('-b', '--lowerthreshold', help="The lower threshold is the value that when reached will cause an alert "
                                                       "depending on the threshold logic. "
                                                       "Use in conjunction with upper threshold to define a normal band.")
    parser.add_argument('-m', '--measure', choices=['gt', 'lt', 'eq'], help="The measure to use to compare the "
                                                                            "threshold to the values of the alerts")
    parser.add_argument('-a', '--alert_config', help='A valid Yaml representation of your alerting block')
    parser.add_argument('-l', '--log_level', type=int, default=0, help="The log level for the aom_builder run. "
                                                                       "[0=Error, 1=Info, 2=Debug]")
    parser.add_argument('-p', '--port', type=int, default=8080, help="The port to run the webapp on")
    return args_to_dict(parser)
def get_tester_service_args():
    """
    Gets arguments passed into aom_tester.py.

    Returns: dict mapping argument names to parsed values
    """
    parser = argparse.ArgumentParser(description="Parameters to start the alerting on metrics dummy tester service")
    # FIX: added the missing space before "[0=Error..." in the concatenated
    # help string (previously rendered as "app[0=Error, ...]").
    parser.add_argument('-l', '--log_level', type=int, default=0, help="The log level for the aom_service app "
                                                                       "[0=Error, 1=Info, 2=Debug]")
    parser.add_argument('-a', '--alert_configs', default=None,
                        help="If provided will override the folder location read from the config with the value passed "
                             "in. Is helpful for testing and troubleshooting alerts")
    parser.add_argument('--hostname', help="If provided, will override the actual hostname check with this value")
    parser.add_argument('-p', '--port', type=int, default=8080, help="The port to run the webapp on")
    return args_to_dict(parser)
def get_service_args():
    """
    Gets arguments passed into aom_service.py.

    Returns: dict mapping argument names to parsed values
    """
    parser = argparse.ArgumentParser(description="Parameters to start the alerting on metrics service")
    # FIX: added the missing space before "[0=Error..." in the concatenated
    # help string (previously rendered as "app[0=Error, ...]").
    parser.add_argument('-l', '--log_level', type=int, default=0, help="The log level for the aom_service app "
                                                                       "[0=Error, 1=Info, 2=Debug]")
    parser.add_argument('-a', '--alert_configs', default=None,
                        help="If provided will override the folder location read from the config with the value passed "
                             "in. Is helpful for testing and troubleshooting alerts")
    parser.add_argument('-o', '--override', action='store_true', help="Overrides the check leader election value")
    parser.add_argument('--hostname', help="If provided, will override the actual hostname check with this value")
    parser.add_argument('-p', '--port', type=int, default=8080, help="The port to run the webapp on")
    return args_to_dict(parser)
def args_to_dict(parsed_args):
    """
    Parse the command line with the given parser and return the result as a dict.

    Args:
        parsed_args: a fully-configured argparse.ArgumentParser (not yet parsed)

    Returns:
        Dictionary mapping argument names to their parsed values
    """
    try:
        # vars() exposes the Namespace's attributes directly; copy into a new
        # dict so callers can't mutate the Namespace through the return value.
        # (Replaces the manual getattr loop.)
        return dict(vars(parsed_args.parse_args()))
    except argparse.ArgumentError:
        # NOTE(review): parse_args() reports user errors via SystemExit, not
        # ArgumentError, so this branch is effectively dead -- kept for
        # backward compatibility with the original behavior.
        parsed_args.print_help()
        sys.exit(1)

View File

@@ -0,0 +1,22 @@
# config.py
import logging
import glob
import yaml
logger = logging.getLogger(__name__)
def glob_the_configs(config_path):
    """
    Load every ``*.yaml`` alert config found directly under *config_path*.

    Args:
        config_path (string): relative path to the configs

    Returns:
        List of configs (one parsed YAML document per file)
    """
    alert_list = []
    for config_file in glob.glob(config_path + "/*.yaml"):
        logger.debug("Found {} config".format(config_file))
        # LOAD CONFIG
        # FIX: yaml.load() without an explicit Loader is deprecated and can
        # construct arbitrary Python objects from untrusted input; safe_load
        # restricts parsing to plain YAML. Also use a context manager so the
        # file handle is closed instead of leaked.
        with open(config_file, 'rb') as fh:
            alert_list.append(yaml.safe_load(fh.read()))
    logger.info("Loaded {} configs".format(len(alert_list)))
    return alert_list

View File

@@ -0,0 +1,118 @@
# logger.py
import logging
import logging.handlers
import os
logging.getLogger('requests').setLevel(logging.ERROR)
logging.getLogger('urllib3').setLevel(logging.ERROR)
logging.getLogger('werkzeug').setLevel(logging.ERROR)
class SingleLevelFilter(logging.Filter):
    """Filter that keys purely on one exact logging level.

    With ``reject=False`` a handler emits only records whose level equals
    ``passlevel``; with ``reject=True`` it emits every level *except* it.
    """

    def __init__(self, passlevel, reject):
        """
        Remember the target level and the pass/reject mode.

        @param passlevel (int) - the numeric logging level to match
        @param reject (bool) - True drops matching records, False keeps only them
        """
        self.passlevel = passlevel
        self.reject = reject

    def filter(self, record):
        """
        Decide whether *record* should be emitted.

        @param record - the LogRecord being considered by the handler
        @return bool - True to emit the record, False to suppress it
        """
        matches = (record.levelno == self.passlevel)
        return not matches if self.reject else matches
class AlertLogging(logging.Logger):
    """
    Class Object to handle the logging of the alert on metrics service
    starts at Error level and can flip on (and add) an additional log file and
    Debug logger as needed.
    """

    def __init__(self, name):
        """
        Inits the formaters and logger.

        Args:
            name: logger name (e.g. 'aom')
        """
        self.name = name
        # Verbose format for file/debug output -- includes module and line number.
        self.debug_formatter = logging.Formatter(
            "%(asctime)s - [%(levelname)s] - [%(module)s:%(lineno)d] - %(message)s", "%m-%d %H:%M:%S")
        # Leaner format for console output.
        self.standard_formatter = logging.Formatter("%(asctime)s - [%(levelname)s] - %(message)s",
                                                    "%m-%d %H:%M:%S")
        # NOTE(review): this creates/returns the root logger but discards the
        # result -- appears to be a no-op; confirm before removing.
        logging.getLogger()
        logging.Logger.__init__(self, name, logging.DEBUG)
        # NOTE(review): setLoggerClass here only affects loggers created
        # *after* this point -- presumably so subsequent logging.getLogger()
        # calls yield AlertLogging instances; verify that is intended.
        logging.setLoggerClass(AlertLogging)

    def start(self):
        """
        Attach a console (stream) handler at INFO level using the standard
        formatter.

        Returns:
            self, so construction and start-up can be chained.
        """
        info_handler = logging.StreamHandler()
        info_handler.setLevel(logging.INFO)
        info_handler.setFormatter(self.standard_formatter)
        self.addHandler(info_handler)
        return self

    def start_log_file(self, file_path, mode='a'):
        """
        Creates a separate log file handler at DEBUG level.

        Creates the parent directory of *file_path* if it does not exist.

        Args:
            file_path: path to the log file
            mode: the type of mode to open the file handler with
        Returns:
            None
        """
        self.log_path = file_path
        work_folder = os.path.dirname(file_path)
        if len(work_folder) > 0 and not os.path.exists(work_folder):
            os.makedirs(work_folder)
        self.log_handler = logging.FileHandler(file_path, mode)
        self.log_handler.setLevel(logging.DEBUG)
        self.log_handler.setFormatter(self.debug_formatter)
        self.addHandler(self.log_handler)

    def stop_log_file(self):
        """
        Closes Log file and sets the handler to None.

        Returns:
            None
        """
        self.log_handler.close()
        self.removeHandler(self.log_handler)
        self.log_handler = None

    def start_debug(self):
        """
        Attach a console handler that emits ONLY DEBUG-level records
        (filtered via SingleLevelFilter), using the verbose formatter.

        Returns:
            None
        """
        self.debug_handler = logging.StreamHandler()
        self.debug_handler.setLevel(logging.DEBUG)
        self.debug_handler.addFilter(SingleLevelFilter(logging.DEBUG, False))
        self.debug_handler.setFormatter(self.debug_formatter)
        self.addHandler(self.debug_handler)

    def stop_debug(self):
        """
        Detach the debug console handler and set it to None.

        Returns:
            None
        """
        self.removeHandler(self.debug_handler)
        self.debug_handler = None

View File

@@ -0,0 +1,42 @@
#!/bin/bash
# Build the alert-on-metrics-configs image and push it to the V2 registry,
# tagged with both the current git commit and "latest".
GIT_COMMIT=$(git rev-parse HEAD)
if [[ -z "$GIT_COMMIT" ]]; then
    echo "--Missing required GIT_COMMIT var. Aborting..."
    exit 1
fi

# Setup useful vars
team="engvis"
app="alert-on-metrics-configs"
registryV2="registry-app.eng.qops.net:5001"
pathV2="${registryV2}/${team}/${app}"
commitV2="${pathV2}:${GIT_COMMIT}"
latestV2="${pathV2}:latest"

# In case you use relative paths.
# FIX: ${BASH_SOURCE[0]} must be braced -- $BASH_SOURCE[0] expands the
# variable's first element and then appends a literal "[0]". Quoting also
# protects against paths with spaces.
DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
cd "$DIR"

echo "--Publishing $app $GIT_COMMIT"
echo "--Removing old image, so they don't accumulate"
# Best-effort: this runs before `set -e`, so a missing image is not fatal.
docker rmi "$latestV2"

# Now fail if anything doesn't work
set -e
if [ -f "$app/build.sh" ]; then
    echo "--Running pre build steps"
    "$app/build.sh"
fi
docker build --pull=true --tag="$commitV2" --tag "$latestV2" .
echo "--Publishing app container"
docker push "$commitV2"
docker push "$latestV2"

6
AoM_Service/AoM_Configs/run.sh Executable file
View File

@@ -0,0 +1,6 @@
#!/bin/sh
# Sync the baked-in alert configs and routing lookups into the shared
# mountpoint consumed by the AOM service; --delete removes stale files so the
# mountpoint mirrors the image contents exactly.
rsync -a --delete /alert_configs/ /mountpoint/configs/git/
rsync -a --delete /alert_routing_lookup/ /mountpoint/alert_routing_lookup/
# List the synced configs in the container log as a visibility/debugging aid.
ls -l /mountpoint/configs/git/

View File

@@ -0,0 +1,5 @@
#!/bin/ash
# Container entrypoint: run the Flask config-generator UI from /web.
export FLASK_APP=/web/aom_webapp.py
# NOTE(review): FLASK_DEBUG=1 enables the interactive debugger and reloader --
# acceptable for the local generator UI, but must not be exposed publicly.
export FLASK_DEBUG=1
cd /web; flask run --host=0.0.0.0

View File

@@ -0,0 +1,25 @@
#=======================#
# All them URLS and tokens
#=======================#
kairosdb_url: "http://kairosdb-metrics.service.eng.consul:8080/"
victorops_url: "https://alert.victorops.com/integrations/generic/20131114/alert/07f108fe-9183-45c3-a888-19e1432806c5/"
slack_url: "https://slack.com/api/chat.postMessage"
# SECURITY NOTE(review): a live Slack bot token is committed here in
# plaintext -- it should be rotated and injected from a secret store rather
# than stored in the repo.
slack_token: "xoxb-76976722775-WY6vtKAk0SQEb8qcbFkLMV81"
smtp_server: "internal-smtp1-app.eng.qops.net:2525"
consul_url: "http://consul1-app.eng.qops.net:8500/v1/kv/service/alert-on-metrics/leader-lock"
sensu_endpoint: "https://sensu-api.eng.qops.net:443/results"
#=======================#
# Logging Information
#=======================#
log_path: "logs/aom_service.log"
#=======================#
# alerts folder
#=======================#
alert_folder: "alert_configs"
#=======================#
# request timeout value
#=======================#
# NOTE(review): presumably seconds for the HTTP requests above -- confirm.
timeout: 90

View File

@@ -0,0 +1,104 @@
# Graph an alert config's KairosDB query results together with the alert's
# thresholds, so thresholds can be sanity checked against real data.
#
# Usage: graph.py <alert_config_name> [intervals]
#   alert_config_name  name of a file in alert_configs/ (without the .yaml)
#   intervals          how many check intervals of history to plot (default 10)
import glob
import yaml
import json
import os
import sys
import time
import re
import requests
import numpy
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime
import random
import warnings
warnings.filterwarnings("ignore")

timeout = 180
# if no argument print help and exit
if len(sys.argv) == 1:
    print("You need to specify an alert config file.")
    exit(1)
config_file = 'alert_configs/' + sys.argv[1] + '.yaml'
# fail early with a clear message when the config file does not exist
if not os.path.isfile(config_file):
    print("Alert config not found: {}".format(config_file))
    exit(1)
with open(config_file, 'rb') as fh:
    # safe_load: alert configs are plain data, no python object construction needed
    alert_config = yaml.safe_load(fh.read())
# We will show 10 intervals by default; the optional second argument overrides it
interval = int(sys.argv[2]) if len(sys.argv) == 3 else 10
# stretch the query's relative start time so it covers the requested history
alert_config['query']['start_relative']['value'] = str(
    int(alert_config['query']['start_relative']['value']) * interval)
kairosdb_url = "http://kairosdb-metrics.service.eng.consul:8080/"
query_url = kairosdb_url + "api/v1/datapoints/query"
# BUG FIX: timeout must be passed as a keyword argument -- positionally it
# lands in requests.post's `json=` parameter and the request never times out
ret = requests.post(query_url, data=json.dumps(alert_config['query']), timeout=timeout)
results = ret.json()['queries'][0]['results']
# Plot every grouping returned by the query as its own randomly colored line
for result in results:
    for value in result['values']:
        # transform the epoch-millisecond timestamp to a human readable string,
        # then to a datetime object for matplotlib's date plotting
        value[0] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(value[0] / 1000))
        value[0] = datetime.datetime.strptime(value[0], '%Y-%m-%d %H:%M:%S')
    series = numpy.array(result['values'])
    label_str = str(result['group_by'][0].get('group', ''))
    line_color = tuple(numpy.random.random(size=3))
    plt.plot_date(series[:, 0], series[:, 1], marker='.', color=line_color, linestyle='-', label=label_str)
formatter = mdates.DateFormatter('%H:%M:%S')
ax = plt.subplot()
ax.xaxis.set_major_formatter(formatter)
plt.title(sys.argv[1])
plt.legend()
# threshold keys look like warning_upper_threshold; occurrences_threshold is a
# repeat count rather than a metric value, so it is excluded from the graph
myRe = re.compile('^(?!occurrences).*_threshold$')
# Adding thresholds to the graph as dashed horizontal lines
for key in alert_config:
    if myRe.match(key):
        plt.axhline(y=float(alert_config[key]), color='r', linestyle='--', label=str(key))
        # NOTE(review): `series` here is whichever grouping the plot loop ended
        # on; it only supplies an x position for the threshold label text
        plt.text(series[0][0], float(alert_config[key]), key)
plt.gcf().autofmt_xdate(rotation=25)
plt.show()

View File

@@ -0,0 +1,30 @@
#!/bin/bash
# Merge-request gate: a commit may only be auto-merged when EVERY changed file
# lives under alert_configs/ or alert_routing_lookup/, and any alert id it
# introduces is unique across all alert configs.
set -x
if [ -z "$GIT_COMMIT" ]; then
    echo "Expected env var 'GIT_COMMIT' to be set. Exiting..."
    exit 1
fi
echo "Check that only alert configs are being pushed"
echo "$PWD"
for file in $(git diff-tree -r --name-only "${GIT_COMMIT}^1" "${GIT_COMMIT}"); do
    # the id line ("id: <name>") must appear in at most one config file
    new_id=$(grep '^id:' "$file")
    if [ -n "$new_id" ]; then
        # -F: the id line contains a ':' -- match it literally, not as a regex
        total_id=$(grep -F "$new_id" alert_configs/*.yaml | wc -l)
        if [ "$total_id" -gt 1 ] ; then
            echo "Duplicated id found! Please update the id of the alert configuration"
            exit 1
        fi
    fi
    dir=$(dirname "${file}")
    # BUG FIX: the original exited 0 on the FIRST allowed file, so a commit
    # mixing allowed and disallowed paths slipped through. Now every changed
    # file is inspected and any file outside the allowed dirs fails the check.
    if [ "$dir" != "alert_configs" ] && [ "$dir" != "alert_routing_lookup" ] ; then
        echo "Only automatic merges allowed for alert config files"
        exit 1
    fi
done
echo "Good to merge"
exit 0

View File

@@ -0,0 +1,60 @@
import yaml
import glob
if __name__ == "__main__":
alert_list = []
bad_alert_list = []
print("Collecting all yaml configs")
# COLLECT CONFIG FILES
for config_file in glob.glob("./alert_configs/*.yaml"):
print("Found {} config".format(config_file))
alert_list.append(config_file)
print("Collecting all yaml configs")
# PARSE CONFIG FILES AND VALIDATE THEIR VALUES
for alert in alert_list:
print("Validating file {}".format(alert))
try:
config = yaml.load(open(alert, 'rb').read())
assert len(config['alerts']) > 0, "No Alerts configured, this is a dead config"
assert len(config['query']) > 0, "No Query, this is a dead config"
assert config['interval'] >= 30, "Intervals less than 30 are invalid"
assert len(config['id']) > 0, "Alert ID is empty, this is a dead config"
if config.get('query_type') == 'prometheus':
assert type(config['query']) is str, "Invalid Prometheus query"
assert "$" not in config['query'], "Prometheus query should not contain variables"
else:
assert type(config['query']) is dict, "Kairosdb Query string cannot be validated as proper JSON"
defined_tags = set(config['query']['metrics'][0]['tags'].keys()).union({'','dc','fqdn'})
# IF THERE IS AGGREGATION WE HAVE TO ADD THESE TAGS
if 'group_by' in config['query']['metrics'][0]:
defined_tags.update(set(config['query']['metrics'][0]['group_by'][0]['tags']))
# for undefined_tag in set(config['tags']).difference(defined_tags):
# print("WARNING! {} tag is not defined on the query. Please make sure it does exist to "\
# "prevent empty results".format(undefined_tag))
# OUR MINIMUM THRESHOLD NEED
assert 'critical_lower_threshold' in config or 'critical_upper_threshold' in config or \
'warning_lower_threshold' in config or 'warning_upper_threshold' in config, \
"Config must have at least one threshold set."
# JUST MAKE SURE YOU ARE NOT DOING SOMETHING STUPID WITH WARNING COMING AFTER CRITICAL
if 'warning_lower_threshold' in config and 'critical_lower_threshold' in config:
assert config['critical_lower_threshold'] < config['warning_lower_threshold'], \
"Lower Critical must be less than Lower Warning"
if 'warning_upper_threshold' in config and 'critical_upper_threshold' in config:
assert config['critical_upper_threshold'] > config['warning_upper_threshold'], \
"Upper Critical must be greater than Upper Warning"
if 'occurrences_threshold' in config:
assert config['occurrences_threshold'] >= 1, \
"Having an occurrences value less than 2 is assumed and pointless to specify"
except Exception as e:
print("Invalid config file: {}\n{}".format(alert, str(e)))
bad_alert_list.append("{}\n{}".format(alert, str(e)))
# WRITE OUT BAD CONFIGS TO THE RESULTS FILE
# with open("./results/test_results.log", "w+") as f:
# for alert in bad_alert_list:
# f.write("Config is bad: {}".format(alert.replace('\n', ' ')))
for alert in bad_alert_list:
print("Config is bad: {}".format(alert.replace('\n', ' ')))
if bad_alert_list:
exit(1)

View File

@@ -0,0 +1,7 @@
import os

from flask import Flask, render_template, request, session

# Flask application object for the AoM yaml builder web UI.
app = Flask(__name__)
app.config['SESSION_TYPE'] = 'filesystem'
# Allow the secret key to be supplied via the environment; fall back to the
# historical hardcoded value so existing deployments keep working.
# NOTE(review): override this in production -- a committed secret key makes
# session cookies forgeable.
app.config['SECRET_KEY'] = os.environ.get('AOM_SECRET_KEY', 'super secret key')

# Imported after `app` is created so the views module can use it at import
# time (the usual small-Flask-app circular import pattern).
import webapp.views

View File

@@ -0,0 +1,139 @@
import yaml
import os
import json
import traceback
import sys
from library.logger import AlertLogging
# Module-level logger shared by the render helpers in this file.
logger = AlertLogging('aom')
logger.start()
def render_config(config):
    """
    Reads in the config dict and renders to file. config usually from web interface

    Args:
        config: The config (form field dict) to use to generate the yaml file

    Returns:
        Tuple of (0, rendered yaml string) if successful, or (1, error text).
    """
    try:
        # GET THE NAME OF THE FILE FROM THE CONFIG
        file_name = ''.join([config['alert_name'], '.yaml'])
        logger.debug("Filename: {}".format(file_name))
        # THIS SHOULD BE A PARAMETER PASSED IN
        file_path = os.path.join('alert_configs', file_name)
        logger.debug("Full path: {}".format(file_path))
        # SANITIZE THE CONFIG TO A NEW OBJECT; interval is clamped to >= 30s
        yaml_config = {'alerts': {},
                       'id': config['alert_name'],
                       'interval': 30 if int(config['interval']) < 30 else int(config['interval'])}
        # SPLIT THE ALERT DESTINATIONS INTO LISTS, dropping empty entries
        if 'vo' in config:
            yaml_config['alerts']['vo'] = [x for x in config['vo_list'].split(',') if x]
        if 'email' in config:
            yaml_config['alerts']['email'] = [x for x in config['email_list'].split(',') if x]
        if 'slack' in config:
            yaml_config['alerts']['slack'] = [x for x in config['slack_list'].split(',') if x]
        # GET THRESHOLDS AS FLOATS; empty form fields are skipped.
        # 'critical_threshold'/'warning_threshold' are legacy field names that
        # map onto the upper thresholds.
        # BUG FIX: the legacy critical branch previously read
        # config['critical_upper_threshold'] while only 'critical_threshold'
        # was guaranteed present, raising KeyError.  Also replaced the
        # `is not ""` identity comparisons (SyntaxWarning on 3.8+) with !=.
        if 'critical_threshold' in config:
            if config['critical_threshold'] != "":
                yaml_config['critical_upper_threshold'] = float(config['critical_threshold'])
        if 'critical_upper_threshold' in config:
            if config['critical_upper_threshold'] != "":
                yaml_config['critical_upper_threshold'] = float(config['critical_upper_threshold'])
        if 'warning_threshold' in config:
            if config['warning_threshold'] != "":
                yaml_config['warning_upper_threshold'] = float(config['warning_threshold'])
        if 'warning_upper_threshold' in config:
            if config['warning_upper_threshold'] != "":
                yaml_config['warning_upper_threshold'] = float(config['warning_upper_threshold'])
        if 'critical_lower_threshold' in config:
            if config['critical_lower_threshold'] != "":
                yaml_config['critical_lower_threshold'] = float(config['critical_lower_threshold'])
        if 'warning_lower_threshold' in config:
            if config['warning_lower_threshold'] != "":
                yaml_config['warning_lower_threshold'] = float(config['warning_lower_threshold'])
        # 'occurrences' is the checkbox; the actual count lives in
        # 'occurrences_threshold'
        if 'occurrences' in config:
            yaml_config['occurrences_threshold'] = int(config['occurrences_threshold'])
        # PARSE THE QUERY OUT INTO A DICT OBJECT
        if config['prometheus_query']:
            yaml_config['query_type'] = 'prometheus'
            yaml_config['prometheus_url'] = config['prometheus_url']
            yaml_config['query'] = config['prometheus_query']
            yaml_config['start_time'] = config['start_time']
            yaml_config['end_time'] = config['end_time']
        else:
            yaml_config['query_type'] = 'kairosdb'
            yaml_config['query'] = json.loads(config['kairosdb_query'])
        # GET THE TAGS, COMMA SEPARATED, dropping empty entries
        yaml_config['tags'] = [x for x in config['tags'].split(',') if x]
        # GET THE URL
        yaml_config['url'] = config['url']
        # WRITE TO FILE
        yaml_str = yaml.dump(yaml_config, default_flow_style=False, explicit_start=True)
        with open(file_path, 'w') as f:
            f.write(yaml_str)
        return 0, yaml_str
    except json.decoder.JSONDecodeError:
        # BUG FIX: format_stack() reported the current call stack, not the
        # parse failure; format_exc() carries the actual JSON error
        return 1, "Query string is not valid json: {}".format(traceback.format_exc())
    except Exception as e:
        logger.error("Unable to render yaml config file to disk")
        _, _, ex_traceback = sys.exc_info()
        return 1, render_traceback(e, ex_traceback)
def render_yaml(alert_id):
    """
    Reads in a yaml config file into the dict that the web form expects.

    Args:
        alert_id: the name of the config (filename without the .yaml suffix)

    Returns:
        Dictionary of form field values.
    """
    file_name = ''.join([alert_id, '.yaml'])
    file_path = os.path.join('alert_configs', file_name)
    # safe_load (configs are plain data) + context manager (don't leak the
    # file handle, the original left it open)
    with open(file_path, 'r') as fh:
        config = yaml.safe_load(fh.read())
    yaml_config = dict()
    yaml_config['alert_name'] = config['id']
    yaml_config['interval'] = config['interval']
    # Legacy 'critical_threshold'/'warning_threshold' keys map onto the upper
    # thresholds; when both legacy and explicit keys exist the explicit one
    # wins because it is copied later (same order as the original code).
    for src, dst in (('critical_threshold', 'critical_upper_threshold'),
                     ('critical_upper_threshold', 'critical_upper_threshold'),
                     ('critical_lower_threshold', 'critical_lower_threshold'),
                     ('warning_threshold', 'warning_upper_threshold'),
                     ('warning_upper_threshold', 'warning_upper_threshold'),
                     ('warning_lower_threshold', 'warning_lower_threshold'),
                     ('occurrences_threshold', 'occurrences_threshold')):
        if src in config:
            yaml_config[dst] = config[src]
    yaml_config['url'] = config['url']
    # Each configured alert channel gets its checkbox flag ('on') plus a
    # comma separated destination list for the form.
    for channel in ('email', 'vo', 'slack'):
        if channel in config['alerts']:
            yaml_config[channel] = 'on'
            yaml_config[channel + '_list'] = ','.join(config['alerts'][channel])
    if 'tags' in config:
        yaml_config['tags'] = ','.join(config['tags'])
    if config.get('query_type') == 'prometheus':
        yaml_config['prometheus_query'] = config['query']
        yaml_config['prometheus_url'] = config['prometheus_url']
        yaml_config['start_time'] = config['start_time']
        yaml_config['end_time'] = config['end_time']
    else:
        # pretty-print the kairosdb query so it is editable in the form
        yaml_config['kairosdb_query'] = json.dumps(config['query'], sort_keys=True, indent=4,
                                                   separators=(',', ': '))
    return yaml_config
def render_traceback(ex, ex_traceback):
    """Format an exception plus its traceback into a single newline-joined string.

    Also logs the currently handled exception (callers invoke this from
    inside an ``except`` block).
    """
    logger.exception("Exception")
    formatted = traceback.format_exception(ex.__class__, ex, ex_traceback)
    return '\n'.join(formatted)

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,29 @@
body { font-family: sans-serif; background: #eee; }
a, h1, h2 { color: #377BA8; }
h1, h2 { font-family: 'Georgia', serif; margin: 0; }
h1 { border-bottom: 2px solid #eee; }
h2 { font-size: 1.2em; }
.page { margin: 2em auto; width: 45em; border: 5px solid #ccc;
        padding: 0.8em; background: white; }
.entries { list-style: none; margin: 0; padding: 0; }
.entries li { margin: 0.8em 1.2em; }
.entries li h2 { margin-left: -1em; }
.add-entry { font-size: 0.9em; border-bottom: 1px solid #ccc; }
.add-entry dl { font-weight: bold; }
.metanav { text-align: right; font-size: 0.8em; padding: 0.3em;
           margin-bottom: 1em; background: #fafafa; }
.flash { background: #CEE5F5; padding: 0.5em;
         border: 1px solid #AACBE2; }
.error { background: #F0D6D6; padding: 0.5em; }
/* BUG FIX: this rule was fenced with "/#" and "#/", which is not CSS comment
   syntax; the malformed text could make the parser drop following rules.
   Kept disabled (as the author intended) using a real CSS comment.
.button { border-top: 2px solid #a3ceda;
    border-left: 2px solid #a3ceda;
    border-right: 2px solid #4f6267;
    border-bottom: 2px solid #4F6267;
    padding: 1px 20px !important;
    font-size: 14px !important;
    background-color: #CEE5F5;
    font-weight: bold;
    color: #2d525d; }
*/
.container { width: 500px; clear: both;}

View File

@@ -0,0 +1,28 @@
{% extends "header.html" %}
{% block body %}
{# Confirmation page: echoes every submitted form field, then shows the yaml
   file that was rendered from them. #}
<h2>Form Elements</h2><br />
<table>
{% for key, value in query.items() %}
<tr>
<th> {{ key }} </th>
<td> {{ value }} </td>
</tr>
{% endfor %}
</table><br/>
<p>
{{ query.alert_name }}
</p>
<h2>Rendered Config File</h2><br />
<p>{{ file_path }}</p>
<p>
{# file_contents is rendered with |safe -- assumes the view escapes/controls
   these lines; confirm before feeding user input through here. #}
{% for line in file_contents %}
<div>{{ line|safe }}</div>
{% endfor %}
</p>
<br />
{# POST back to re_build so the form is repopulated with this alert's values #}
<form action="{{ url_for('re_build', alert_id=query.alert_name) }}" id="re_build" method="post">
<p>
<input type="submit" id="submit" class="btn btn-primary" value="Return to Form?">
</p>
</form>
{% endblock %}

View File

@@ -0,0 +1,6 @@
{% extends "header.html" %}
{% block body %}
{# Shown when render_config fails; `message` carries the error/traceback text #}
<h1>Error Rendering config:</h1>
<p>{{ message }}</p>
<p><a href="{{ url_for('index') }}">Return to Creation Page?</a></p>
{% endblock %}

View File

@@ -0,0 +1,67 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-COMPATIBLE" content="IE=edge">
<meta name="viewport" content="width=device-width, intial-scale=1">
<title>Alerting On Metrics Yaml Builder</title>
<link rel=stylesheet type=text/css href="{{ url_for('static', filename='bootstrap.min.css') }}">
<link rel="stylesheet" type=text/css href="{{ url_for('static', filename='style.css') }}">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js"></script>
<script src="{{ url_for('static', filename='bootstrap.min.js') }}"></script>
<script type="text/javascript">
    // Create (when checked) or remove (when unchecked) the text input that
    // holds the destination list for an alert channel checkbox (vo/email/slack).
    function dynInput(cbox) {
        console.log(cbox)
        if (cbox.checked) {
            var input = document.createElement("input");
            input.type = "text";
            input.id = cbox.name + "_list";
            input.name = cbox.name + "_list";
            document.getElementById("insertinputs_" + cbox.name).appendChild(input);
        } else {
            document.getElementById(cbox.name + "_list").remove();
        }
    }
    // Enable/disable the "<name>_list" input matching the checkbox state.
    function dynEnable(cbox) {
        console.log(cbox);
        var theId = "#" + cbox.name + "_list";
        console.log(theId);
        $(theId)[0].disabled = !cbox.checked;
    }
    // Enable/disable the "<name>_threshold" input matching the checkbox state.
    function dynThreshold(cbox) {
        var theId = "#" + cbox.name + "_threshold";
        $(theId)[0].disabled = !cbox.checked;
    }
    // Normalize alert names: lowercase, spaces become underscores.
    // BUG FIX: String.replace with a string pattern only replaces the FIRST
    // occurrence; a /g regex replaces every space.
    function forceLower(strInput){
        strInput.value = strInput.value.toLowerCase().replace(/ /g, "_");
    }
    // Turn space separated destination lists into comma separated ones
    // (same first-occurrence bug fixed with a global regex).
    function forceComma(strInput){
        strInput.value = strInput.value.replace(/ /g, ",");
    }
    // Occurrences must be at least 2 (a value of 1 is the implicit default).
    function forcePositive(strInput){
        if (parseInt(strInput.value) <= 1) {
            strInput.value = 2
        }
    }
</script>
</head>
<body>
<div class=page>
{% block body %}{% endblock %}

View File

@@ -0,0 +1,966 @@
{% extends "header.html" %}
{% block body %}
<form action="{{url_for('index')}}" id="builder" method="post" class="form-horizontal">
<div class="row">
<div class="col-sm-12">
<h3 class="text-center">Alert Meta</h3>
</div>
</div>
<!-- Alert Name -->
<div class="form-group">
<div class="col-sm-4">
<label for="alert_name" class="control-label">Alert Name:</label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#alertidModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="alertidModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="myModalLabel">Alert Name</h4>
</div>
<div class="modal-body">
<p>The alert name acts as both the name of the .yaml file and the id for the alert. The
alert name becomes part of what shows up in the title / subject when an alert is
triggered</p>
<p>Picking an alert name that already exists will overwrite the .yaml configuration file so
be aware of what you choose</p>
<p>The Alert name is also how this alert will show up in Victorops, Slack and Email
(Depending on what options you choose for the Alerting</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<input type="text" id="alert_name" class="form-control" name="alert_name" value="{{ alert_name }}"
onkeyup="return forceLower(this);">
</div>
</div>
<!-- Check Interval -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="interval">Check Interval: </label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#intervalModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="intervalModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="checkInterval">Check Interval</h4>
</div>
<div class="modal-body">
<p>The check interval is how often the check will run the query (in seconds) and measure the
results</p>
<p>Anything less than 30 seconds will automatically be bumped up
to 30 seconds. This is due to the fact that metrics are collected every 30 seconds, so
checking more often than this would just result in the same values returned from the
query
as nothing would have changed yet</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<input type="number" id="interval" class="form-control" name="interval" value="{{ interval }}">
</div>
</div>
<!-- Upper Critical Threshold -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="criticalUpperThreshold">Upper Critical Threshold: </label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#criticalUpperThresholdModal">
info
</button>
<!-- Modal -->
<div class="modal fade" id="criticalUpperThresholdModal" tabindex="-1" role="dialog"
aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="criticalUpperThresholdTitle">Critical Threshold</h4>
</div>
<div class="modal-body">
<p>This is a Floating Point or Int that when the results back from the query exceeds this
number, a critical alert will trigger.</p>
<p>Only Critical Alerts will also trigger emails and slack alerts (if set)</p>
<p>Your query needs to be simplified down to just one or two
values per grouping (A start and end metric). The alerting system will look at all
values per grouping and check if any of the values are over the threshold to send out an
alert</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-sm-7">
<input type="number" class="form-control" id="criticalUpperThreshold" name="critical_upper_threshold"
value="{{ critical_upper_threshold }}"
step="0.01"
onkeypress="validate(event)">
</div>
</div>
<!-- Lower Critical Threshold -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="criticalLowerThreshold">Lower Critical Threshold: </label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#criticalLowerThresholdModal">
info
</button>
<!-- Modal -->
<div class="modal fade" id="criticalLowerThresholdModal" tabindex="-1" role="dialog"
aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="criticalLowerThresholdTitle">Lower Critical Threshold</h4>
</div>
<div class="modal-body">
<p>This is a Floating Point or Int that when the results back from the query drops below this
number, a critical alert will trigger.</p>
<p>Only Critical Alerts will also trigger emails and slack alerts (if set)</p>
<p>Your query needs to be simplified down to just one or two
values per grouping (A start and end metric). The alerting system will look at all
values per grouping and check if any of the values are over the threshold to send out an
alert</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-sm-7">
<input type="number" class="form-control" id="lower_criticalThreshold" name="critical_lower_threshold"
value="{{ critical_lower_threshold }}"
step="0.01"
onkeypress="validate(event)">
</div>
</div>
<!-- Upper Warning Threshold -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="warningUpperThreshold">Upper Warning Threshold: </label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#warningUpperThresholdModal">
info
</button>
<!-- Modal -->
<div class="modal fade" id="warningUpperThresholdModal" tabindex="-1" role="dialog"
aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="warningUpperThresholdTitle">Upper Warning Threshold</h4>
</div>
<div class="modal-body">
<p>This is a Floating Point or Int that when the results back from the query exceeds this
number, a warning alert will trigger.</p>
<p>Warnings will not trigger Email or Slack alerts (if set)</p>
<p>Your query needs to be simplified down to just one or two
values per grouping (A start and end metric). The alerting system will look at all
values per grouping and check if any of the values are over the threshold to send out an
alert</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<div class="input-group">
<span class="input-group-addon">
{% if warning_upper_threshold %}
{% set warning_upper_checked='checked' %}
{% else %}
{% set warning_upper_disabled='disabled' %}
{% endif %}
<input type="checkbox" name="warning_upper" id="warning_upper" aria-label="..." onclick="dynThreshold(this);" {{
warning_upper_checked }}>
</span>
<input type="number" name="warning_upper_threshold" class="form-control" id="warning_upper_threshold"
value="{{ warning_upper_threshold }}"
aria-label="..." step="0.01" {{ warning_upper_disabled }}>
</div>
</div>
</div>
<!-- Lower Warning Threshold -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="warningLowerThreshold">Lower Warning Threshold: </label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#warningLowerThresholdModal">
info
</button>
<!-- Modal -->
<div class="modal fade" id="warningLowerThresholdModal" tabindex="-1" role="dialog"
aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="warningLowerThresholdTitle">Lower Warning Threshold</h4>
</div>
<div class="modal-body">
<p>This is a Floating Point or Int that when the results back from the query drops below this
number, a warning alert will trigger.</p>
<p>Warnings will not trigger Email or Slack alerts (if set)</p>
<p>Your query needs to be simplified down to just one or two
values per grouping (A start and end metric). The alerting system will look at all
values per grouping and check if any of the values are over the threshold to send out an
alert</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<div class="input-group">
<span class="input-group-addon">
{% if warning_lower_threshold %}
{% set warning_lower_checked='checked' %}
{% else %}
{% set warning_lower_disabled='disabled' %}
{% endif %}
<input type="checkbox" name="warning_lower" id="warning_lower" aria-label="..." onclick="dynThreshold(this);" {{
warning_lower_checked }}>
</span>
<input type="number" name="warning_lower_threshold" class="form-control" id="warning_lower_threshold"
value="{{ warning_lower_threshold }}"
aria-label="..." step="0.01" {{ warning_lower_disabled }}>
</div>
</div>
</div>
<!-- Occurrences -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="occurrences_threshold">Frequency: </label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#occurrencesModal">
info
</button>
<!-- Modal -->
<div class="modal fade" id="occurrencesModal" tabindex="-1" role="dialog"
aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="occurrencesTitle">Frequency</h4>
</div>
<div class="modal-body">
<p>The occurrences value, when set, will determine how many times the alert has to exceed the
threshold in order for an alert to trigger.</p>
<p>This is particularly useful for metrics that can be spikey and resolve quickly,
using occurrences allows you to only be alerted when a spike is no longer spiking but
maintaining the rate over the period of time</p>
<p>This is compared once every interval, so if your alert is set to 5 minutes, with a
occurrences of 3, you'd have to have the threshold exceeded for 15 minutes before any
alerts
are sent out.</p>
<p>The occurrences value is optional, and if not enabled, the service assumes that after 1 query
exceeding the threshold is enough to trigger alerts. So in this way having an occurrences value
set
to 1 or not enabled does the same thing.</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<div class="input-group">
<span class="input-group-addon">
{% if occurrences_threshold and occurrences_threshold is number and occurrences_threshold > 1 %}
{% set occurrences_checked='checked' %}
{% else %}
{% set occurrences_disabled='disabled' %}
{% endif %}
<input type="checkbox" name="occurrences" id="occurrences" aria-label="..."
onclick="dynThreshold(this);" {{
occurrences_checked }}>
</span>
<input type="number" name="occurrences_threshold" class="form-control" id="occurrences_threshold"
value="{{ occurrences_threshold }}"
aria-label="..." step="1" min="2" {{ occurrences_disabled }} onkeyup="return forcePositive(this);">
</div>
</div>
</div>
<!-- Tags -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="tags">Tags:</label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#tagsModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="tagsModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="tagsTitle">Tags</h4>
</div>
<div class="modal-body">
<p>A comma-separated list of tags to include in the alert subject</p>
<p>In the event of an alert, the tags will be used to look up distinctive
information and
include as part of the alert</p>
<p>For example including the dc tag in an alert means that if an alert occurs, the
alerting
system will look up the dc value from the returned query and included it as part
of the
alert subject</p>
<p>These are the same tag values used to build KairosDB queries</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<input type="text" name="tags" id="tags" class="form-control" value="{{ tags }}" ,
onkeyup="return forceComma(this);">
</div>
</div>
<div class="row">
<div class="col-sm-12">
<h3 class="text-center">Notifications</h3>
</div>
</div>
<!-- VictorOps Alerts -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="vo">VictorOps Alert:</label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#voModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="voModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="voTitle">Victor Ops Alert List</h4>
</div>
<div class="modal-body">
<p>A comma-separated list of VictorOps routing keys</p>
<p>In the event of an alert, the IDs listed here will receive a VictorOps alert</p>
<p>If the checkbox isn't selected, when generating the .yaml config the values
listed will
be ignored</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<div class="input-group">
<span class="input-group-addon">
{% if vo=="on" %}
{% set vo_checked='checked' %}
{% else %}
{% set vo_disabled='disabled' %}
{% endif %}
<input type="checkbox" name="vo" id="vo" aria-label="..." onclick="dynEnable(this);" {{ vo_checked
}}>
</span>
<input type="text" class="form-control" name="vo_list" id="vo_list" aria-label="..."
value="{{ vo_list }}" onkeyup="return forceComma(this);" {{ vo_disabled }}>
</div>
</div>
</div>
<!-- Email Alerts -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="email">Email Alert:</label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#emailModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="emailModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="emailTitle">Email Alert List</h4>
</div>
<div class="modal-body">
<p>A comma-separated list of email names to send alerts to</p>
<p>In the event of an alert, the names listed here will receive an email alert</p>
<p>The alerting system appends an @qualtrics.com to the names listed here, so there
is no
need to include the @domain as it's assumed all alerting emails would go to a
qualtrics
address</p>
<p>Also the SMTP server can only send to @qualtrics addresses anyway</p>
<p>For example sending an email to both netops and devops on an alert would be <b>devops,netops</b>
in the text box.</p>
<p>If the checkbox isn't selected, when generating the .yaml config the values
listed will
be ignored</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<div class="input-group">
<span class="input-group-addon">
{% if email=="on" %}
{% set email_checked='checked' %}
{% else %}
{% set email_disabled='disabled' %}
{% endif %}
<input type="checkbox" name="email" id="email" aria-label="..." onclick="dynEnable(this);" {{
email_checked }}>
</span>
<input type="text" name="email_list" class="form-control" id="email_list"
value="{{ email_list }}"
aria-label="..." onkeyup="return forceComma(this);" {{ email_disabled }}>
</div>
</div>
</div>
<!-- Slack Alert List -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="slack">Slack Alert:</label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#slackModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="slackModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="slackTitle">Slack Alert List</h4>
</div>
<div class="modal-body">
<p>A comma-separated list of Slack names to send alerts to</p>
<p>In the event of an alert, the names listed here will receive a Slack alert from a
slackbot</p>
<p>You must include a @ for direct message alerts and # for channel alerts</p>
<p>For example, if the DevOps team wanted to get an alert in slack, the value in the
text
box would be <b>#devops</b>.
If I wanted to also include a direct message as well then the value would be
<b>#devops,@codyc</b></p>
<p>Don't troll people with your metric alerts bombarding people's Slack; it's unkind</p>
<p>If the checkbox isn't selected, when generating the .yaml config the values
listed will
be ignored</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<div class="input-group">
<span class="input-group-addon">
{% if slack=="on" %}
{% set slack_checked='checked' %}
{% else %}
{% set slack_disabled='disabled' %}
{% endif %}
<input type="checkbox" name="slack" id="slack" aria-label="..." onclick="dynEnable(this);" {{
slack_checked }}>
</span>
<span id="insertinputs_slack"></span>
<input type="text" name="slack_list" class="form-control" id="slack_list"
value="{{ slack_list }}"
aria-label="..." onkeyup="return forceComma(this);" {{ slack_disabled }}>
</div>
</div>
</div>
<div class="row">
<div class="col-sm-12">
<h3 class="text-center">Dashboard</h3>
</div>
</div>
<!-- Dashboard URL -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="query">Dashboard URL:</label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#dashboardModal">
info
</button>
<!-- Modal -->
<div class="modal fade" id="dashboardModal" tabindex="-1" role="dialog"
aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="dashboardTitle">Dashboard URL</h4>
</div>
<div class="modal-body">
<p>Most queries are built based on some dashboard already built in Grafana</p>
<p>By including the URL to that dashboard, the on-call engineer receiving the alert
will be able to click the link in the alert and get a better picture of what
this alert is and how it relates to the datacenter</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-sm-7">
<input type="text" name="url" id="url" class="form-control" value="{{ url }}">
</div>
</div>
<div class="row">
<div class="col-sm-12">
<h3 class="text-center">Kairosdb Query</h3>
</div>
</div>
<!-- KairosDB Query -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="query">KairosDB Query:</label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#queryModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="queryModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="queryTitle">KairosDB Query</h4>
</div>
<div class="modal-body">
<p>Paste in your KairosDB Query that you have already worked out.</p>
<p>You can generate your query by going to the <a
href="http://kairosdb-metrics.service.eng.consul:8080/" target="_blank">KairosDB
UI
in eng</a></p>
<p>When generating your metric you will want to get the return values down to just 1
or 2
results per grouping. This can be done by sending the query to the MAX or MIN
aggregators (depending on your logic needs) as the last aggregator in the
query</p>
<p>You will also want to include a time offset, typically 5 minutes is used for when
to
start (as from 5 minutes ago to now). Setting the MAX aggregator to this value
is
usually typical</p>
<p>Once you have generated your query and it's returning the results you expect,
click the
<b>Show Query</b> button on the kairosDB UI and copy the results into this field
</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-sm-7">
<textarea name="kairosdb_query" id="kairosdb_query" class="form-control" rows="12" cols="50">{{ kairosdb_query }}</textarea>
</div>
</div>
<div class="row">
<div class="col-sm-12">
<h3 class="text-center">Prometheus Query</h3>
</div>
</div>
<!-- Prometheus URL -->
<div class="form-group">
<div class="col-sm-4">
<label for="prometheus_url" class="control-label">Prometheus URL:</label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#prometheusurlModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="prometheusurlModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="myModalLabel">Prometheus URL</h4>
</div>
<div class="modal-body">
<p>URL for the prometheus server</p>
<p>Shared, production Prometheus URLs are currently:
<ul>
<li>http://big-trickster.service.eng.consul:9090</li>
</ul>
</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<input type="text" id="prometheus_url" class="form-control" name="prometheus_url" value="{{ prometheus_url }}"
onkeyup="return forceLower(this);">
</div>
</div>
<!-- Prometheus Query -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="query">Prometheus Query:</label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#prometheusQueryModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="prometheusQueryModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="queryTitle">Prometheus Query</h4>
</div>
<div class="modal-body">
<p>Paste in your Prometheus Query that you have already worked out.</p>
<p>You can generate your query by going to the url of your prometheus endpoint. Eng Vis plans on adding a smart router for this in the future so all instances will be exposed via a single smart proxy, but for now you'll need to know the name. </p><p><a
href="http://big-trickster.service.eng.consul:9090/graph" target="_blank">Prometheus Host Metrics
UI
in eng</a>
</p><p>
<a
href="http://big-trickster.service.eng.consul:9090/graph" target="_blank">Prometheus StatsD and other Metrics
UI
in eng</a></p>
<p>When creating a query, keep in mind a single value returned is going to be
the most useful, so queries like "topk(1,yourmetrics)" are good choices.
However, if your query has multiple return values AOM will use the last value.</p>
<p>So if you use a step/duration of 60 and a timespan of 300 between start
and end you'll get back 5 values and the last will be used.
</p>
<p><a href="https://prometheus.io/docs/prometheus/latest/querying/functions/" target="_blank">Prometheus Functions</a></p>
<p>
<a href="https://prometheus.io/docs/prometheus/latest/querying/operators/" target="_blank">Prometheus Operators</a>
</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-sm-7">
<textarea name="prometheus_query" id="prometheus_query" class="form-control" rows="12" cols="50">{{ prometheus_query }}</textarea>
</div>
</div>
<!-- Start Time -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="start_time">Start Time: </label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#startTimeModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="startTimeModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="startTime">Start Time</h4>
</div>
<div class="modal-body">
<p>This should be a relative time in seconds like '-600' for 10m, defaults to '-300'</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<input type="text" id="start_time" class="form-control" name="start_time" value="{{ start_time }}">
</div>
</div>
<!-- End Time -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="end_time">End Time: </label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#endTimeModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="endTimeModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="endTime">End Time</h4>
</div>
<div class="modal-body">
<p>This can be 'now' (default) or some relative offset like '-30' in seconds</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<input type="text" id="end_time" class="form-control" name="end_time" value="{{ end_time }}">
</div>
</div>
<div class="row">
<div class="col-sm-12">
<h3 class="text-center">Actions</h3>
</div>
</div>
<!-- Load Config File -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="loadFile">Load Config From File:</label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#loadModal">info
</button>
<!-- Modal -->
<div class="modal fade" id="loadModal" tabindex="-1" role="dialog" aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="loadTitle">Load Config from file</h4>
</div>
<div class="modal-body">
<p>Load a config already generated to file into the UI</p>
<p>This is handy when you need to make minor changes to a query, or add additional
alerting
values or change thresholds. Or if you are just terrified of yaml.</p>
<p>Hit the drop down to see a list of all alert configs (the names generated from
the values
used in the Alert Name field) Hit the Go and the config will load into all the
fields</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<div class="input-group">
<select name="loadFile" id="loadFile" class="form-control">
<option value="" selected></option>
{% for f in alert_list %}
<option value="{{ f }}">{{ f }}</option>
{% endfor %}
</select>
<span class="input-group-btn">
<input type="submit" name="generate" id="submitFiles" class="btn btn-primary" value="Go">
</span>
</div>
</div>
</div>
<!-- Submit Form -->
<div class="form-group">
<div class="col-sm-4">
<label class="control-label" for="submit">Generate YAML:</label>
</div>
<div class="col-sm-1">
<!-- Button trigger modal -->
<button type="button" class="btn btn-info btn-xs" data-toggle="modal" data-target="#generateModal">
info
</button>
<!-- Modal -->
<div class="modal fade" id="generateModal" tabindex="-1" role="dialog"
aria-labelledby="myModalLabel">
<div class="modal-dialog" role="document">
<div class="modal-content">
<div class="modal-header">
<h4 class="modal-title" id="generateTitle">Generate Alert Config</h4>
</div>
<div class="modal-body">
<p>When you are ready to take the values in the form and generate a alert config
.yaml file,
hit the button</p>
<p>This will generate a .yaml file based on the alert name. So for example if one
was to
have the value <b>mcp_errors_per_dc</b> as an alert name, the resulting file
would be
<b>mcp_errors_per_dc.yaml</b></p>
<p>This <b>will</b> overwrite a .yaml file if the alert name is the same as an
already
existing file</p>
<p>If there are any errors generating the config, the resulting page will include
the error
message and give you the ability to return back to this page with your form
saved</p>
</div>
<div class="modal-footer">
<button type="button" class="btn btn-default" data-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
</div>
<div class="col-md-7">
<input type="submit" id="submit" name='generate' class='btn btn-primary' value="generate"
class="button">
</div>
</div>
</form>
{% endblock %}

View File

@@ -0,0 +1,4 @@
{% extends "header.html" %}
{% block body %}
<h2>Complete all values in the form below</h2>
{% endblock %}

View File

@@ -0,0 +1,69 @@
# views.py
import glob
import json
import os
import yaml
from flask import session
from library.logger import AlertLogging
from webapp import app, render_template, request, render
# Module-level logger shared by all view functions in this file; log output
# goes to logs/aom_service.log (relative to the service's working directory).
logger = AlertLogging('aom')
logger.start()
logger.start_log_file("logs/aom_service.log")
@app.route('/', methods=['GET', 'POST'])
def index():
    """Main form page.

    GET: render the alert-config form, pre-populated from the session if a
    previous POST failed to render.
    POST: either load an existing config back into the form (the "Go" button)
    or generate a new alert-config .yaml from the submitted form values.
    """
    logger.debug("Request Method: {}".format(request.method))
    if request.method == 'GET':
        # Build the sorted list of existing alert-config names (basename, no
        # extension) so the "Load Config From File" dropdown can be populated.
        # Use a context manager + safe_load: the bare yaml.load(...) on an
        # unclosed file handle is deprecated (and a TypeError in PyYAML >= 6).
        with open('service.yaml', 'r') as f:
            service_config = yaml.safe_load(f)
        alert_list = sorted(os.path.splitext(os.path.basename(x))[0] for x in
                            glob.glob(service_config['alert_folder'] + "/*.yaml"))
        if 'yaml_config' in session:
            # A previous render failed; restore the saved form values.
            return render_template('index.html', **json.loads(session['yaml_config']),
                                   alert_list=alert_list)
        return render_template('index.html', alert_list=alert_list)
    # POST path.
    logger.info("Got a form")
    if 'go' in request.form['generate'].lower():
        # "Go" button: load the selected saved config into the form.
        return re_build(request.form['loadFile'])
    yaml_config = dict()
    ret = ''
    try:
        for field_name, value in request.form.items():
            yaml_config[field_name] = value
        code, ret = render.render_config(yaml_config)
        # render_config signals failure via a non-zero code; ret then holds
        # the error text used in the AssertionError handler below.
        assert code == 0
        return render_template('debug.html', query=yaml_config,
                               file_path='alert_configs/{}.yaml'.format(yaml_config['alert_name']),
                               file_contents=ret.split('\n'))
    except AssertionError:
        # Save the form values so the user can return and fix the config.
        session['yaml_config'] = json.dumps(yaml_config)
        return render_template('error.html', message="Failed to render to file: {}".format(ret))
    except Exception as e:
        return render_template('error.html', message=str(e))
@app.route('/build/<alert_id>', methods=['POST'])
def re_build(alert_id):
    """Load an existing alert config and re-render the form pre-populated with it.

    alert_id is the config's basename (no extension); render.render_yaml turns
    it into the dict of form-field values passed through to the template.
    """
    config = render.render_yaml(alert_id)
    # Re-read the service config with a context manager + safe_load: the bare
    # yaml.load(...) on an unclosed handle is deprecated (TypeError in PyYAML >= 6).
    with open('service.yaml', 'r') as f:
        service_config = yaml.safe_load(f)
    alert_list = sorted(os.path.splitext(os.path.basename(x))[0] for x in
                        glob.glob(service_config['alert_folder'] + "/*.yaml"))
    return render_template('index.html', **config, alert_list=alert_list)
@app.route("/debug/")
def toggle_debug():
    """Flip the logger's debug handler on or off, then render the index page."""
    debug_is_on = bool(logger.debug_handler)
    if debug_is_on:
        logger.stop_debug()
        logger.info("Debug Stopped")
    else:
        logger.start_debug()
        logger.debug("Debug Started")
    # Fall through to the normal index view so the user lands back on the form.
    return index()

View File

@@ -0,0 +1,3 @@
requests
pyaml
flask