|
|
---|
blueprint: NorthStar (v5) and HealthBot (v2.0.2) | |
video | |
Log into the GUI |
|
NorthStar (NS) login | https://<ip@>:<port> ( admin // Juniper!11 ) |
HealthBot | SSH or https://jcluser ( jcluser // Juniper!11 ) |
Add Allowed network prefix | Commands menu >> Add Allowed network prefix >>> <enter ip address of client> |
CLI on Healthbot |
|
Start healthbot_listener.py ( Webhook + trigger NS to put the link in maintenance) | Log into ssh jcluser@healthbot ( jcluser // Juniper!1 ) It is a webhook that listens for HB (HealthBot) messages
Usage: jcluser@ubuntu:~$ ls healthbot-2.0.2-1.deb self_healing jcluser@ubuntu:~$ cd self_healing/ jcluser@ubuntu:~/self_healing$ ls healthbot_listener.py maintenance.j2 README.md RPMprobe.yml user_functions.py user_functions.pyc
jcluser@ubuntu:~/self_healing$ python healthbot_listener.py /usr/local/lib/python2.7/dist-packages/urllib3/connectionpool.py:1004: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings InsecureRequestWarning, * Serving Flask app "healthbot_listener" (lazy loading) * Environment: production WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead. * Debug mode: off * Running on http://0.0.0.0:10000/ (Press CTRL+C to quit) Code Block |
---|
|
title | HealthBot Listener script |
---|
collapse | true |
---|
|
# We need to import request to access the details of the POST request
from flask import Flask, request
from flask_restful import abort
import commands
import json
import pprint
import requests
import os
import user_functions
# Turn off urllib3 warnings issued through requests (presumably aimed at the
# InsecureRequestWarning raised for unverified HTTPS calls - TODO confirm).
requests.packages.urllib3.disable_warnings()
# Initialize the Flask application; the POST route for '/' is registered on this object.
app = Flask(__name__)
def _run_failure_simulation(rest_index_number, element_type, result_label):
    """Create a temporary simulation maintenance event and report the result.

    A 'for_simulation' maintenance payload is generated for the element,
    submitted through user_functions.create_maintenance(), and the outcome
    is read with user_functions.check_if_simulation_pass().  The temporary
    event is deleted afterwards even when the check raises, so a failed
    check does not leave the simulation event behind.

    Args:
        rest_index_number: index of the element, as returned by the
            user_functions lookup helpers.
        element_type: 'node' or 'link'; forwarded to
            user_functions.generate_maitenance_json().
        result_label: text printed in front of the simulation result.

    Returns:
        Whatever user_functions.check_if_simulation_pass() returned; the
        caller compares it with the string 'true'.
    """
    rest_payload = user_functions.generate_maitenance_json(
        rest_index_number, 'for_simulation', element_type)
    maintenance_event = user_functions.create_maintenance(rest_payload)
    maintenance_index = maintenance_event.json()['maintenanceIndex']
    try:
        check_simulation = user_functions.check_if_simulation_pass()
        print(result_label + check_simulation)
    finally:
        # The event only exists to drive the simulation: always remove it.
        user_functions.delete_maintenance(maintenance_index)
    return check_simulation


@app.route('/', methods=['POST'])
def app_message_post():
    """Handle an alert POSTed to '/' as JSON and react per playbook.

    The body must carry 'message', 'device-id' and 'keys._playbook_name'.
    Two playbooks are acted on:

    * "cpu_openconfig": when the message contains "exceeds high threshold",
      a failure simulation is run for the node and, if it returns 'true',
      a 'for_maint' maintenance event is created for that node.
    * "delay": same flow for the link resolved from
      'keys.source-address' when the message contains
      "exceeds delay threshold".

    Messages containing "is normal" are only logged.

    Returns:
        The JSON string '{"result": "OK"}' on success.

    Raises:
        An HTTP 400 via abort() when the Content-Type is not
        application/json or when any exception occurs while processing.
    """
    # Single-argument print(...) behaves the same on Python 2.7 and Python 3.
    print("################# Start #######################")
    if request.headers['Content-Type'] != 'application/json':
        abort(400, message="Expected Content-Type = application/json")
    try:
        data = request.json
        print(data)
        message = data['message']
        print("message " + message)
        playbook_name = data['keys']['_playbook_name']
        print("playbook_name " + playbook_name)
        device_id = data['device-id']

        if playbook_name == "cpu_openconfig":
            print("received cpu high alert")
            if "exceeds high threshold" in message:
                print('CPU HIGH UTIL DETECTED for ' + device_id)
                print('PERFORMING EXHUASTIVE LINK FAILURE SIMULATION for ' + device_id)
                rest_index_number = user_functions.get_node_info(device_id)
                check_simulation = _run_failure_simulation(
                    rest_index_number, 'node', "simulation result ")
                print("delete temp maintenace")
                if check_simulation == 'true':
                    print('CPU HIGH UTIL DETECTED PUT NODE UNDER MAINTENANCE::')
                    rest_payload = user_functions.generate_maitenance_json(
                        rest_index_number, 'for_maint', 'node')
                    print(rest_payload)
                    user_functions.create_maintenance(rest_payload)
                else:
                    print('CANNOT PUT ' + device_id + ' UNDER MAINTENANCE. EXHUASTIVE FAILURE SIMULATION NOT PASSED')
            elif "is normal" in message:
                print('CPU util back to normal. ')
        # Printed for every alert, whichever playbook sent it.
        print('###############################')

        if playbook_name == "delay":
            print("received delay alert")
            source_address = data['keys']['source-address']
            if "exceeds delay threshold" in message:
                print("HIGH DELAY DETECTED for " + device_id + " " + source_address)
                print("PERFORMING EXHUASTIVE LINK FAILURE SIMULATION for " + device_id + " " + source_address)
                rest_index_number = user_functions.get_link_info_from_ip(source_address)
                check_simulation = _run_failure_simulation(
                    rest_index_number, 'link', "SIMULATION RESULT ")
                if check_simulation == "true":
                    print("HIGH DELAY DETECTED PUT LINK UNDER MAINTENANCE::")
                    rest_payload = user_functions.generate_maitenance_json(
                        rest_index_number, 'for_maint', 'link')
                    print(rest_payload)
                    user_functions.create_maintenance(rest_payload)
                else:
                    print("CANNOT PUT " + device_id + " " + source_address + " UNDER MAINTENANCE. EXHUASTIVE FAILURE SIMULATION NOT PASSED")
            elif "is normal" in message:
                print("DELAY back to normal. ")
        # NOTE(review): the published listing lost its indentation; this closing
        # separator is assumed to run for every alert, like the one above - confirm.
        print("###############################")

        return json.dumps({'result': 'OK'})
    except Exception as e:
        # abort() raises, so nothing after this call would ever run.
        abort(400, message="Exception processing request: {0}".format(e))
if __name__ == '__main__':
    # Start Flask's built-in server only when this file is run as a script.
    # It listens on every interface, on the port the webhook is sent to.
    listen_port = int("10000")
    app.run(host="0.0.0.0", port=listen_port)
Create two Tunnels: PCEP and NETCONF |
|
| Tunnel 1 = jlk_4_to_1_PCEP ( Control type: PCEInitiated ) Path Computation Element Protocol Tunnel 2 = jlk_4_to_1_Netconf ( Control type: Device Controlled ) and Application Menu >> Path Optimization >> path optimization >> Enable + 1 minute
Image Added |
| Image Added
|
Set Path Optimization to run every minute |
Image Added
Image Added |
Display the Delay Tab |
Image Added
Image Added
|
Connect on the CentOS ( to create delay ) |
|
add delay | Log into the CentOS ( jcluser / Juniper!1 ) su - ( Juniper!1 ) ./add_delay.sh
[root@CentOS ~]# more add_delay.sh tc qdisc add dev eth1 root netem delay 300ms echo "300 ms delay added between vMX-5 and vMX-6" [root@CentOS ~]#
|
SSH to HealthBot |
|
healthbot_listener.py output HealthBot : monitors the RPM probe. HB >>> threshold crossed >>> changes the status on the HB + sends a "put the link in maintenance" request to NorthStar Code Block |
---|
|
|
jcluser@ubuntu:~/self_healing$ python healthbot_listener.py
/usr/local/lib/python2.7/dist-packages/urllib3/connectionpool.py:1004: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
InsecureRequestWarning,
* Serving Flask app "healthbot_listener" (lazy loading)
* Environment: production
WARNING: This is a development server. Do not use it in a production deployment.
Use a production WSGI server instead.
* Debug mode: off
* Running on http://0.0.0.0:10000/ (Press CTRL+C to quit)
################# Start #######################
{u'group': u'group_all', u'severity': u'major', u'keys': {u'_instance_id': u'["delay"]', u'source-address': u'7.105.106.1', u'_playbook_name': u'delay'}, u'device-id': u'vMX-5', u'rule': u'probe_delay', u'topic': u'probe-delay', u'trigger': u'probe_exceed', u'message': u'7.105.106.1 ge-0/0/3.0 delay is 303099 exceeds delay threshold 200000 us. '}
message 7.105.106.1 ge-0/0/3.0 delay is 303099 exceeds delay threshold 200000 us.
playbook_name delay
###############################
received delay alert
HIGH DELAY DETECTED for vMX-5 7.105.106.1
PERFORMING EXHUASTIVE LINK FAILURE SIMULATION for vMX-5 7.105.106.1
{
"topoObjectType": "maintenance",
"topologyIndex": 1,
"user": "admin",
"name": "created_for_simulation",
"startTime": "20200716T230300Z",
"endTime": "20200718T150300Z",
"elements": [
{
"topoObjectType": "link",
"index": 9
}
]
}
SIMULATION RESULT true
HIGH DELAY DETECTED PUT LINK UNDER MAINTENANCE::
{
"topoObjectType": "maintenance",
"topologyIndex": 1,
"user": "admin",
"name": "Healthbot-link-health-alert202007141103",
"startTime": "20200714T110400Z",
"endTime": "20200718T150300Z",
"elements": [
{
"topoObjectType": "link",
"index": 9
}
]
}
{
"topoObjectType": "maintenance",
"topologyIndex": 1,
"user": "admin",
"name": "Healthbot-link-health-alert202007141103",
"startTime": "20200714T110400Z",
"endTime": "20200718T150300Z",
"elements": [
{
"topoObjectType": "link",
"index": 9
}
]
}
###############################
172.22.0.3 - - [14/Jul/2020 04:03:29] "POST / HTTP/1.1" 200 -
################# Start #######################
{u'group': u'group_all', u'severity': u'major', u'keys': {u'_instance_id': u'["delay"]', u'source-address': u'7.105.106.2', u'_playbook_name': u'delay'}, u'device-id': u'vMX-6', u'rule': u'probe_delay', u'topic': u'probe-delay', u'trigger': u'probe_exceed', u'message': u'7.105.106.2 ge-0/0/3.0 delay is 274028 exceeds delay threshold 200000 us. '}
message 7.105.106.2 ge-0/0/3.0 delay is 274028 exceeds delay threshold 200000 us.
playbook_name delay
###############################
received delay alert
HIGH DELAY DETECTED for vMX-6 Monitor Output
received delay alert HIGH DELAY DETECTED for vMX-6 7.105.106.2
PERFORMING EXHUASTIVE LINK FAILURE SIMULATION for vMX-6 7.105.106.2
{
"topoObjectType": "maintenance",
"topologyIndex": 1,
"user": "admin",
"name": "created_for_simulation",
"startTime": "20200716T230400Z",
"endTime": "20200718T150400Z",
"elements": [
{
"topoObjectType": "link",
"index": 9
}
]
}
SIMULATION RESULT true
HIGH DELAY DETECTED PUT LINK UNDER MAINTENANCE::
{
"topoObjectType": "maintenance",
"topologyIndex": 1,
"user": "admin",
"name": "Healthbot-link-health-alert202007141104",
"startTime": "20200714T110500Z",
"endTime": "20200718T150400Z",
"elements": [
{
"topoObjectType": "link",
"index": 9
}
]
}
{
"topoObjectType": "maintenance",
"topologyIndex": 1,
"user": "admin",
"name": "Healthbot-link-health-alert202007141104",
"startTime": "20200714T110500Z",
"endTime": "20200718T150400Z",
"elements": [
{
"topoObjectType": "link",
"index": 9
}
]
}
###############################
172.22.0.3 - - [14/Jul/2020 04:04:29] "POST / HTTP/1.1" 200 -
| WebHook ( send to webserver ) on HB | 1- WebHook config under: Settings Menu >> Notification Settings 2- Add the WebHook Notification to the Device group under: Dashboard >> Select a Device group Image Removed Image Removed Image Removed
SIMULATION RESULT true HIGH DELAY DETECTED PUT LINK UNDER MAINTENANCE::
| On NorthStar | |
1- Link goes into maintenance mode 2- Tunnel is re-routed to avoid the link | Image Added |
Go to the maintenance tab | Image Added
Image Added |
|
|