Skip to end of metadata
Go to start of metadata

You are viewing an old version of this page. View the current version.

Compare with Current View Page History

Version 1 Next »




blueprint:

NorthStar (v5) and HealthBot (v2.0.2)

video


NorthStar (NS) login

https://<ip@><port>

admin // Juniper!1

HealthBot

SSH or https://

jcluser // Juniper!1





healthbot_listener.py


It's a webhook listener: it listens for HealthBot (HB) messages.

Usage:

jcluser@ubuntu:~$ ls
healthbot-2.0.2-1.deb self_healing


jcluser@ubuntu:~$ cd self_healing/
jcluser@ubuntu:~/self_healing$ ls
healthbot_listener.py maintenance.j2 README.md RPMprobe.yml user_functions.py user_functions.pyc


jcluser@ubuntu:~/self_healing$ python healthbot_listener.py
/usr/local/lib/python2.7/dist-packages/urllib3/connectionpool.py:1004: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
InsecureRequestWarning,
* Serving Flask app "healthbot_listener" (lazy loading)
* Environment: production
WARNING: This is a development server. Do not use it in a production deployment.
Use a production WSGI server instead.
* Debug mode: off
* Running on http://0.0.0.0:10000/ (Press CTRL+C to quit)

HealthBot Listener script
# We need to import request to access the details of the POST request
from flask import Flask, request
from flask_restful import abort
import commands
import json
import pprint
import requests
import os
import user_functions
requests.packages.urllib3.disable_warnings()

# Initialize the Flask application. It is created at module level so that the
# route decorator and the __main__ runner below use the same instance.
app = Flask(__name__)

def _simulate_failure(rest_index_number, element_type):
    """Run the failure simulation for one topology element.

    A temporary 'for_simulation' maintenance event is created for the element,
    the simulation is run, and the temporary event is deleted again.

    :param rest_index_number: index of the node or link to simulate.
    :param element_type: 'node' or 'link'.
    :return: whatever user_functions.check_if_simulation_pass() returns
             (compared against the string 'true' by the caller).
    """
    rest_payload = user_functions.generate_maitenance_json(rest_index_number, 'for_simulation', element_type)
    maintenance_event = user_functions.create_maintenance(rest_payload)
    maintenance_index = maintenance_event.json()['maintenanceIndex']
    try:
        return user_functions.check_if_simulation_pass()
    finally:
        # Remove the temporary event even when the simulation call raised,
        # so a failed run does not leave it behind.
        user_functions.delete_maintenance(maintenance_index)


def _put_under_maintenance(rest_index_number, element_type):
    """Create the real ('for_maint') maintenance event for a node or link."""
    rest_payload = user_functions.generate_maitenance_json(rest_index_number, 'for_maint', element_type)
    print(rest_payload)
    user_functions.create_maintenance(rest_payload)


@app.route('/', methods=['POST'])
def app_message_post():
    """Handle one webhook notification POSTed as JSON.

    The body must carry 'message', 'device-id' and 'keys._playbook_name'.
    Two playbooks are acted on:

    * 'cpu_openconfig' - on "exceeds high threshold" the node is simulated
      and, if the simulation returns 'true', put under maintenance.
    * 'delay' - on "exceeds delay threshold" the link owning
      keys['source-address'] is handled the same way.

    :return: the JSON string '{"result": "OK"}' on success.
    :raises: HTTP 400 (via abort) when the Content-Type is not JSON or when
             any exception occurs while processing the request.
    """
    print("#################  Start  #######################")
    # mimetype ignores parameters such as "; charset=utf-8" and is empty
    # (not a KeyError) when the header is missing.
    if request.mimetype != 'application/json':
        abort(400, message="Expected Content-Type = application/json")
    try:
        data = request.json
        print(data)
        message = data['message']
        print("message " + message)
        playbook_name = data['keys']['_playbook_name']
        print("playbook_name " + playbook_name)
        device_id = data['device-id']

        if playbook_name == "cpu_openconfig":
            print("received cpu high alert")
            if "exceeds high threshold" in message:
                print('CPU HIGH UTIL DETECTED for ' + device_id)
                print('PERFORMING EXHUASTIVE LINK FAILURE SIMULATION for ' + device_id)
                rest_index_number = user_functions.get_node_info(device_id)
                check_simulation = _simulate_failure(rest_index_number, 'node')
                print("simulation result " + check_simulation)
                print("delete temp maintenace")
                if check_simulation == 'true':
                    print('CPU HIGH UTIL DETECTED PUT NODE UNDER MAINTENANCE::')
                    _put_under_maintenance(rest_index_number, 'node')
                else:
                    print('CANNOT PUT ' + device_id + ' UNDER MAINTENANCE. EXHUASTIVE FAILURE SIMULATION NOT PASSED')
            elif "is normal" in message:
                print('CPU util back to normal. ')
        print('###############################')

        if playbook_name == "delay":
            print("received delay alert")
            source_address = data['keys']['source-address']
            if "exceeds delay threshold" in message:
                print("HIGH DELAY DETECTED for  " + device_id + " " + source_address)
                print("PERFORMING EXHUASTIVE LINK FAILURE SIMULATION for " + device_id + " " + source_address)
                rest_index_number = user_functions.get_link_info_from_ip(source_address)
                check_simulation = _simulate_failure(rest_index_number, 'link')
                print("SIMULATION RESULT " + check_simulation)
                if check_simulation == "true":
                    print("HIGH DELAY DETECTED PUT LINK UNDER MAINTENANCE::")
                    _put_under_maintenance(rest_index_number, 'link')
                else:
                    print("CANNOT PUT " + device_id + " " + source_address + " UNDER MAINTENANCE. EXHUASTIVE FAILURE SIMULATION NOT PASSED")
            elif "is normal" in message:
                print("DELAY back to normal. ")
        print("###############################")

        return json.dumps({'result': 'OK'})
    except Exception as e:
        # abort() raises, so nothing after it in this handler would run.
        abort(400, message="Exception processing request: {0}".format(e))

if __name__ == '__main__':
    # Start Flask's built-in server on all interfaces, TCP port 10000.
    app.run(host="0.0.0.0", port=10000)

user_functions.py

user_functions.py
import json
from pprint import pprint
import os
from jinja2 import Environment, FileSystemLoader
import datetime
import time
import requests

# NorthStar connection settings. Host and credentials can be overridden through
# the environment so they need not live in the source; the defaults are the
# values this script was written with.
northstar_host = os.environ.get('NORTHSTAR_HOST', '100.123.16.0')
user = os.environ.get('NORTHSTAR_USER', 'admin')
password = os.environ.get('NORTHSTAR_PASSWORD', 'Juniper!1')

# Base URL of the REST API for tenant 1 / topology 1; every endpoint below is
# derived from it.
url = 'http://' + northstar_host + ':8091/Northstar/API/v2/tenant/1/topology/1/'
node_url_test = url + 'nodes'  # same value as node_url; kept for compatibility

node_url = url + 'nodes'
link_url = url + 'links'
lsp_url = url + 'te-lsps'
maintenance_url = url + 'maintenances'
run_simulation_url = url + 'rpc/simulation'

# OAuth2 token endpoint and the headers used when requesting a token.
token_url = 'https://' + northstar_host + ':8443/oauth2/token'
hearders_token = {'Content-Type': 'application/json'}

def get_token():
    """Request an OAuth2 access token with the password grant.

    The module-level ``user`` / ``password`` are sent both as HTTP basic auth
    and inside the JSON grant body.

    :return: the 'access_token' field of the JSON response.
    """
    grant = {'grant_type': 'password', 'username': user, 'password': password}
    # verify=False: the server certificate is not validated for this request.
    r = requests.post(token_url, auth=(user, password), data=json.dumps(grant),
                      headers=hearders_token, verify=False)
    return r.json()['access_token']

# Fetch a bearer token once, at import time, and build the headers sent by the
# request helpers in this module.
# NOTE(review): nothing in this file refreshes the token afterwards - confirm
# its lifetime covers the lifetime of the process.
token = get_token()
headers = {'Authorization': str('Bearer ' + token), 'Content-Type': 'application/json'}

def get_node_info(hostname):
    """Return the 'nodeIndex' of the node whose 'hostName' equals hostname.

    :param hostname: host name to look up in the node list from get_node().
    :return: the matching node's 'nodeIndex' (the last match if several match).
    :raises ValueError: if no node has that host name.
    """
    index_number = None
    for node in get_node().json():
        # .get(): entries without a 'hostName' field are skipped, not fatal.
        if node.get('hostName') == hostname:
            index_number = node['nodeIndex']
    if index_number is None:
        raise ValueError("no node with hostName {0!r}".format(hostname))
    return index_number

def get_link_info(linkname):
    """Return the 'linkIndex' of the link whose 'name' equals linkname.

    :param linkname: link name to look up in the link list from get_link().
    :return: the matching link's 'linkIndex' (the last match if several match).
    :raises ValueError: if no link has that name.
    """
    index_number = None
    for link in get_link().json():
        # .get(): entries without a 'name' field are skipped, not fatal.
        if link.get('name') == linkname:
            index_number = link['linkIndex']
    if index_number is None:
        raise ValueError("no link named {0!r}".format(linkname))
    return index_number

def get_link_info_from_ip(interface_ip):
    """Return the 'linkIndex' of the link that has interface_ip on either end.

    Both the 'endA' and 'endZ' IPv4 addresses of every link from get_link()
    are compared against interface_ip.

    :param interface_ip: IPv4 address string of one link endpoint.
    :return: the matching link's 'linkIndex' (the last match if several match).
    :raises ValueError: if no link end carries that address.
    """
    index_number = None
    for link in get_link().json():
        for end_name in ('endA', 'endZ'):
            # Chained .get(): an end without an IPv4 address is skipped
            # instead of raising KeyError.
            address = link.get(end_name, {}).get('ipv4Address', {}).get('address')
            if address == interface_ip:
                index_number = link['linkIndex']
    if index_number is None:
        raise ValueError("no link with interface address {0!r}".format(interface_ip))
    return index_number

def get_link_from_nodeID_and_interface(nodeID,interface_name):
    """Return the link record attached to a given node interface.

    A link matches when either of its ends ('endA' or 'endZ') has both the
    given node id and the given interface name.

    :param nodeID: value compared against end['node']['id'].
    :param interface_name: value compared against end['interfaceName'].
    :return: the matching link dict (the last match if several match).
    :raises ValueError: if no link matches.
    """
    link = None
    for candidate in get_link().json():
        for end_name in ('endA', 'endZ'):
            end = candidate.get(end_name, {})
            same_node = end.get('node', {}).get('id') == nodeID
            if same_node and end.get('interfaceName') == interface_name:
                link = candidate
    if link is None:
        raise ValueError("no link on node {0!r} interface {1!r}".format(nodeID, interface_name))
    return link


def get_nodeID_from_hostname(hostname):
    """Return the 'id' of the node whose 'hostName' equals hostname.

    :param hostname: host name to look up in the node list from get_node().
    :return: the matching node's 'id' (the last match if several match).
    :raises ValueError: if no node has that host name.
    """
    nodeID = None
    for node in get_node().json():
        # .get(): entries without a 'hostName' field are skipped, not fatal.
        if node.get('hostName') == hostname:
            nodeID = node['id']
    if nodeID is None:
        raise ValueError("no node with hostName {0!r}".format(hostname))
    return nodeID

'''
def move_traffic():
    contents = open('new_path.json', 'rb').read()
    print(contents)
    r = requests.post(lsp_url, data=contents, headers=headers, verify=False)
    # print(r)

def move_traffic2():
    contents = open('new_path.json', 'rb').read()
    print(contents)
    r = requests.put(lsp_url, data=contents, headers=headers, verify=False)
    # print(r)

def move_traffic_back():
    contents = open('original_path.json', 'rb').read()
    print(contents)
    r = requests.post(lsp_url, data=contents, headers=headers, verify=False)
'''

def get_node():
    """GET node_url with the shared auth headers and return the response object."""
    return requests.get(node_url, headers=headers, verify=False)

def get_link():
    """GET link_url with the shared auth headers and return the response object."""
    return requests.get(link_url, headers=headers, verify=False)

def create_maintenance(payload):
    """Print payload, POST it to maintenance_url and return the response object."""
    print(payload)
    return requests.post(maintenance_url, data=payload, headers=headers, verify=False)

def delete_maintenance(maint_index):
    """Send DELETE to maintenance_url/<maint_index> and return the response object."""
    target = maintenance_url + '/' + str(maint_index)
    return requests.delete(target, headers=headers, verify=False)

def generate_maitenance_json(index_number, use, maintenance_type):
    """Render the 'maintenance.j2' template into a maintenance payload.

    :param index_number: index of the node or link, passed to the template.
    :param use: 'for_simulation' produces an event named
                'created_for_simulation' starting 3600 minutes from now; any
                other value produces a 'Healthbot-<type>-health-alert<time>'
                event starting 1 minute from now. Both end 6000 minutes from now.
    :param maintenance_type: passed to the template and used in the event name.
    :return: the rendered template text.
    """
    current_time = datetime.datetime.utcnow().strftime("%Y%m%d%H%M")
    if use == 'for_simulation':
        name = 'created_for_simulation'
        start = 3600
    else:
        name = 'Healthbot-' + maintenance_type + '-health-alert' + current_time
        start = 1
    end = 6000

    # The original looked up abspath('__file__') - a quoted literal - which
    # resolves to the current working directory. Keep that location first and
    # additionally search the directory that holds this module, so the
    # template is also found when the script is started from elsewhere.
    search_path = [os.getcwd()]
    module_file = globals().get('__file__')
    if module_file:
        search_path.append(os.path.dirname(os.path.abspath(module_file)))
    j2_env = Environment(loader=FileSystemLoader(search_path),
                         trim_blocks=True)

    return j2_env.get_template('maintenance.j2').render(
        maintenance_type=maintenance_type,
        index_number=index_number,
        current_time=current_time,
        name=name,
        start_time=getTimeSeqUTC(start),
        end_time=getTimeSeqUTC(end)
    )

'''
def generate_link_maitenance_json():
    index_number = get_link_info("L10.135.5.1_10.135.5.2")
    THIS_DIR = os.path.dirname(os.path.abspath(__file__))

    j2_env = Environment(loader=FileSystemLoader(THIS_DIR),
                         trim_blocks=True)

    payload = j2_env.get_template('maintenance.j2').render(
        maintenance_type='link',
        index_number=index_number,
        current_time=datetime.datetime.utcnow().strftime("%Y%m%d%H%M"),
        start_time=getTimeSeqUTC(1),
        end_time=getTimeSeqUTC(6000)
    )
    return payload
'''

def generate_link_traffic_threshold_payload(linkIndex,linkID,endA_ID,endZ_ID,endA_Threshold,endZ_Threshold):
    """Render the 'link_traffic_threshold.j2' template with the given values.

    All six arguments are passed to the template unchanged under their own
    names.

    :return: the rendered template text.
    """
    # The original looked up abspath('__file__') - a quoted literal - which
    # resolves to the current working directory. Keep that location first and
    # additionally search the directory that holds this module.
    search_path = [os.getcwd()]
    module_file = globals().get('__file__')
    if module_file:
        search_path.append(os.path.dirname(os.path.abspath(module_file)))
    j2_env = Environment(loader=FileSystemLoader(search_path),
                         trim_blocks=True)
    return j2_env.get_template('link_traffic_threshold.j2').render(
        linkID=linkID,
        linkIndex=linkIndex,
        endA_ID=endA_ID,
        endZ_ID=endZ_ID,
        endA_Threshold=endA_Threshold,
        endZ_Threshold=endZ_Threshold
    )

def update_link_traffic_threshold(payload, linkIndex):
    """PUT payload to link_url/<linkIndex> and return the response object.

    The payload and the target URL are printed before the request is sent.
    """
    print(payload)
    linkThresholdURL = link_url + '/' + str(linkIndex)
    print(linkThresholdURL)
    return requests.put(linkThresholdURL, data=payload, headers=headers, verify=False)

def getTimeSeqUTC(num):
    """Return the UTC time ``num`` minutes from now as 'YYYYMMDDTHHMM00Z'.

    The arithmetic is done on the UTC datetime itself. The previous version
    pushed the UTC value through time.mktime()/time.localtime(), which treat
    it as local time and can shift the result by an hour around a DST change.

    :param num: offset in minutes; anything accepted by int().
    :return: timestamp string with the seconds field fixed at '00'.
    """
    moment = datetime.datetime.utcnow() + datetime.timedelta(minutes=int(num))
    return moment.strftime("%Y%m%dT%H%M") + "00Z"


def set_overload_bit(router_to_configure):
    """Load and commit 'set protocols isis overload' on one router.

    Connects with the module-level ``user`` / ``password``, prints the
    configuration diff, commits, and closes the connection again.

    :param router_to_configure: host passed to Device().
    """
    print("making confugration changes on " + router_to_configure)
    print("#####################################")
    # NOTE(review): Device and Config are not imported anywhere in the visible
    # part of this file (they look like Junos PyEZ's jnpr.junos.Device and
    # jnpr.junos.utils.config.Config) - confirm the import exists.
    dev = Device(host=router_to_configure, user=user, password=password).open()
    try:
        with Config(dev) as cu:
            cu.load('set protocols isis overload', format='set')
            cu.pdiff()
            cu.commit()
    finally:
        # The session was previously left open; release it on every path.
        dev.close()


def get_management_ip(host_name):
    """Look up a device's management IP in './network_device.json'.

    The file is read relative to the current working directory and must hold
    {'NetworkDeviceList': [{'NetworkDevice': {'Name': ..., 'ManagementIp': ...}}, ...]}.

    :param host_name: value compared against each device's 'Name'.
    :return: the matching device's 'ManagementIp' (the last match if several match).
    :raises ValueError: if no device has that name.
    """
    network_info = './network_device.json'
    # 'with' closes the file handle, which the original left open.
    with open(network_info) as handle:
        data = json.load(handle)
    management_ip = None
    for entry in data['NetworkDeviceList']:
        device = entry['NetworkDevice']
        if device['Name'] == host_name:
            management_ip = device['ManagementIp']
    if management_ip is None:
        raise ValueError("no device named {0!r} in {1}".format(host_name, network_info))
    return management_ip


def run_simulation(simulation_name):
    """POST a simulation request for a named maintenance event.

    :param simulation_name: name of the maintenance event to simulate.
    :return: the response object of the POST to run_simulation_url.
    """
    simulation_type = "link"
    # json.dumps() quotes and escapes the two values; for plain ASCII names
    # the resulting body is byte-identical to simple string concatenation.
    simulation_payload = (
        '{"topoObjectType":"maintenance","topologyIndex":1,"elements":'
        '[{"type":"maintenance","maintenanceName":' + json.dumps(simulation_name) + '},'
        + json.dumps(simulation_type) + ']}'
    )
    r = requests.post(run_simulation_url, data=simulation_payload, headers=headers, verify=False)
    return r


def get_simulation_report(simulationID):
    """GET the 'L2_PeakSimRoute.r0' report of a simulation and return the response object."""
    report_location = url + 'rpc/simulation/' + simulationID + '/Report/L2_PeakSimRoute.r0'
    return requests.get(report_location, headers=headers, verify=False)


def check_if_simulation_pass():
    """Simulate the 'created_for_simulation' maintenance event and report the outcome.

    A simulation request is POSTed, the 'L2_PeakSimRoute.r0' report of the
    returned simulation id is fetched, and the report is searched for the
    text "NotRouted".

    :return: the string 'false' if the report contains "NotRouted", else 'true'.
    """
    simulation_name = 'created_for_simulation'
    # NOTE(review): the type is always "link", whatever element the temporary
    # maintenance event covers - confirm this is intended.
    simulation_type = "link"
    simulation_payload = (
        '{"topoObjectType":"maintenance","topologyIndex":1,"elements":'
        '[{"type":"maintenance","maintenanceName":"' + simulation_name + '"},"'
        + simulation_type + '"]}'
    )
    r = requests.post(run_simulation_url, data=simulation_payload, headers=headers, verify=False)
    simulationID = r.json()['simulationId']
    # NOTE(review): the report is requested immediately after the POST;
    # presumably the simulation has finished by then - confirm.
    simulation_report_url = url + 'rpc/simulation/' + simulationID + '/Report/L2_PeakSimRoute.r0'
    report = requests.get(simulation_report_url, headers=headers, verify=False)
    if "NotRouted" in report.content:
        return 'false'
    return 'true'

def print_simulation_failure_content(report):
    """Print selected lines of a simulation report.

    Lines containing '#' or '*' are printed unchanged. Any other line
    containing a capital 'S' is split on commas and only its first seven
    fields are printed; a line with fewer fields is printed as it is.

    :param report: object whose ``content`` attribute holds the report text.
    """
    for line in report.content.split('\n'):
        if '#' in line or '*' in line:
            print(line)
        elif 'S' in line:
            # Slicing tolerates short lines; indexing fields 0..6 one by one
            # raised IndexError on them.
            print(','.join(line.split(',')[:7]))
















add delay 

Log in to the CentOS host ( jcluser / Juniper!1 )

[root@CentOS ~]# more add_delay.sh
tc qdisc add dev eth1 root netem delay 300ms
echo "300 ms delay added between vMX-5 and vMX-6"
[root@CentOS ~]#



healthbot_listener.py output

HealthBot: monitors the RPM probes

HB >>> threshold crossed >>> HB changes the status and sends a "put the link in maintenance" request to NorthStar

output
jcluser@ubuntu:~/self_healing$ python healthbot_listener.py
/usr/local/lib/python2.7/dist-packages/urllib3/connectionpool.py:1004: InsecureRequestWarning: Unverified HTTPS request is being ma        de. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warn        ings
  InsecureRequestWarning,
 * Serving Flask app "healthbot_listener" (lazy loading)
 * Environment: production
   WARNING: This is a development server. Do not use it in a production deployment.
   Use a production WSGI server instead.
 * Debug mode: off
 * Running on http://0.0.0.0:10000/ (Press CTRL+C to quit)



 #################  Start  #######################
{u'group': u'group_all', u'severity': u'major', u'keys': {u'_instance_id': u'["delay"]', u'source-address': u'7.105.106.1', u'_play        book_name': u'delay'}, u'device-id': u'vMX-5', u'rule': u'probe_delay', u'topic': u'probe-delay', u'trigger': u'probe_exceed', u'me        ssage': u'7.105.106.1 ge-0/0/3.0 delay is 303099  exceeds delay threshold 200000 us.  '}
message 7.105.106.1 ge-0/0/3.0 delay is 303099  exceeds delay threshold 200000 us.
playbook_name delay
###############################

received delay alert
HIGH DELAY DETECTED for  vMX-5 7.105.106.1
PERFORMING EXHUASTIVE LINK FAILURE SIMULATION for vMX-5 7.105.106.1
    {
        "topoObjectType": "maintenance",
        "topologyIndex": 1,
        "user": "admin",
        "name": "created_for_simulation",
        "startTime": "20200716T230300Z",
        "endTime": "20200718T150300Z",
        "elements": [
            {
                "topoObjectType": "link",
                "index": 9
            }
        ]
    }

SIMULATION RESULT true
HIGH DELAY DETECTED PUT LINK UNDER MAINTENANCE::
    {
        "topoObjectType": "maintenance",
        "topologyIndex": 1,
        "user": "admin",
        "name": "Healthbot-link-health-alert202007141103",
        "startTime": "20200714T110400Z",
        "endTime": "20200718T150300Z",
        "elements": [
            {
                "topoObjectType": "link",
                "index": 9
            }
        ]
    }

    {
        "topoObjectType": "maintenance",
        "topologyIndex": 1,
        "user": "admin",
        "name": "Healthbot-link-health-alert202007141103",
        "startTime": "20200714T110400Z",
        "endTime": "20200718T150300Z",
        "elements": [
            {
                "topoObjectType": "link",
                "index": 9
            }
        ]
    }

###############################
172.22.0.3 - - [14/Jul/2020 04:03:29] "POST / HTTP/1.1" 200 -
#################  Start  #######################
{u'group': u'group_all', u'severity': u'major', u'keys': {u'_instance_id': u'["delay"]', u'source-address'                                 : u'7.105.106.2', u'_playbook_name': u'delay'}, u'device-id': u'vMX-6', u'rule': u'probe_delay', u'topic':                                  u'probe-delay', u'trigger': u'probe_exceed', u'message': u'7.105.106.2 ge-0/0/3.0 delay is 274028  exceed                                 s delay threshold 200000 us.  '}
message 7.105.106.2 ge-0/0/3.0 delay is 274028  exceeds delay threshold 200000 us.
playbook_name delay
###############################
received delay alert
HIGH DELAY DETECTED for  vMX-6 7.105.106.2
PERFORMING EXHUASTIVE LINK FAILURE SIMULATION for vMX-6 7.105.106.2
    {
        "topoObjectType": "maintenance",
        "topologyIndex": 1,
        "user": "admin",
        "name": "created_for_simulation",
        "startTime": "20200716T230400Z",
        "endTime": "20200718T150400Z",
        "elements": [
            {
                "topoObjectType": "link",
                "index": 9
            }
        ]
    }

SIMULATION RESULT true
HIGH DELAY DETECTED PUT LINK UNDER MAINTENANCE::
    {
        "topoObjectType": "maintenance",
        "topologyIndex": 1,
        "user": "admin",
        "name": "Healthbot-link-health-alert202007141104",
        "startTime": "20200714T110500Z",
        "endTime": "20200718T150400Z",
        "elements": [
            {
                "topoObjectType": "link",
                "index": 9
            }
        ]
    }

    {
        "topoObjectType": "maintenance",
        "topologyIndex": 1,
        "user": "admin",
        "name": "Healthbot-link-health-alert202007141104",
        "startTime": "20200714T110500Z",
        "endTime": "20200718T150400Z",
        "elements": [
            {
                "topoObjectType": "link",
                "index": 9
            }
        ]
    }

###############################
172.22.0.3 - - [14/Jul/2020 04:04:29] "POST / HTTP/1.1" 200 -


WebHook ( send to webserver ) on HB

1- WebHook config under: Settings Menu >> Notification Settings

2- Add the WebHook Notification to the Device group under:  Dashboard >> Select a Device group 



 



2- Add the WebHook Notification to the Device group under:  Dashboard >> Select a Device group








  • No labels