Bob Lantz
Committed by Gerrit Code Review

Add status/error checks during ONOS startup

We check for:
- process exits
- log ERRORs
- log WARNings
- low memory

Also added a "status" CLI command to check for exits
and errors.

Change-Id: I6e65fe49619e338f1827fc616ca52a82de7ba470
......@@ -44,10 +44,9 @@ from mininet.node import Controller, OVSSwitch, UserSwitch
from mininet.nodelib import LinuxBridge
from mininet.net import Mininet
from mininet.topo import SingleSwitchTopo, Topo
from mininet.log import setLogLevel, info
from mininet.log import setLogLevel, info, warn, error, debug
from mininet.cli import CLI
from mininet.util import quietRun, waitListening
from mininet.clean import killprocs
from mininet.util import quietRun, specialClass
from mininet.examples.controlnet import MininetFacade
from os import environ
......@@ -55,7 +54,7 @@ from os.path import dirname, join, isfile
from sys import argv
from glob import glob
import time
from functools import partial
### ONOS Environment
......@@ -150,6 +149,36 @@ def unpackONOS( destDir='/tmp', run=quietRun ):
return onosDir
def waitListening( client=None, server='127.0.0.1', port=80, timeout=None,
callback=None, sleepSecs=.5 ):
"Modified mininet.util.waitListening with callback, sleepSecs"
runCmd = ( client.cmd if client else
partial( quietRun, shell=True ) )
if not runCmd( 'which telnet' ):
raise Exception('Could not find telnet' )
# pylint: disable=maybe-no-member
serverIP = server if isinstance( server, basestring ) else server.IP()
cmd = ( 'echo A | telnet -e A %s %s' % ( serverIP, port ) )
elapsed = 0
result = runCmd( cmd )
while 'Connected' not in result:
if 'No route' in result:
rtable = runCmd( 'route' )
error( 'no route to %s:\n%s' % ( server, rtable ) )
return False
if timeout and elapsed >= timeout:
error( 'could not connect to %s on port %d\n' % ( server, port ) )
return False
debug( 'waiting for', server, 'to listen on port', port, '\n' )
info( '.' )
if callback:
callback()
time.sleep( sleepSecs )
elapsed += sleepSecs
result = runCmd( cmd )
return True
### Mininet classes
def RenamedTopo( topo, *args, **kwargs ):
......@@ -191,7 +220,9 @@ class ONOSNode( Controller ):
"ONOS cluster node"
def __init__( self, name, **kwargs ):
"alertAction: exception|ignore|warn|exit (exception)"
kwargs.update( inNamespace=True )
self.alertAction = kwargs.pop( 'alertAction', 'exception' )
Controller.__init__( self, name, **kwargs )
self.dir = '/tmp/%s' % self.name
self.client = self.dir + '/karaf/bin/client'
......@@ -220,6 +251,7 @@ class ONOSNode( Controller ):
self.ucmd( service, 'server 1>../onos.log 2>../onos.log'
' & echo $! > onos.pid; ln -s `pwd`/onos.pid ..' )
self.onosPid = int( self.cmd( 'cat onos.pid' ).strip() )
self.warningCount = 0
# pylint: enable=arguments-differ
......@@ -228,15 +260,66 @@ class ONOSNode( Controller ):
self.cmd( 'pkill -HUP -f karaf.jar && wait' )
self.cmd( 'rm -rf', self.dir )
def sanityAlert( self, *args ):
"Alert to raise on sanityCheck failure"
info( '\n' )
if self.alertAction == 'exception':
raise Exception( *args )
if self.alertAction == 'warn':
warn( *args + ( '\n', ) )
elif self.alertAction == 'exit':
error( '***', *args +
( '\nExiting. Run "sudo mn -c" to clean up.\n', ) )
exit( 1 )
def isRunning( self ):
"Is our ONOS process still running?"
cmd = 'ps -p %d >/dev/null 2>&1 && echo "running" || echo "not running"'
return self.cmd( cmd % self.onosPid ) == 'running'
def sanityCheck( self ):
"Check whether we've quit or are running out of memory"
cmd = ( 'ps -p %d >/dev/null 2>&1 && echo "running" ||'
'echo "not running"' )
return self.cmd( cmd % self.onosPid ).strip() == 'running'
def checkLog( self ):
"Return log file errors and warnings"
log = join( self.dir, 'log' )
errors, warnings = None, None
if isfile( log ):
lines = open( log ).read().split( '\n' )
errors = [ line for line in lines if 'ERROR' in line ]
warnings = [ line for line in lines if 'WARN'in line ]
return errors, warnings
def memAvailable( self ):
"Return available memory in KB (or -1 if we can't tell)"
lines = open( '/proc/meminfo' ).read().strip().split( '\n' )
entries = map( str.split, lines )
index = { entry[ 0 ]: entry for entry in entries }
# Check MemAvailable if present
default = ( None, '-1', 'kB' )
_name, count, unit = index.get( 'MemAvailable:', default )
if unit.lower() == 'kb':
return int( count )
return -1
def sanityCheck( self, lowMem=100000 ):
"""Check whether we've quit or are running out of memory
lowMem: low memory threshold in KB (100000)"""
# Are we still running?
if not self.isRunning():
raise Exception( 'ONOS node %s has died' % self.name )
self.sanityAlert( 'ONOS node %s has died' % self.name )
# Are there errors in the log file?
errors, warnings = self.checkLog()
if errors:
self.sanityAlert( 'ONOS startup errors:\n<<%s>>' %
'\n'.join( errors ) )
warningCount = len( warnings )
if warnings and warningCount > self.warningCount:
warn( '(%d warnings)' % len( warnings ) )
self.warningCount = warningCount
# Are we running out of memory?
mem = self.memAvailable()
if mem > 0 and mem < lowMem:
self.sanityAlert( 'Running out of memory (only %d KB available)'
% mem )
def waitStarted( self ):
"Wait until we've really started"
......@@ -246,11 +329,13 @@ class ONOSNode( Controller ):
if 'running' in status and 'not running' not in status:
break
info( '.' )
self.sanityCheck()
time.sleep( 1 )
info( ' ssh-port' )
waitListening( server=self, port=KarafPort )
waitListening( server=self, port=KarafPort, callback=self.sanityCheck )
info( ' openflow-port' )
waitListening( server=self, port=OpenFlowPort )
waitListening( server=self, port=OpenFlowPort,
callback=self.sanityCheck )
info( ' client' )
while True:
result = quietRun( 'echo apps -a | %s -h %s' %
......@@ -258,6 +343,7 @@ class ONOSNode( Controller ):
if 'openflow' in result:
break
info( '.' )
self.sanityCheck()
time.sleep( 1 )
info( ')\n' )
......@@ -284,11 +370,13 @@ class ONOSCluster( Controller ):
ipBase: IP range for ONOS nodes
forward: default port forwarding list,
topo: topology class or instance
nodeOpts: ONOSNode options
**kwargs: additional topology parameters"""
args = list( args )
name = args.pop( 0 )
topo = kwargs.pop( 'topo', None )
nat = kwargs.pop( 'nat', 'nat0' )
nodeOpts = kwargs.pop( 'nodeOpts', {} )
# Default: single switch with 1 ONOS node
if not topo:
topo = SingleSwitchTopo
......@@ -303,7 +391,8 @@ class ONOSCluster( Controller ):
fixIPTables()
self.env = initONOSEnv()
self.net = Mininet( topo=topo, ipBase=self.ipBase,
host=ONOSNode, switch=LinuxBridge,
host=partial( ONOSNode, **nodeOpts ),
switch=LinuxBridge,
controller=None )
if nat:
self.net.addNAT( nat ).configDefault()
......@@ -441,15 +530,37 @@ class ONOSCLI( OldCLI ):
"Run tail -f /tmp/onos1/log; press control-C to stop"
self.default( self.onos1().name, 'tail -f /tmp/%s/log' % self.onos1() )
def do_status( self, line ):
"Return status of ONOS cluster(s)"
for c in self.mn.controllers:
if isinstance( c, ONOSCluster ):
for node in c.net.hosts:
if isinstance( node, ONOSNode ):
errors, warnings = node.checkLog()
running = ( 'Running' if node.isRunning()
else 'Exited' )
status = ''
if errors:
status += '%d ERRORS ' % len( errors )
if warnings:
status += '%d warnings' % len( warnings )
status = status if status else 'OK'
info( node, '\t', running, '\t', status, '\n' )
# For interactive use, exit on error
exitOnError = dict( nodeOpts={ 'alertAction': 'exit' } )
ONOSClusterInteractive = specialClass( ONOSCluster, defaults=exitOnError )
### Exports for bin/mn
CLI = ONOSCLI
controllers = { 'onos': ONOSCluster, 'default': ONOSCluster }
controllers = { 'onos': ONOSClusterInteractive,
'default': ONOSClusterInteractive }
# XXX Hack to change default controller as above doesn't work
findController = lambda: ONOSCluster
findController = lambda: controllers[ 'default' ]
switches = { 'onos': ONOSOVSSwitch,
'onosovs': ONOSOVSSwitch,
......