Bob Lantz
Committed by Brian O'Connor

Add status/error checks during ONOS startup

We check for:
- process exits
- log ERRORs
- log WARNings
- low memory

Also added a "status" CLI command to check for exits
and errors.

Change-Id: I6e65fe49619e338f1827fc616ca52a82de7ba470
...@@ -44,10 +44,9 @@ from mininet.node import Controller, OVSSwitch, UserSwitch ...@@ -44,10 +44,9 @@ from mininet.node import Controller, OVSSwitch, UserSwitch
44 from mininet.nodelib import LinuxBridge 44 from mininet.nodelib import LinuxBridge
45 from mininet.net import Mininet 45 from mininet.net import Mininet
46 from mininet.topo import SingleSwitchTopo, Topo 46 from mininet.topo import SingleSwitchTopo, Topo
47 -from mininet.log import setLogLevel, info 47 +from mininet.log import setLogLevel, info, warn, error, debug
48 from mininet.cli import CLI 48 from mininet.cli import CLI
49 -from mininet.util import quietRun, waitListening 49 +from mininet.util import quietRun, specialClass
50 -from mininet.clean import killprocs
51 from mininet.examples.controlnet import MininetFacade 50 from mininet.examples.controlnet import MininetFacade
52 51
53 from os import environ 52 from os import environ
...@@ -55,7 +54,7 @@ from os.path import dirname, join, isfile ...@@ -55,7 +54,7 @@ from os.path import dirname, join, isfile
55 from sys import argv 54 from sys import argv
56 from glob import glob 55 from glob import glob
57 import time 56 import time
58 - 57 +from functools import partial
59 58
60 ### ONOS Environment 59 ### ONOS Environment
61 60
...@@ -150,6 +149,36 @@ def unpackONOS( destDir='/tmp', run=quietRun ): ...@@ -150,6 +149,36 @@ def unpackONOS( destDir='/tmp', run=quietRun ):
150 return onosDir 149 return onosDir
151 150
152 151
152 +def waitListening( client=None, server='127.0.0.1', port=80, timeout=None,
153 + callback=None, sleepSecs=.5 ):
154 + "Modified mininet.util.waitListening with callback, sleepSecs"
155 + runCmd = ( client.cmd if client else
156 + partial( quietRun, shell=True ) )
157 + if not runCmd( 'which telnet' ):
158 + raise Exception('Could not find telnet' )
159 + # pylint: disable=maybe-no-member
160 + serverIP = server if isinstance( server, basestring ) else server.IP()
161 + cmd = ( 'echo A | telnet -e A %s %s' % ( serverIP, port ) )
162 + elapsed = 0
163 + result = runCmd( cmd )
164 + while 'Connected' not in result:
165 + if 'No route' in result:
166 + rtable = runCmd( 'route' )
167 + error( 'no route to %s:\n%s' % ( server, rtable ) )
168 + return False
169 + if timeout and elapsed >= timeout:
170 + error( 'could not connect to %s on port %d\n' % ( server, port ) )
171 + return False
172 + debug( 'waiting for', server, 'to listen on port', port, '\n' )
173 + info( '.' )
174 + if callback:
175 + callback()
176 + time.sleep( sleepSecs )
177 + elapsed += sleepSecs
178 + result = runCmd( cmd )
179 + return True
180 +
181 +
153 ### Mininet classes 182 ### Mininet classes
154 183
155 def RenamedTopo( topo, *args, **kwargs ): 184 def RenamedTopo( topo, *args, **kwargs ):
...@@ -191,7 +220,9 @@ class ONOSNode( Controller ): ...@@ -191,7 +220,9 @@ class ONOSNode( Controller ):
191 "ONOS cluster node" 220 "ONOS cluster node"
192 221
193 def __init__( self, name, **kwargs ): 222 def __init__( self, name, **kwargs ):
223 + "alertAction: exception|ignore|warn|exit (exception)"
194 kwargs.update( inNamespace=True ) 224 kwargs.update( inNamespace=True )
225 + self.alertAction = kwargs.pop( 'alertAction', 'exception' )
195 Controller.__init__( self, name, **kwargs ) 226 Controller.__init__( self, name, **kwargs )
196 self.dir = '/tmp/%s' % self.name 227 self.dir = '/tmp/%s' % self.name
197 self.client = self.dir + '/karaf/bin/client' 228 self.client = self.dir + '/karaf/bin/client'
...@@ -220,6 +251,7 @@ class ONOSNode( Controller ): ...@@ -220,6 +251,7 @@ class ONOSNode( Controller ):
220 self.ucmd( service, 'server 1>../onos.log 2>../onos.log' 251 self.ucmd( service, 'server 1>../onos.log 2>../onos.log'
221 ' & echo $! > onos.pid; ln -s `pwd`/onos.pid ..' ) 252 ' & echo $! > onos.pid; ln -s `pwd`/onos.pid ..' )
222 self.onosPid = int( self.cmd( 'cat onos.pid' ).strip() ) 253 self.onosPid = int( self.cmd( 'cat onos.pid' ).strip() )
254 + self.warningCount = 0
223 255
224 # pylint: enable=arguments-differ 256 # pylint: enable=arguments-differ
225 257
...@@ -228,15 +260,66 @@ class ONOSNode( Controller ): ...@@ -228,15 +260,66 @@ class ONOSNode( Controller ):
228 self.cmd( 'pkill -HUP -f karaf.jar && wait' ) 260 self.cmd( 'pkill -HUP -f karaf.jar && wait' )
229 self.cmd( 'rm -rf', self.dir ) 261 self.cmd( 'rm -rf', self.dir )
230 262
263 + def sanityAlert( self, *args ):
264 + "Alert to raise on sanityCheck failure"
265 + info( '\n' )
266 + if self.alertAction == 'exception':
267 + raise Exception( *args )
268 + if self.alertAction == 'warn':
269 + warn( *args + ( '\n', ) )
270 + elif self.alertAction == 'exit':
271 + error( '***', *args +
272 + ( '\nExiting. Run "sudo mn -c" to clean up.\n', ) )
273 + exit( 1 )
274 +
231 def isRunning( self ): 275 def isRunning( self ):
232 "Is our ONOS process still running?" 276 "Is our ONOS process still running?"
233 - cmd = 'ps -p %d >/dev/null 2>&1 && echo "running" || echo "not running"' 277 + cmd = ( 'ps -p %d >/dev/null 2>&1 && echo "running" ||'
234 - return self.cmd( cmd % self.onosPid ) == 'running' 278 + 'echo "not running"' )
235 - 279 + return self.cmd( cmd % self.onosPid ).strip() == 'running'
236 - def sanityCheck( self ): 280 +
237 - "Check whether we've quit or are running out of memory" 281 + def checkLog( self ):
282 + "Return log file errors and warnings"
283 + log = join( self.dir, 'log' )
284 + errors, warnings = None, None
285 + if isfile( log ):
286 + lines = open( log ).read().split( '\n' )
287 + errors = [ line for line in lines if 'ERROR' in line ]
288 + warnings = [ line for line in lines if 'WARN'in line ]
289 + return errors, warnings
290 +
291 + def memAvailable( self ):
292 + "Return available memory in KB (or -1 if we can't tell)"
293 + lines = open( '/proc/meminfo' ).read().strip().split( '\n' )
294 + entries = map( str.split, lines )
295 + index = { entry[ 0 ]: entry for entry in entries }
296 + # Check MemAvailable if present
297 + default = ( None, '-1', 'kB' )
298 + _name, count, unit = index.get( 'MemAvailable:', default )
299 + if unit.lower() == 'kb':
300 + return int( count )
301 + return -1
302 +
303 + def sanityCheck( self, lowMem=100000 ):
304 + """Check whether we've quit or are running out of memory
305 + lowMem: low memory threshold in KB (100000)"""
306 + # Are we still running?
238 if not self.isRunning(): 307 if not self.isRunning():
239 - raise Exception( 'ONOS node %s has died' % self.name ) 308 + self.sanityAlert( 'ONOS node %s has died' % self.name )
309 + # Are there errors in the log file?
310 + errors, warnings = self.checkLog()
311 + if errors:
312 + self.sanityAlert( 'ONOS startup errors:\n<<%s>>' %
313 + '\n'.join( errors ) )
314 + warningCount = len( warnings )
315 + if warnings and warningCount > self.warningCount:
316 + warn( '(%d warnings)' % len( warnings ) )
317 + self.warningCount = warningCount
318 + # Are we running out of memory?
319 + mem = self.memAvailable()
320 + if mem > 0 and mem < lowMem:
321 + self.sanityAlert( 'Running out of memory (only %d KB available)'
322 + % mem )
240 323
241 def waitStarted( self ): 324 def waitStarted( self ):
242 "Wait until we've really started" 325 "Wait until we've really started"
...@@ -246,11 +329,13 @@ class ONOSNode( Controller ): ...@@ -246,11 +329,13 @@ class ONOSNode( Controller ):
246 if 'running' in status and 'not running' not in status: 329 if 'running' in status and 'not running' not in status:
247 break 330 break
248 info( '.' ) 331 info( '.' )
332 + self.sanityCheck()
249 time.sleep( 1 ) 333 time.sleep( 1 )
250 info( ' ssh-port' ) 334 info( ' ssh-port' )
251 - waitListening( server=self, port=KarafPort ) 335 + waitListening( server=self, port=KarafPort, callback=self.sanityCheck )
252 info( ' openflow-port' ) 336 info( ' openflow-port' )
253 - waitListening( server=self, port=OpenFlowPort ) 337 + waitListening( server=self, port=OpenFlowPort,
338 + callback=self.sanityCheck )
254 info( ' client' ) 339 info( ' client' )
255 while True: 340 while True:
256 result = quietRun( 'echo apps -a | %s -h %s' % 341 result = quietRun( 'echo apps -a | %s -h %s' %
...@@ -258,6 +343,7 @@ class ONOSNode( Controller ): ...@@ -258,6 +343,7 @@ class ONOSNode( Controller ):
258 if 'openflow' in result: 343 if 'openflow' in result:
259 break 344 break
260 info( '.' ) 345 info( '.' )
346 + self.sanityCheck()
261 time.sleep( 1 ) 347 time.sleep( 1 )
262 info( ')\n' ) 348 info( ')\n' )
263 349
...@@ -284,11 +370,13 @@ class ONOSCluster( Controller ): ...@@ -284,11 +370,13 @@ class ONOSCluster( Controller ):
284 ipBase: IP range for ONOS nodes 370 ipBase: IP range for ONOS nodes
285 forward: default port forwarding list, 371 forward: default port forwarding list,
286 topo: topology class or instance 372 topo: topology class or instance
373 + nodeOpts: ONOSNode options
287 **kwargs: additional topology parameters""" 374 **kwargs: additional topology parameters"""
288 args = list( args ) 375 args = list( args )
289 name = args.pop( 0 ) 376 name = args.pop( 0 )
290 topo = kwargs.pop( 'topo', None ) 377 topo = kwargs.pop( 'topo', None )
291 nat = kwargs.pop( 'nat', 'nat0' ) 378 nat = kwargs.pop( 'nat', 'nat0' )
379 + nodeOpts = kwargs.pop( 'nodeOpts', {} )
292 # Default: single switch with 1 ONOS node 380 # Default: single switch with 1 ONOS node
293 if not topo: 381 if not topo:
294 topo = SingleSwitchTopo 382 topo = SingleSwitchTopo
...@@ -303,7 +391,8 @@ class ONOSCluster( Controller ): ...@@ -303,7 +391,8 @@ class ONOSCluster( Controller ):
303 fixIPTables() 391 fixIPTables()
304 self.env = initONOSEnv() 392 self.env = initONOSEnv()
305 self.net = Mininet( topo=topo, ipBase=self.ipBase, 393 self.net = Mininet( topo=topo, ipBase=self.ipBase,
306 - host=ONOSNode, switch=LinuxBridge, 394 + host=partial( ONOSNode, **nodeOpts ),
395 + switch=LinuxBridge,
307 controller=None ) 396 controller=None )
308 if nat: 397 if nat:
309 self.net.addNAT( nat ).configDefault() 398 self.net.addNAT( nat ).configDefault()
...@@ -441,15 +530,37 @@ class ONOSCLI( OldCLI ): ...@@ -441,15 +530,37 @@ class ONOSCLI( OldCLI ):
441 "Run tail -f /tmp/onos1/log; press control-C to stop" 530 "Run tail -f /tmp/onos1/log; press control-C to stop"
442 self.default( self.onos1().name, 'tail -f /tmp/%s/log' % self.onos1() ) 531 self.default( self.onos1().name, 'tail -f /tmp/%s/log' % self.onos1() )
443 532
533 + def do_status( self, line ):
534 + "Return status of ONOS cluster(s)"
535 + for c in self.mn.controllers:
536 + if isinstance( c, ONOSCluster ):
537 + for node in c.net.hosts:
538 + if isinstance( node, ONOSNode ):
539 + errors, warnings = node.checkLog()
540 + running = ( 'Running' if node.isRunning()
541 + else 'Exited' )
542 + status = ''
543 + if errors:
544 + status += '%d ERRORS ' % len( errors )
545 + if warnings:
546 + status += '%d warnings' % len( warnings )
547 + status = status if status else 'OK'
548 + info( node, '\t', running, '\t', status, '\n' )
549 +
550 +
551 +# For interactive use, exit on error
552 +exitOnError = dict( nodeOpts={ 'alertAction': 'exit' } )
553 +ONOSClusterInteractive = specialClass( ONOSCluster, defaults=exitOnError )
554 +
444 555
445 ### Exports for bin/mn 556 ### Exports for bin/mn
446 557
447 CLI = ONOSCLI 558 CLI = ONOSCLI
448 - 559 +controllers = { 'onos': ONOSClusterInteractive,
449 -controllers = { 'onos': ONOSCluster, 'default': ONOSCluster } 560 + 'default': ONOSClusterInteractive }
450 561
451 # XXX Hack to change default controller as above doesn't work 562 # XXX Hack to change default controller as above doesn't work
452 -findController = lambda: ONOSCluster 563 +findController = lambda: controllers[ 'default' ]
453 564
454 switches = { 'onos': ONOSOVSSwitch, 565 switches = { 'onos': ONOSOVSSwitch,
455 'onosovs': ONOSOVSSwitch, 566 'onosovs': ONOSOVSSwitch,
......