Committed by
Brian O'Connor
Add status/error checks during ONOS startup
We check for:
- process exits
- log ERRORs
- log WARNings
- low memory

Also added a "status" CLI command to check for exits and errors.

Change-Id: I6e65fe49619e338f1827fc616ca52a82de7ba470
Showing 1 changed file with 127 additions and 16 deletions
| ... | @@ -44,10 +44,9 @@ from mininet.node import Controller, OVSSwitch, UserSwitch | ... | @@ -44,10 +44,9 @@ from mininet.node import Controller, OVSSwitch, UserSwitch |
| 44 | from mininet.nodelib import LinuxBridge | 44 | from mininet.nodelib import LinuxBridge |
| 45 | from mininet.net import Mininet | 45 | from mininet.net import Mininet |
| 46 | from mininet.topo import SingleSwitchTopo, Topo | 46 | from mininet.topo import SingleSwitchTopo, Topo |
| 47 | -from mininet.log import setLogLevel, info | 47 | +from mininet.log import setLogLevel, info, warn, error, debug |
| 48 | from mininet.cli import CLI | 48 | from mininet.cli import CLI |
| 49 | -from mininet.util import quietRun, waitListening | 49 | +from mininet.util import quietRun, specialClass |
| 50 | -from mininet.clean import killprocs | ||
| 51 | from mininet.examples.controlnet import MininetFacade | 50 | from mininet.examples.controlnet import MininetFacade |
| 52 | 51 | ||
| 53 | from os import environ | 52 | from os import environ |
| ... | @@ -55,7 +54,7 @@ from os.path import dirname, join, isfile | ... | @@ -55,7 +54,7 @@ from os.path import dirname, join, isfile |
| 55 | from sys import argv | 54 | from sys import argv |
| 56 | from glob import glob | 55 | from glob import glob |
| 57 | import time | 56 | import time |
| 58 | - | 57 | +from functools import partial |
| 59 | 58 | ||
| 60 | ### ONOS Environment | 59 | ### ONOS Environment |
| 61 | 60 | ||
| ... | @@ -150,6 +149,36 @@ def unpackONOS( destDir='/tmp', run=quietRun ): | ... | @@ -150,6 +149,36 @@ def unpackONOS( destDir='/tmp', run=quietRun ): |
| 150 | return onosDir | 149 | return onosDir |
| 151 | 150 | ||
| 152 | 151 | ||
def waitListening( client=None, server='127.0.0.1', port=80, timeout=None,
                   callback=None, sleepSecs=.5 ):
    """Wait until server is listening on port (probed with telnet)
       Modified mininet.util.waitListening with callback, sleepSecs
       client: node whose cmd() runs the probe (default: quietRun locally)
       server: IP address string, or an object providing IP()
       port: TCP port to probe
       timeout: give up once the accumulated sleep time reaches this
                many seconds (None or 0: keep trying)
       callback: optional function called before each retry sleep
       sleepSecs: seconds to sleep between probes
       returns: True once connected; False on timeout or 'No route'
       raises: Exception if telnet cannot be found"""
    runCmd = ( client.cmd if client else
               partial( quietRun, shell=True ) )
    if not runCmd( 'which telnet' ):
        raise Exception('Could not find telnet' )
    # Duck-type rather than isinstance( server, basestring ): basestring
    # does not exist in Python 3, and only node objects provide IP()
    serverIP = server.IP() if hasattr( server, 'IP' ) else server
    cmd = ( 'echo A | telnet -e A %s %s' % ( serverIP, port ) )
    elapsed = 0
    result = runCmd( cmd )
    while 'Connected' not in result:
        if 'No route' in result:
            # Retrying cannot help; show the routing table and give up
            rtable = runCmd( 'route' )
            error( 'no route to %s:\n%s' % ( server, rtable ) )
            return False
        if timeout and elapsed >= timeout:
            error( 'could not connect to %s on port %d\n' % ( server, port ) )
            return False
        debug( 'waiting for', server, 'to listen on port', port, '\n' )
        info( '.' )
        # Let the caller run its own checks while we wait
        if callback:
            callback()
        time.sleep( sleepSecs )
        # Only sleep time is counted, not the time the probe itself takes
        elapsed += sleepSecs
        result = runCmd( cmd )
    return True
| 180 | + | ||
| 181 | + | ||
| 153 | ### Mininet classes | 182 | ### Mininet classes |
| 154 | 183 | ||
| 155 | def RenamedTopo( topo, *args, **kwargs ): | 184 | def RenamedTopo( topo, *args, **kwargs ): |
| ... | @@ -191,7 +220,9 @@ class ONOSNode( Controller ): | ... | @@ -191,7 +220,9 @@ class ONOSNode( Controller ): |
| 191 | "ONOS cluster node" | 220 | "ONOS cluster node" |
| 192 | 221 | ||
| 193 | def __init__( self, name, **kwargs ): | 222 | def __init__( self, name, **kwargs ): |
| 223 | + "alertAction: exception|ignore|warn|exit (exception)" | ||
| 194 | kwargs.update( inNamespace=True ) | 224 | kwargs.update( inNamespace=True ) |
| 225 | + self.alertAction = kwargs.pop( 'alertAction', 'exception' ) | ||
| 195 | Controller.__init__( self, name, **kwargs ) | 226 | Controller.__init__( self, name, **kwargs ) |
| 196 | self.dir = '/tmp/%s' % self.name | 227 | self.dir = '/tmp/%s' % self.name |
| 197 | self.client = self.dir + '/karaf/bin/client' | 228 | self.client = self.dir + '/karaf/bin/client' |
| ... | @@ -220,6 +251,7 @@ class ONOSNode( Controller ): | ... | @@ -220,6 +251,7 @@ class ONOSNode( Controller ): |
| 220 | self.ucmd( service, 'server 1>../onos.log 2>../onos.log' | 251 | self.ucmd( service, 'server 1>../onos.log 2>../onos.log' |
| 221 | ' & echo $! > onos.pid; ln -s `pwd`/onos.pid ..' ) | 252 | ' & echo $! > onos.pid; ln -s `pwd`/onos.pid ..' ) |
| 222 | self.onosPid = int( self.cmd( 'cat onos.pid' ).strip() ) | 253 | self.onosPid = int( self.cmd( 'cat onos.pid' ).strip() ) |
| 254 | + self.warningCount = 0 | ||
| 223 | 255 | ||
| 224 | # pylint: enable=arguments-differ | 256 | # pylint: enable=arguments-differ |
| 225 | 257 | ||
| ... | @@ -228,15 +260,66 @@ class ONOSNode( Controller ): | ... | @@ -228,15 +260,66 @@ class ONOSNode( Controller ): |
| 228 | self.cmd( 'pkill -HUP -f karaf.jar && wait' ) | 260 | self.cmd( 'pkill -HUP -f karaf.jar && wait' ) |
| 229 | self.cmd( 'rm -rf', self.dir ) | 261 | self.cmd( 'rm -rf', self.dir ) |
| 230 | 262 | ||
def sanityAlert( self, *args ):
    """Alert to raise on sanityCheck failure
       args: message fragments for the exception or log output
       Dispatches on self.alertAction:
       'exception': raise Exception( *args )
       'warn': log a warning and continue
       'exit': log an error and exit with status 1
       anything else (e.g. 'ignore'): no further action"""
    # End the current line of progress output before reporting
    info( '\n' )
    if self.alertAction == 'exception':
        raise Exception( *args )
    if self.alertAction == 'warn':
        warn( *args + ( '\n', ) )
    elif self.alertAction == 'exit':
        error( '***', *args +
               ( '\nExiting. Run "sudo mn -c" to clean up.\n', ) )
        # The exit() builtin is installed by the site module for
        # interactive use and may be missing (e.g. python -S);
        # raising SystemExit has the same effect without relying on it
        raise SystemExit( 1 )
| 274 | + | ||
def isRunning( self ):
    "Report whether the ONOS process (self.onosPid) is still alive"
    # The shell prints the verdict, selected by the exit status of ps -p
    template = ( 'ps -p %d >/dev/null 2>&1 '
                 '&& echo "running" ||echo "not running"' )
    output = self.cmd( template % self.onosPid )
    # Drop surrounding whitespace/newlines before comparing
    verdict = output.strip()
    return verdict == 'running'
| 236 | - def sanityCheck( self ): | 280 | + |
def checkLog( self ):
    """Return log file errors and warnings
       returns: ( errors, warnings ), the lists of lines in
       self.dir/log containing 'ERROR' and 'WARN' respectively;
       both lists are empty if the log file does not exist"""
    log = join( self.dir, 'log' )
    # Always return lists (never None) so callers can safely use len()
    errors, warnings = [], []
    if isfile( log ):
        # Use a context manager so the file handle is closed right away
        with open( log ) as logFile:
            lines = logFile.read().split( '\n' )
        errors = [ line for line in lines if 'ERROR' in line ]
        warnings = [ line for line in lines if 'WARN' in line ]
    return errors, warnings
| 290 | + | ||
def memAvailable( self, meminfo='/proc/meminfo' ):
    """Return available memory in KB (or -1 if we can't tell)
       meminfo: path of the meminfo-format file to read"""
    try:
        # Context manager closes the file even if reading fails
        with open( meminfo ) as memFile:
            lines = memFile.read().strip().split( '\n' )
    except ( IOError, OSError ):
        # No readable meminfo file: we can't tell
        return -1
    # Index entries by their first field (e.g. 'MemAvailable:'),
    # skipping blank lines, which have no fields at all
    entries = [ line.split() for line in lines if line.split() ]
    index = { entry[ 0 ]: entry for entry in entries }
    # Check MemAvailable if present
    default = ( None, '-1', 'kB' )
    entry = index.get( 'MemAvailable:', default )
    # Only trust a well-formed 'name count unit' entry reported in kB
    if len( entry ) == 3 and entry[ 2 ].lower() == 'kb':
        return int( entry[ 1 ] )
    return -1
| 302 | + | ||
def sanityCheck( self, lowMem=100000 ):
    """Check whether we've quit or are running out of memory
       lowMem: low memory threshold in KB (100000)
       Failures are reported through self.sanityAlert(); an increase
       in the log warning count is only logged via warn()"""
    # Are we still running?
    if not self.isRunning():
        self.sanityAlert( 'ONOS node %s has died' % self.name )
    # Are there errors in the log file?
    errors, warnings = self.checkLog()
    if errors:
        self.sanityAlert( 'ONOS startup errors:\n<<%s>>' %
                          '\n'.join( errors ) )
    # Guard against a missing (None) warning list before calling len()
    warningCount = len( warnings ) if warnings else 0
    # Only mention warnings when the count has grown since the last check
    if warningCount > self.warningCount:
        warn( '(%d warnings)' % warningCount )
        self.warningCount = warningCount
    # Are we running out of memory? (memAvailable() <= 0: unknown)
    mem = self.memAvailable()
    if 0 < mem < lowMem:
        self.sanityAlert( 'Running out of memory (only %d KB available)'
                          % mem )
| 240 | 323 | ||
| 241 | def waitStarted( self ): | 324 | def waitStarted( self ): |
| 242 | "Wait until we've really started" | 325 | "Wait until we've really started" |
| ... | @@ -246,11 +329,13 @@ class ONOSNode( Controller ): | ... | @@ -246,11 +329,13 @@ class ONOSNode( Controller ): |
| 246 | if 'running' in status and 'not running' not in status: | 329 | if 'running' in status and 'not running' not in status: |
| 247 | break | 330 | break |
| 248 | info( '.' ) | 331 | info( '.' ) |
| 332 | + self.sanityCheck() | ||
| 249 | time.sleep( 1 ) | 333 | time.sleep( 1 ) |
| 250 | info( ' ssh-port' ) | 334 | info( ' ssh-port' ) |
| 251 | - waitListening( server=self, port=KarafPort ) | 335 | + waitListening( server=self, port=KarafPort, callback=self.sanityCheck ) |
| 252 | info( ' openflow-port' ) | 336 | info( ' openflow-port' ) |
| 253 | - waitListening( server=self, port=OpenFlowPort ) | 337 | + waitListening( server=self, port=OpenFlowPort, |
| 338 | + callback=self.sanityCheck ) | ||
| 254 | info( ' client' ) | 339 | info( ' client' ) |
| 255 | while True: | 340 | while True: |
| 256 | result = quietRun( 'echo apps -a | %s -h %s' % | 341 | result = quietRun( 'echo apps -a | %s -h %s' % |
| ... | @@ -258,6 +343,7 @@ class ONOSNode( Controller ): | ... | @@ -258,6 +343,7 @@ class ONOSNode( Controller ): |
| 258 | if 'openflow' in result: | 343 | if 'openflow' in result: |
| 259 | break | 344 | break |
| 260 | info( '.' ) | 345 | info( '.' ) |
| 346 | + self.sanityCheck() | ||
| 261 | time.sleep( 1 ) | 347 | time.sleep( 1 ) |
| 262 | info( ')\n' ) | 348 | info( ')\n' ) |
| 263 | 349 | ||
| ... | @@ -284,11 +370,13 @@ class ONOSCluster( Controller ): | ... | @@ -284,11 +370,13 @@ class ONOSCluster( Controller ): |
| 284 | ipBase: IP range for ONOS nodes | 370 | ipBase: IP range for ONOS nodes |
| 285 | forward: default port forwarding list, | 371 | forward: default port forwarding list, |
| 286 | topo: topology class or instance | 372 | topo: topology class or instance |
| 373 | + nodeOpts: ONOSNode options | ||
| 287 | **kwargs: additional topology parameters""" | 374 | **kwargs: additional topology parameters""" |
| 288 | args = list( args ) | 375 | args = list( args ) |
| 289 | name = args.pop( 0 ) | 376 | name = args.pop( 0 ) |
| 290 | topo = kwargs.pop( 'topo', None ) | 377 | topo = kwargs.pop( 'topo', None ) |
| 291 | nat = kwargs.pop( 'nat', 'nat0' ) | 378 | nat = kwargs.pop( 'nat', 'nat0' ) |
| 379 | + nodeOpts = kwargs.pop( 'nodeOpts', {} ) | ||
| 292 | # Default: single switch with 1 ONOS node | 380 | # Default: single switch with 1 ONOS node |
| 293 | if not topo: | 381 | if not topo: |
| 294 | topo = SingleSwitchTopo | 382 | topo = SingleSwitchTopo |
| ... | @@ -303,7 +391,8 @@ class ONOSCluster( Controller ): | ... | @@ -303,7 +391,8 @@ class ONOSCluster( Controller ): |
| 303 | fixIPTables() | 391 | fixIPTables() |
| 304 | self.env = initONOSEnv() | 392 | self.env = initONOSEnv() |
| 305 | self.net = Mininet( topo=topo, ipBase=self.ipBase, | 393 | self.net = Mininet( topo=topo, ipBase=self.ipBase, |
| 306 | - host=ONOSNode, switch=LinuxBridge, | 394 | + host=partial( ONOSNode, **nodeOpts ), |
| 395 | + switch=LinuxBridge, | ||
| 307 | controller=None ) | 396 | controller=None ) |
| 308 | if nat: | 397 | if nat: |
| 309 | self.net.addNAT( nat ).configDefault() | 398 | self.net.addNAT( nat ).configDefault() |
| ... | @@ -441,15 +530,37 @@ class ONOSCLI( OldCLI ): | ... | @@ -441,15 +530,37 @@ class ONOSCLI( OldCLI ): |
| 441 | "Run tail -f /tmp/onos1/log; press control-C to stop" | 530 | "Run tail -f /tmp/onos1/log; press control-C to stop" |
| 442 | self.default( self.onos1().name, 'tail -f /tmp/%s/log' % self.onos1() ) | 531 | self.default( self.onos1().name, 'tail -f /tmp/%s/log' % self.onos1() ) |
| 443 | 532 | ||
def do_status( self, line ):
    "Return status of ONOS cluster(s)"
    # One output line per ONOS node: name, process state, log summary
    for controller in self.mn.controllers:
        if not isinstance( controller, ONOSCluster ):
            continue
        for host in controller.net.hosts:
            if not isinstance( host, ONOSNode ):
                continue
            errors, warnings = host.checkLog()
            if host.isRunning():
                running = 'Running'
            else:
                running = 'Exited'
            # Summarize log problems, or report OK if there are none
            parts = []
            if errors:
                parts.append( '%d ERRORS ' % len( errors ) )
            if warnings:
                parts.append( '%d warnings' % len( warnings ) )
            status = ''.join( parts ) or 'OK'
            info( host, '\t', running, '\t', status, '\n' )
| 549 | + | ||
| 550 | + | ||
| 551 | +# For interactive use, exit on error | ||
| 552 | +exitOnError = dict( nodeOpts={ 'alertAction': 'exit' } ) | ||
| 553 | +ONOSClusterInteractive = specialClass( ONOSCluster, defaults=exitOnError ) | ||
| 554 | + | ||
| 444 | 555 | ||
| 445 | ### Exports for bin/mn | 556 | ### Exports for bin/mn |
| 446 | 557 | ||
| 447 | CLI = ONOSCLI | 558 | CLI = ONOSCLI |
| 448 | - | 559 | +controllers = { 'onos': ONOSClusterInteractive, |
| 449 | -controllers = { 'onos': ONOSCluster, 'default': ONOSCluster } | 560 | + 'default': ONOSClusterInteractive } |
| 450 | 561 | ||
| 451 | # XXX Hack to change default controller as above doesn't work | 562 | # XXX Hack to change default controller as above doesn't work |
| 452 | -findController = lambda: ONOSCluster | 563 | +findController = lambda: controllers[ 'default' ] |
| 453 | 564 | ||
| 454 | switches = { 'onos': ONOSOVSSwitch, | 565 | switches = { 'onos': ONOSOVSSwitch, |
| 455 | 'onosovs': ONOSOVSSwitch, | 566 | 'onosovs': ONOSOVSSwitch, | ... | ... |
-
Please register or login to post a comment