Committed by
Brian O'Connor
Add status/error checks during ONOS startup
We check for: - process exits - log ERRORs - log WARNings - low memory Also added a "status" CLI command to check for exits and errors. Change-Id: I6e65fe49619e338f1827fc616ca52a82de7ba470
Showing
1 changed file
with
127 additions
and
16 deletions
... | @@ -44,10 +44,9 @@ from mininet.node import Controller, OVSSwitch, UserSwitch | ... | @@ -44,10 +44,9 @@ from mininet.node import Controller, OVSSwitch, UserSwitch |
44 | from mininet.nodelib import LinuxBridge | 44 | from mininet.nodelib import LinuxBridge |
45 | from mininet.net import Mininet | 45 | from mininet.net import Mininet |
46 | from mininet.topo import SingleSwitchTopo, Topo | 46 | from mininet.topo import SingleSwitchTopo, Topo |
47 | -from mininet.log import setLogLevel, info | 47 | +from mininet.log import setLogLevel, info, warn, error, debug |
48 | from mininet.cli import CLI | 48 | from mininet.cli import CLI |
49 | -from mininet.util import quietRun, waitListening | 49 | +from mininet.util import quietRun, specialClass |
50 | -from mininet.clean import killprocs | ||
51 | from mininet.examples.controlnet import MininetFacade | 50 | from mininet.examples.controlnet import MininetFacade |
52 | 51 | ||
53 | from os import environ | 52 | from os import environ |
... | @@ -55,7 +54,7 @@ from os.path import dirname, join, isfile | ... | @@ -55,7 +54,7 @@ from os.path import dirname, join, isfile |
55 | from sys import argv | 54 | from sys import argv |
56 | from glob import glob | 55 | from glob import glob |
57 | import time | 56 | import time |
58 | - | 57 | +from functools import partial |
59 | 58 | ||
60 | ### ONOS Environment | 59 | ### ONOS Environment |
61 | 60 | ||
... | @@ -150,6 +149,36 @@ def unpackONOS( destDir='/tmp', run=quietRun ): | ... | @@ -150,6 +149,36 @@ def unpackONOS( destDir='/tmp', run=quietRun ): |
150 | return onosDir | 149 | return onosDir |
151 | 150 | ||
152 | 151 | ||
152 | +def waitListening( client=None, server='127.0.0.1', port=80, timeout=None, | ||
153 | + callback=None, sleepSecs=.5 ): | ||
154 | + "Modified mininet.util.waitListening with callback, sleepSecs" | ||
155 | + runCmd = ( client.cmd if client else | ||
156 | + partial( quietRun, shell=True ) ) | ||
157 | + if not runCmd( 'which telnet' ): | ||
158 | + raise Exception('Could not find telnet' ) | ||
159 | + # pylint: disable=maybe-no-member | ||
160 | + serverIP = server if isinstance( server, basestring ) else server.IP() | ||
161 | + cmd = ( 'echo A | telnet -e A %s %s' % ( serverIP, port ) ) | ||
162 | + elapsed = 0 | ||
163 | + result = runCmd( cmd ) | ||
164 | + while 'Connected' not in result: | ||
165 | + if 'No route' in result: | ||
166 | + rtable = runCmd( 'route' ) | ||
167 | + error( 'no route to %s:\n%s' % ( server, rtable ) ) | ||
168 | + return False | ||
169 | + if timeout and elapsed >= timeout: | ||
170 | + error( 'could not connect to %s on port %d\n' % ( server, port ) ) | ||
171 | + return False | ||
172 | + debug( 'waiting for', server, 'to listen on port', port, '\n' ) | ||
173 | + info( '.' ) | ||
174 | + if callback: | ||
175 | + callback() | ||
176 | + time.sleep( sleepSecs ) | ||
177 | + elapsed += sleepSecs | ||
178 | + result = runCmd( cmd ) | ||
179 | + return True | ||
180 | + | ||
181 | + | ||
153 | ### Mininet classes | 182 | ### Mininet classes |
154 | 183 | ||
155 | def RenamedTopo( topo, *args, **kwargs ): | 184 | def RenamedTopo( topo, *args, **kwargs ): |
... | @@ -191,7 +220,9 @@ class ONOSNode( Controller ): | ... | @@ -191,7 +220,9 @@ class ONOSNode( Controller ): |
191 | "ONOS cluster node" | 220 | "ONOS cluster node" |
192 | 221 | ||
193 | def __init__( self, name, **kwargs ): | 222 | def __init__( self, name, **kwargs ): |
223 | + "alertAction: exception|ignore|warn|exit (exception)" | ||
194 | kwargs.update( inNamespace=True ) | 224 | kwargs.update( inNamespace=True ) |
225 | + self.alertAction = kwargs.pop( 'alertAction', 'exception' ) | ||
195 | Controller.__init__( self, name, **kwargs ) | 226 | Controller.__init__( self, name, **kwargs ) |
196 | self.dir = '/tmp/%s' % self.name | 227 | self.dir = '/tmp/%s' % self.name |
197 | self.client = self.dir + '/karaf/bin/client' | 228 | self.client = self.dir + '/karaf/bin/client' |
... | @@ -220,6 +251,7 @@ class ONOSNode( Controller ): | ... | @@ -220,6 +251,7 @@ class ONOSNode( Controller ): |
220 | self.ucmd( service, 'server 1>../onos.log 2>../onos.log' | 251 | self.ucmd( service, 'server 1>../onos.log 2>../onos.log' |
221 | ' & echo $! > onos.pid; ln -s `pwd`/onos.pid ..' ) | 252 | ' & echo $! > onos.pid; ln -s `pwd`/onos.pid ..' ) |
222 | self.onosPid = int( self.cmd( 'cat onos.pid' ).strip() ) | 253 | self.onosPid = int( self.cmd( 'cat onos.pid' ).strip() ) |
254 | + self.warningCount = 0 | ||
223 | 255 | ||
224 | # pylint: enable=arguments-differ | 256 | # pylint: enable=arguments-differ |
225 | 257 | ||
... | @@ -228,15 +260,66 @@ class ONOSNode( Controller ): | ... | @@ -228,15 +260,66 @@ class ONOSNode( Controller ): |
228 | self.cmd( 'pkill -HUP -f karaf.jar && wait' ) | 260 | self.cmd( 'pkill -HUP -f karaf.jar && wait' ) |
229 | self.cmd( 'rm -rf', self.dir ) | 261 | self.cmd( 'rm -rf', self.dir ) |
230 | 262 | ||
263 | + def sanityAlert( self, *args ): | ||
264 | + "Alert to raise on sanityCheck failure" | ||
265 | + info( '\n' ) | ||
266 | + if self.alertAction == 'exception': | ||
267 | + raise Exception( *args ) | ||
268 | + if self.alertAction == 'warn': | ||
269 | + warn( *args + ( '\n', ) ) | ||
270 | + elif self.alertAction == 'exit': | ||
271 | + error( '***', *args + | ||
272 | + ( '\nExiting. Run "sudo mn -c" to clean up.\n', ) ) | ||
273 | + exit( 1 ) | ||
274 | + | ||
231 | def isRunning( self ): | 275 | def isRunning( self ): |
232 | "Is our ONOS process still running?" | 276 | "Is our ONOS process still running?" |
233 | - cmd = 'ps -p %d >/dev/null 2>&1 && echo "running" || echo "not running"' | 277 | + cmd = ( 'ps -p %d >/dev/null 2>&1 && echo "running" ||' |
234 | - return self.cmd( cmd % self.onosPid ) == 'running' | 278 | + 'echo "not running"' ) |
235 | - | 279 | + return self.cmd( cmd % self.onosPid ).strip() == 'running' |
236 | - def sanityCheck( self ): | 280 | + |
237 | - "Check whether we've quit or are running out of memory" | 281 | + def checkLog( self ): |
282 | + "Return log file errors and warnings" | ||
283 | + log = join( self.dir, 'log' ) | ||
284 | + errors, warnings = None, None | ||
285 | + if isfile( log ): | ||
286 | + lines = open( log ).read().split( '\n' ) | ||
287 | + errors = [ line for line in lines if 'ERROR' in line ] | ||
288 | + warnings = [ line for line in lines if 'WARN'in line ] | ||
289 | + return errors, warnings | ||
290 | + | ||
291 | + def memAvailable( self ): | ||
292 | + "Return available memory in KB (or -1 if we can't tell)" | ||
293 | + lines = open( '/proc/meminfo' ).read().strip().split( '\n' ) | ||
294 | + entries = map( str.split, lines ) | ||
295 | + index = { entry[ 0 ]: entry for entry in entries } | ||
296 | + # Check MemAvailable if present | ||
297 | + default = ( None, '-1', 'kB' ) | ||
298 | + _name, count, unit = index.get( 'MemAvailable:', default ) | ||
299 | + if unit.lower() == 'kb': | ||
300 | + return int( count ) | ||
301 | + return -1 | ||
302 | + | ||
303 | + def sanityCheck( self, lowMem=100000 ): | ||
304 | + """Check whether we've quit or are running out of memory | ||
305 | + lowMem: low memory threshold in KB (100000)""" | ||
306 | + # Are we still running? | ||
238 | if not self.isRunning(): | 307 | if not self.isRunning(): |
239 | - raise Exception( 'ONOS node %s has died' % self.name ) | 308 | + self.sanityAlert( 'ONOS node %s has died' % self.name ) |
309 | + # Are there errors in the log file? | ||
310 | + errors, warnings = self.checkLog() | ||
311 | + if errors: | ||
312 | + self.sanityAlert( 'ONOS startup errors:\n<<%s>>' % | ||
313 | + '\n'.join( errors ) ) | ||
314 | + warningCount = len( warnings ) | ||
315 | + if warnings and warningCount > self.warningCount: | ||
316 | + warn( '(%d warnings)' % len( warnings ) ) | ||
317 | + self.warningCount = warningCount | ||
318 | + # Are we running out of memory? | ||
319 | + mem = self.memAvailable() | ||
320 | + if mem > 0 and mem < lowMem: | ||
321 | + self.sanityAlert( 'Running out of memory (only %d KB available)' | ||
322 | + % mem ) | ||
240 | 323 | ||
241 | def waitStarted( self ): | 324 | def waitStarted( self ): |
242 | "Wait until we've really started" | 325 | "Wait until we've really started" |
... | @@ -246,11 +329,13 @@ class ONOSNode( Controller ): | ... | @@ -246,11 +329,13 @@ class ONOSNode( Controller ): |
246 | if 'running' in status and 'not running' not in status: | 329 | if 'running' in status and 'not running' not in status: |
247 | break | 330 | break |
248 | info( '.' ) | 331 | info( '.' ) |
332 | + self.sanityCheck() | ||
249 | time.sleep( 1 ) | 333 | time.sleep( 1 ) |
250 | info( ' ssh-port' ) | 334 | info( ' ssh-port' ) |
251 | - waitListening( server=self, port=KarafPort ) | 335 | + waitListening( server=self, port=KarafPort, callback=self.sanityCheck ) |
252 | info( ' openflow-port' ) | 336 | info( ' openflow-port' ) |
253 | - waitListening( server=self, port=OpenFlowPort ) | 337 | + waitListening( server=self, port=OpenFlowPort, |
338 | + callback=self.sanityCheck ) | ||
254 | info( ' client' ) | 339 | info( ' client' ) |
255 | while True: | 340 | while True: |
256 | result = quietRun( 'echo apps -a | %s -h %s' % | 341 | result = quietRun( 'echo apps -a | %s -h %s' % |
... | @@ -258,6 +343,7 @@ class ONOSNode( Controller ): | ... | @@ -258,6 +343,7 @@ class ONOSNode( Controller ): |
258 | if 'openflow' in result: | 343 | if 'openflow' in result: |
259 | break | 344 | break |
260 | info( '.' ) | 345 | info( '.' ) |
346 | + self.sanityCheck() | ||
261 | time.sleep( 1 ) | 347 | time.sleep( 1 ) |
262 | info( ')\n' ) | 348 | info( ')\n' ) |
263 | 349 | ||
... | @@ -284,11 +370,13 @@ class ONOSCluster( Controller ): | ... | @@ -284,11 +370,13 @@ class ONOSCluster( Controller ): |
284 | ipBase: IP range for ONOS nodes | 370 | ipBase: IP range for ONOS nodes |
285 | forward: default port forwarding list, | 371 | forward: default port forwarding list, |
286 | topo: topology class or instance | 372 | topo: topology class or instance |
373 | + nodeOpts: ONOSNode options | ||
287 | **kwargs: additional topology parameters""" | 374 | **kwargs: additional topology parameters""" |
288 | args = list( args ) | 375 | args = list( args ) |
289 | name = args.pop( 0 ) | 376 | name = args.pop( 0 ) |
290 | topo = kwargs.pop( 'topo', None ) | 377 | topo = kwargs.pop( 'topo', None ) |
291 | nat = kwargs.pop( 'nat', 'nat0' ) | 378 | nat = kwargs.pop( 'nat', 'nat0' ) |
379 | + nodeOpts = kwargs.pop( 'nodeOpts', {} ) | ||
292 | # Default: single switch with 1 ONOS node | 380 | # Default: single switch with 1 ONOS node |
293 | if not topo: | 381 | if not topo: |
294 | topo = SingleSwitchTopo | 382 | topo = SingleSwitchTopo |
... | @@ -303,7 +391,8 @@ class ONOSCluster( Controller ): | ... | @@ -303,7 +391,8 @@ class ONOSCluster( Controller ): |
303 | fixIPTables() | 391 | fixIPTables() |
304 | self.env = initONOSEnv() | 392 | self.env = initONOSEnv() |
305 | self.net = Mininet( topo=topo, ipBase=self.ipBase, | 393 | self.net = Mininet( topo=topo, ipBase=self.ipBase, |
306 | - host=ONOSNode, switch=LinuxBridge, | 394 | + host=partial( ONOSNode, **nodeOpts ), |
395 | + switch=LinuxBridge, | ||
307 | controller=None ) | 396 | controller=None ) |
308 | if nat: | 397 | if nat: |
309 | self.net.addNAT( nat ).configDefault() | 398 | self.net.addNAT( nat ).configDefault() |
... | @@ -441,15 +530,37 @@ class ONOSCLI( OldCLI ): | ... | @@ -441,15 +530,37 @@ class ONOSCLI( OldCLI ): |
441 | "Run tail -f /tmp/onos1/log; press control-C to stop" | 530 | "Run tail -f /tmp/onos1/log; press control-C to stop" |
442 | self.default( self.onos1().name, 'tail -f /tmp/%s/log' % self.onos1() ) | 531 | self.default( self.onos1().name, 'tail -f /tmp/%s/log' % self.onos1() ) |
443 | 532 | ||
533 | + def do_status( self, line ): | ||
534 | + "Return status of ONOS cluster(s)" | ||
535 | + for c in self.mn.controllers: | ||
536 | + if isinstance( c, ONOSCluster ): | ||
537 | + for node in c.net.hosts: | ||
538 | + if isinstance( node, ONOSNode ): | ||
539 | + errors, warnings = node.checkLog() | ||
540 | + running = ( 'Running' if node.isRunning() | ||
541 | + else 'Exited' ) | ||
542 | + status = '' | ||
543 | + if errors: | ||
544 | + status += '%d ERRORS ' % len( errors ) | ||
545 | + if warnings: | ||
546 | + status += '%d warnings' % len( warnings ) | ||
547 | + status = status if status else 'OK' | ||
548 | + info( node, '\t', running, '\t', status, '\n' ) | ||
549 | + | ||
550 | + | ||
551 | +# For interactive use, exit on error | ||
552 | +exitOnError = dict( nodeOpts={ 'alertAction': 'exit' } ) | ||
553 | +ONOSClusterInteractive = specialClass( ONOSCluster, defaults=exitOnError ) | ||
554 | + | ||
444 | 555 | ||
445 | ### Exports for bin/mn | 556 | ### Exports for bin/mn |
446 | 557 | ||
447 | CLI = ONOSCLI | 558 | CLI = ONOSCLI |
448 | - | 559 | +controllers = { 'onos': ONOSClusterInteractive, |
449 | -controllers = { 'onos': ONOSCluster, 'default': ONOSCluster } | 560 | + 'default': ONOSClusterInteractive } |
450 | 561 | ||
451 | # XXX Hack to change default controller as above doesn't work | 562 | # XXX Hack to change default controller as above doesn't work |
452 | -findController = lambda: ONOSCluster | 563 | +findController = lambda: controllers[ 'default' ] |
453 | 564 | ||
454 | switches = { 'onos': ONOSOVSSwitch, | 565 | switches = { 'onos': ONOSOVSSwitch, |
455 | 'onosovs': ONOSOVSSwitch, | 566 | 'onosovs': ONOSOVSSwitch, | ... | ... |
-
Please register or login to post a comment