Jon Hall
Committed by Gerrit Code Review

Add HA.py

A set of Mininet-based HA tests built on top of onos.py.
Currently includes the following tests:
    - A control network partitioning test
    - A dynamic cluster scaling test

Change-Id: I9a8e1019dcc51666fee1d933afd66ff390592525
1 +#!/usr/bin/env python
2 +
3 +"""
4 +NOTES
5 +
6 +To change onos log levels before start you can add something similar to
7 +onos.py's ONOSNode.start method before starting onos service:
8 + # Change log levels
9 + self.ucmd( 'echo "log4j.logger.io.atomix= DEBUG" >> $ONOS_HOME/apache-karaf-*/etc/org.ops4j.pax.logging.cfg' )
10 +
11 +"""
12 +
13 +import argparse
14 +from mininet.log import output, info, warn, error, debug, setLogLevel
15 +from mininet.cli import CLI as origCLI
16 +from mininet.net import Mininet
17 +from mininet.topo import SingleSwitchTopo, Topo
18 +from mininet.node import Host
19 +from os.path import join
20 +from glob import glob
21 +import re
22 +import json
23 +from collections import deque
24 +import hashlib
25 +import onos # onos.py
26 +
27 +# Utility functions
28 +
def pause( net, msg, hint=False):
    """Log msg, then hand control to the ONOS CLI until the user exits it.

    net: network object passed to the CLI
    msg: text logged (through info) before the CLI is entered
    hint: when true, a short usage cheat sheet is logged as well
    """
    info( msg )

    if hint:
        hintLines = (
            "Currently in the root Mininet net namespace...\n",
            "To access control net functions use:\n",
            "\tpx cnet=net.controllers[0].net\n",
            "\tpy cnet.METHOD\n",
            "To send commands to each onos node, use: onos_all CMD\n",
            "\nBy default, ONOS nodes are running on the 192.168.123.X network\n",
        )
        info( "%s\n" % ''.join( hintLines ) )

    # NOTE: If we use onos.py as a custom file and as an imported module,
    #       we get two different sets of ONOS* classes. They don't play
    #       together and things don't work properly. Specifically the
    #       isinstance calls fail. This is due to import vs. exec calls.
    #       That is why the CLI class is taken from the imported module.
    onos.ONOSCLI( net )
48 +
def cprint( msg, color="default"):
    """Print msg to stdout, optionally wrapped in an ANSI colour code.

    msg: string to print
    color: case-insensitive colour name; one of cyan, purple, blue, green,
           yellow or red. Any other value (including the default) prints
           msg unchanged, without emitting any escape sequence.
    """
    reset = '\033[0m'
    colors = { 'cyan': '\033[96m', 'purple': '\033[95m',
               'blue': '\033[94m', 'green': '\033[92m',
               'yellow': '\033[93m', 'red': '\033[91m' }
    prefix = colors.get( color.lower() )
    if prefix is None:
        # No colour applied, so there is nothing to reset
        text = msg
    else:
        text = prefix + msg + reset
    # Named 'text' so the module-level mininet.log.output is not shadowed
    print( text )
58 +
def getNode( net, nodeId=0 ):
    """Helper function: return the ONOS node at index nodeId of the first
    controller's node list (the first node by default)."""
    cluster = net.controllers[ 0 ]
    candidates = cluster.nodes()
    return candidates[ nodeId ]
62 +
def onos_cli( net, line, nodeId=0 ):
    """Send a command to the ONOS CLI of one node and log the reply.

    net: network whose first controller is expected to be an ONOSCluster;
         for any other controller type nothing is done
    line: command text; a leading ':' gets the 'onos' prefix put back
    nodeId: index of the target node, as understood by getNode
    """
    c0 = net.controllers[ 0 ]
    # FIXME add this back after import onos.py works
    if not isinstance( c0, onos.ONOSCluster ):
        return
    # cmdLoop strips off command name 'onos'
    if line.startswith( ':' ):
        line = 'onos' + line
    node = getNode( net, nodeId )
    if line:
        # Quote the command so it reaches the client as one argument
        line = '"%s"' % line
    cmd = 'client -h %s %s' % ( node.IP(), line )
    # 'reply' rather than 'output': keeps mininet.log.output unshadowed
    reply = node.cmd( cmd )
    info( line )
    # Remove verbose spam from output: log only what follows the marker.
    # Raw string so that \{ and \} are regex escapes, not (invalid)
    # string escapes.
    m = re.search( r"unverified \{\} key: \{\}", reply )
    info( reply[ m.end(): ] if m else reply )
84 +
def onos_all( net, line ):
    """Run line on the ONOS CLI of every node, each preceded by a red
    banner that names the node."""
    banner = "*" * 53
    onosNodes = []
    for cluster in net.controllers:
        onosNodes.extend( cluster.nodes() )
    for index, onosNode in enumerate( onosNodes ):
        cprint( banner, "red" )
        cprint( "onos%s: %s" % ( str( index + 1 ), repr( onosNode ) ),
                "red" )
        cprint( banner, "red" )
        onos_cli( net, line, index )
93 +
94 +# FIXME This needs a better name
def do_onos_all( self, line ):
    "CLI command 'onos_all': run line on every ONOS node of self.mn"
    network = self.mn
    onos_all( network, line )

# Add custom cli commands
# NOTE: This is so we can keep ONOSCLI and also add commands to it!
setattr( origCLI, 'do_onos_all', do_onos_all )
101 +
102 +# Test cases
103 +
def Partition( net ):
    """Control network partition test.

    Takes the link between the first two control-network switches down,
    then brings it back up, dumping cluster state on every ONOS node
    before and after each step. Reads the module-level args.interactive
    to decide whether to pause in the CLI between steps.
    """
    clusterQuery = "nodes;partitions;partitions -c"
    resume = "Dropping into cli... Exit cli to continue test\n"

    def checkpoint( banner, prompt ):
        "Dump cluster state, log banner and pause when interactive"
        onos_all( net, clusterQuery )
        info( banner )
        if args.interactive:
            pause( net, prompt )

    # Controller net instance
    cnet = net.controllers[0].net

    info( "ONOS control network partition test\n")
    net.pingAll()
    if args.interactive:
        pause( net, "~~~ Dropping into cli... Exit cli to continue test\n", True )

    checkpoint( "~~~ Right before the partitioned\n", resume )

    # configLinkStatus wants node names, not node objects
    first = cnet.switches[0].name
    second = cnet.switches[1].name

    # PARTITION sub-clusters
    cnet.configLinkStatus( first, second, "down" )
    checkpoint( "~~~ Right after cluster is partitioned. Next step is to heal the partition\n",
                resume )

    # Heal the partition
    cnet.configLinkStatus( first, second, "up" )
    checkpoint( "~~~ Right after the partition is healed \n",
                "Test is finished! Exit cli to exit test.\n" )
135 +
def Scaling( net ):
    """Dynamic cluster scaling test.

    Starts the first ONOS node, then starts the remaining unstarted
    DynamicONOSNodes two at a time, and finally shrinks the active set
    back to one node by regenerating the cluster metadata file. Cluster
    state is dumped on every node after each step. Reads the module-level
    args.interactive to decide whether to pause in the CLI between steps.
    """
    clusterQuery = "nodes;partitions;partitions -c"

    def startNodes( net, nodes ):
        "start multiple ONOS nodes"
        owner = net.controllers[0]
        owner.activeNodes.extend( nodes )
        owner.activeNodes = sorted( set( owner.activeNodes ) )
        for member in nodes:
            member.shouldStart = True
            member.start( owner.env, owner.activeNodes )
        for member in nodes:
            member.waitStarted()

    def report():
        "Dump cluster state and pause when interactive"
        onos_all( net, clusterQuery )
        if args.interactive:
            pause( net, "Dropping into cli... Exit cli to continue test\n" )

    def nextBatch():
        "Return up to two dynamic nodes that have not been started yet"
        waiting = []
        for c in net.controllers:
            for n in c.net.hosts:
                if isinstance( n, DynamicONOSNode) and not n.started:
                    waiting.append( n )
        return waiting[ :2 ]

    # control net objects
    cluster = net.controllers[0]
    cnet = cluster.net
    cs1 = cnet.switches[0]  # not used below; lookup kept from the original

    info( "ONOS dynamic clustering scaling test\n")
    # Start the first node
    cluster.activeNodes.append( cnet.hosts[0] )
    cluster.activeNodes = sorted( set( cluster.activeNodes ) )
    startNodes( net, cluster.activeNodes )
    report()

    # Scale up by two
    batch = nextBatch()
    while batch:
        startNodes( net, batch )
        report()
        batch = nextBatch()

    # Scale down: drop one node per step until a single node is left
    for _ in range( len( cluster.activeNodes ) - 1 ):
        leaving = cluster.activeNodes.pop()
        leaving.genPartitions( cluster.activeNodes, leaving.metadata )
        report()
    if args.interactive:
        pause( net, "Test is finished! Exit cli to exit test.\n" )
183 +
184 +
185 +
186 +# Mininet object subclasses
187 +
class HTTP( Host ):
    """Host that serves its own scratch directory, /tmp/<name>, over HTTP.

    The directory is wiped and recreated when the host is constructed and
    becomes the working directory of the host's shell.
    """

    def __init__( self, *args, **kwargs ):
        super( HTTP, self).__init__( *args, **kwargs )
        self.dir = '/tmp/%s' % self.name
        # Start from an empty directory and make it the shell's cwd
        self.cmd( 'rm -rf', self.dir )
        self.cmd( 'mkdir', self.dir )
        self.cmd( 'cd', self.dir )

    def start( self ):
        "Launch the web server in the background, logging to web.log"
        output( "(starting HTTP Server)" )
        # start python web server as a bg process
        self.cmd( 'python -m SimpleHTTPServer &> web.log &' )

    def stop( self ):
        "Foreground the web server job and interrupt it"
        # XXX is this ever called?
        # print() with one argument behaves the same on Python 2 and 3
        print( "Stopping HTTP Server..." )
        print( self.cmd( 'fg' ) )
        print( self.cmd( '\x03' ) )  # ctrl-c
206 +
207 +
class DynamicONOSNode( onos.ONOSNode ):
    """ONOS node for dynamic clustering tests.

    The node only starts once shouldStart has been set, starts at most
    once, and (re)writes the cluster metadata file before launching ONOS.

    Attributes set here:
      shouldStart - start() is a no-op until this is true
      started     - set once start() has launched ONOS
      metadata    - path of the cluster metadata file to generate
      remote      - optional dict with 'ip', 'port' and 'filename' keys;
                    when set by the caller, ONOS is pointed at that URL
                    for its cluster metadata
    """

    def __init__( self, *args, **kwargs ):
        # Assigned before the base constructor runs, as in the original
        self.shouldStart = False
        self.started = False
        self.metadata = '/tmp/cluster.json'
        super( DynamicONOSNode, self ).__init__( *args, **kwargs )
        # XXX HACK, need to get this passed in correctly
        self.alertAction = 'warn'

    def start( self, env, nodes=() ):
        """Start ONOS on this node if it is due to start and not running.

        env: environment mapping; copied, never modified in place
        nodes: nodes to list in the generated cluster metadata file
        """
        if not self.shouldStart or self.started:
            return
        ##### Modified from base class
        env = dict( env )
        env.update( ONOS_HOME=self.ONOS_HOME )
        # 'remote' is only assigned by callers that want it, so do not
        # assume the attribute exists
        remoteCfg = getattr( self, 'remote', None )
        if remoteCfg:
            # Point onos to remote cluster metadata file
            ip = remoteCfg.get( 'ip', '127.0.0.1' )
            port = remoteCfg.get( 'port', '8000' )
            filename = remoteCfg.get( 'filename', 'cluster.json' )
            url = 'http://%s:%s/%s' % ( ip, port, filename )
            uri = '-Donos.cluster.metadata.uri=%s' % url
            prev = env.get( 'JAVA_OPTS' )
            # JVM options are whitespace separated; joining with ':'
            # would glue the new option onto the previous one
            jarg = ' '.join( [ prev, uri ] ) if prev else uri
            env.update( JAVA_OPTS=jarg )
        self.updateEnv( env )
        karafbin = glob( '%s/apache*/bin' % self.ONOS_HOME )[ 0 ]
        onosbin = join( self.ONOS_ROOT, 'tools/test/bin' )
        self.cmd( 'export PATH=%s:%s:$PATH' % ( onosbin, karafbin ) )
        self.cmd( 'cd', self.ONOS_HOME )
        self.ucmd( 'mkdir -p config ' )
        self.genPartitions( nodes, self.metadata )
        info( '(starting %s)' % self )
        service = join( self.ONOS_HOME, 'bin/onos-service' )
        # Run the service in the background and record its pid
        self.ucmd( service, 'server 1>../onos.log 2>../onos.log'
                   ' & echo $! > onos.pid; ln -s `pwd`/onos.pid ..' )
        self.onosPid = int( self.cmd( 'cat onos.pid' ).strip() )
        self.warningCount = 0
        ####
        self.started = True

    def sanityCheck( self, lowMem=100000 ):
        "Run the base class sanity check, but only once the node has started"
        if self.started:
            super( DynamicONOSNode, self ).sanityCheck( lowMem )

    def waitStarted( self ):
        "Wait for ONOS to come up, but only if this node has been started"
        if self.started:
            super( DynamicONOSNode, self ).waitStarted()

    def genPartitions( self, nodes, location='/tmp/cluster.json' ):
        """
        Generate a cluster metadata file for dynamic clustering.
        Note: name should be the same in different versions of the file as
        well as the number of partitions.

        nodes: nodes to list; only their IP() is used
        location: path the JSON document is written to
        """
        def genParts( members, k, parts=3 ):
            "Build 'parts' partitions of up to k members, rotating by one"
            ring = deque( members )
            perms = []
            for partId in range( 1, parts + 1 ):
                perms.append( { 'id': partId,
                                'members': list( ring )[ :k ] } )
                ring.rotate( -1 )
            return perms

        def nodeEntry( ip ):
            "Metadata entry for one node; the IP doubles as the node id"
            return { 'id': ip, 'ip': ip, 'port': port }

        print( "Generating %s with %s" % ( location, str( nodes ) ) )
        port = 9876
        ips = [ node.IP() for node in nodes ]
        # Bytes literal: hash constructors reject text strings on
        # Python 3; the digest is the same as before on Python 2
        m = hashlib.sha256( b"Mininet based ONOS test" )
        name = int( m.hexdigest()[ :8 ], 16 )
        partitions = genParts( ips, 3 )
        data = {
            'name': name,
            'nodes': [ nodeEntry( ip ) for ip in ips ],
            'partitions': partitions
        }
        # 'document' rather than 'output': keeps mininet.log.output
        # unshadowed
        document = json.dumps( data, indent=4 )
        with open( location, 'w' ) as f:
            f.write( document )
        cprint( document, "yellow" )
297 +
298 +
class DynamicONOSCluster( onos.ONOSCluster ):
    """ONOS cluster whose node list is the mutable activeNodes list, so
    that nodes can join and leave the running set while a test runs."""

    def __init__( self, *args, **kwargs ):
        # nodes() must already work while the base constructor runs
        self.activeNodes = []
        # TODO: can we get super to use super's nodes()?
        super( DynamicONOSCluster, self ).__init__( *args, **kwargs )
        # Temporarily expose every ONOS host so the node IPs get recorded
        # in the environment, then go back to an empty active set
        self.activeNodes = list( filter( onos.isONOSNode, self.net.hosts ) )
        onos.updateNodeIPs( self.env, self.nodes() )
        self.activeNodes = []

    def start( self ):
        "Start up ONOS control network"
        info( '*** ONOS_APPS = %s\n' % onos.ONOS_APPS )
        self.net.start()
        for host in self.net.hosts:
            if onos.isONOSNode( host ):
                host.start( self.env, self.nodes() )
                continue
            try:
                host.start()
            except AttributeError:
                # NAT doesn't have start?
                pass
        info( '\n' )
        self.configPortForwarding( ports=self.forward, action='A' )
        self.waitStarted()

    def nodes( self ):
        "Return list of ONOS nodes that should be running"
        return self.activeNodes
329 +
class HATopo( Topo ):
    """Control network topology: one switch per region of ONOS nodes,
    with the region switches linked to each other."""

    def build( self, partitions=None, serverCount=1, dynamic=False, **kwargs ):
        """
        partitions = a list of strings specifying the assignment of onos
                     nodes to regions. ['1', '2,3'] designates two regions,
                     with ONOS 1 in the first and ONOS 2 and 3 in the second.
        serverCount = If partitions is not given, then the number of ONOS
                      nodes to create
        dynamic = A boolean indicating dynamic ONOS clustering
        """
        # None instead of a shared mutable [] default; both are falsy, so
        # callers see no difference
        self.switchNum = 1
        cls = DynamicONOSNode if dynamic else onos.ONOSNode
        if partitions:
            prev = None
            for partition in partitions:
                # Create a region of ONOS nodes connected to a switch
                # FIXME Check for nodes that are not assigned to a partition?
                cur = self.addRegion( partition, cls )

                # Connect switch to previous switch
                if prev:
                    self.addLink( prev, cur )
                prev = cur
        else:
            partition = ','.join( str( x ) for x in range( 1, serverCount + 1 ) )
            cs1 = self.addRegion( partition, cls )
            if dynamic:
                # TODO Pass these in
                scale = 2
                # Second region holds the nodes that are added by scaling
                new = ','.join( str( x + 1 )
                                for x in range( serverCount, serverCount + scale ) )
                cs2 = self.addRegion( new, cls )
                self.addLink( cs1, cs2 )
                # Web server host, linked to every region switch
                server = self.addHost( "server", cls=HTTP )
                for switch in self.switches():
                    self.addLink( server, switch )

    def addRegion( self, partition, cls=onos.ONOSNode ):
        """Add switch cs<N> plus one host 'onos<id>' per comma separated id
        in partition, each linked to that switch; return the switch."""
        switch = self.addSwitch( 'cs%s' % self.switchNum )
        self.switchNum += 1
        for n in partition.split( ',' ):
            node = self.addHost( "onos" + str( n ), cls=cls )
            self.addLink( switch, node )
        return switch
376 +
377 +
CLI = onos.ONOSCLI

# The main runner
def runTest( args ):
    """Build the network for the selected test, run it and tear it down.

    args: parsed command line; reads args.test, args.nodes,
          args.interactive and, for the partition test, args.partition

    An unknown args.test prints a message and returns without creating a
    network. After the test a CLI is always opened; the network is
    stopped once that CLI exits, and also if the test raises.
    """
    if args.test == "partition":
        test = Partition
        # NOTE args.nodes is ignored for this test; the partition
        #      assignment decides how many nodes exist.
        topo = HATopo( partitions=args.partition )
        # FIXME Configurable dataplane topology
        net = Mininet( topo=SingleSwitchTopo( 3 ),
                       controller=[ onos.ONOSCluster( 'c0', topo=topo, alertAction='warn' ) ],
                       switch=onos.ONOSOVSSwitch )
    elif args.test == "scaling":
        test = Scaling
        serverCount = args.nodes
        topo = HATopo( serverCount=serverCount, dynamic=True )
        net = Mininet( topo=SingleSwitchTopo( 3 ),
                       controller=[ DynamicONOSCluster( 'c0', topo=topo, alertAction='warn' ) ],
                       switch=onos.ONOSOVSSwitch )
        cluster = net.controllers[0]
        cnet = cluster.net
        server = cnet.get( 'server' )
        remote = { 'ip': server.IP(),
                   'port': '8000',
                   'filename':'cluster.json' }
        # Every dynamic node writes its metadata into the web server's
        # directory and reads it back from the server's URL
        for node in cnet.hosts:
            if isinstance( node, DynamicONOSNode ):
                node.metadata = '%s/cluster.json' % server.dir
                node.remote = remote
        # Only the first serverCount nodes start with the network
        cluster.activeNodes = [ cnet.get( "onos%s" % ( i + 1 ) )
                                for i in range( serverCount ) ]
        for node in cluster.activeNodes:
            node.shouldStart = True
    else:
        print( "Incorrect test" )
        return
    net.start()
    try:
        if args.interactive:
            CLI( net )
        test( net )
        CLI( net )
    finally:
        # Tear the network down even when the test or the CLI raises
        net.stop()
422 +
423 +
if __name__ == '__main__':
    setLogLevel( 'info' )

    # Base parser: options shared by every test
    parser = argparse.ArgumentParser(
        description=( 'Mininet based HA tests for ONOS. '
                      'For more detailed help on a test include the test option' ) )
    parser.add_argument( '-n', '--nodes', metavar="NODES", type=int,
                         default=1,
                         help="Number of nodes in the ONOS cluster" )
    parser.add_argument( '-i', '--interactive', default=False,
                         action="store_true",
                         help="Pause the test in between steps" )
    # One sub-command per test; the chosen name is stored in args.test
    test_parsers = parser.add_subparsers( title="Tests",
                                          help="Types of HA tests",
                                          dest="test" )

    # Partition test parser
    partition_help = ( 'Network partition test. Each set of ONOS nodes is connected '
                       'to their own switch in the control network. Partitions are '
                       'introduced by removing links between control network switches.' )
    partition_parser = test_parsers.add_parser( "partition",
                                                description=partition_help )
    partition_parser.add_argument(
        '-p', '--partition', metavar='Partition', required=True,
        type=str, nargs=2,
        help=( 'Specify the membership for two partitions by node id. '
               'Nodes are comma separated and node count begins at 1. '
               'E.g. "1,3 2" will create a network with 3 ONOS nodes and two connected switches. '
               'Switch 1 will be connected to ONOS1 and ONOS3 while switch 2 will be connected to ONOS2. '
               'A partition will be created by disconnecting the two switches. '
               'All ONOS nodes will still be connected to the dataplane.' ) )

    # Dynamic scaling test parser
    # FIXME Replace with real values
    scaling_parser = test_parsers.add_parser( "scaling" )

    # 'args' stays a module-level name: the test functions read it
    args = parser.parse_args()
    runTest( args )
...@@ -117,7 +117,8 @@ def updateNodeIPs( env, nodes ): ...@@ -117,7 +117,8 @@ def updateNodeIPs( env, nodes ):
117 for index, node in enumerate( nodes, 1 ): 117 for index, node in enumerate( nodes, 1 ):
118 var = 'OC%d' % index 118 var = 'OC%d' % index
119 env[ var ] = node.IP() 119 env[ var ] = node.IP()
120 - env[ 'OCI' ] = env[ 'OCN' ] = env[ 'OC1' ] 120 + if nodes:
121 + env[ 'OCI' ] = env[ 'OCN' ] = env[ 'OC1' ]
121 env[ 'ONOS_INSTANCES' ] = '\n'.join( 122 env[ 'ONOS_INSTANCES' ] = '\n'.join(
122 node.IP() for node in nodes ) 123 node.IP() for node in nodes )
123 environ.update( env ) 124 environ.update( env )
...@@ -242,6 +243,7 @@ class ONOSNode( Controller ): ...@@ -242,6 +243,7 @@ class ONOSNode( Controller ):
242 self.ONOS_HOME = '/tmp' 243 self.ONOS_HOME = '/tmp'
243 self.cmd( 'rm -rf', self.dir ) 244 self.cmd( 'rm -rf', self.dir )
244 self.ONOS_HOME = unpackONOS( self.dir, run=self.ucmd ) 245 self.ONOS_HOME = unpackONOS( self.dir, run=self.ucmd )
246 + self.ONOS_ROOT = ONOS_ROOT
245 247
246 # pylint: disable=arguments-differ 248 # pylint: disable=arguments-differ
247 249
......