Yuta HIGUCHI

Attempt to fix Link vanishing after mastership change

- LLDPLinkProvider: handle DeviceUpdate event
- DeviceManager: publish Device events caused by Mastership change
- DeviceManager: Always try to markOffLine on deviceDisconnected
- GossipDeviceStore: Silently ignore failure to get Timestamp on port events

Change-Id: I51fbb3f1924007867512f20e62d6d53090c63640
...@@ -221,9 +221,15 @@ public class DeviceManager ...@@ -221,9 +221,15 @@ public class DeviceManager
221 log.info("Device {} connected", deviceId); 221 log.info("Device {} connected", deviceId);
222 // check my Role 222 // check my Role
223 MastershipRole role = mastershipService.requestRoleFor(deviceId); 223 MastershipRole role = mastershipService.requestRoleFor(deviceId);
224 + log.info("requestedRole, became {} for {}", role, deviceId);
224 if (role != MastershipRole.MASTER) { 225 if (role != MastershipRole.MASTER) {
225 // TODO: Do we need to explicitly tell the Provider that 226 // TODO: Do we need to explicitly tell the Provider that
226 // this instance is no longer the MASTER? probably not 227 // this instance is no longer the MASTER? probably not
228 +// Device device = getDevice(deviceId);
229 +// if (device != null) {
230 +// // FIXME roleChanged should take DeviceId instead of Device
231 +// provider().roleChanged(device, role);
232 +// }
227 return; 233 return;
228 } 234 }
229 MastershipTerm term = mastershipService.requestTermService() 235 MastershipTerm term = mastershipService.requestTermService()
...@@ -231,6 +237,7 @@ public class DeviceManager ...@@ -231,6 +237,7 @@ public class DeviceManager
231 237
232 if (!term.master().equals(clusterService.getLocalNode().id())) { 238 if (!term.master().equals(clusterService.getLocalNode().id())) {
233 // lost mastership after requestRole told this instance was MASTER. 239 // lost mastership after requestRole told this instance was MASTER.
240 + log.info("lost mastership before getting term info.");
234 return; 241 return;
235 } 242 }
236 243
...@@ -251,17 +258,13 @@ public class DeviceManager ...@@ -251,17 +258,13 @@ public class DeviceManager
251 // instance is the new Master, but 258 // instance is the new Master, but
252 // event returned from the store is null? 259 // event returned from the store is null?
253 260
254 - // TODO: Confirm: Mastership could be lost after requestRole
255 - // and createOrUpdateDevice call.
256 - // In that case STANDBY node can
257 - // claim itself to be master against the Device.
258 - // Will the Node, chosen by the MastershipService, retry
259 - // to get the MASTER role when that happen?
260 -
261 // FIXME: 1st argument should be deviceId, to allow setting 261 // FIXME: 1st argument should be deviceId, to allow setting
262 // certain roles even if the store returned null. 262 // certain roles even if the store returned null.
263 + log.info("event: {} {}", event.type(), event);
263 provider().roleChanged(event.subject(), role); 264 provider().roleChanged(event.subject(), role);
264 post(event); 265 post(event);
266 + } else {
267 + log.info("No event to publish");
265 } 268 }
266 } 269 }
267 270
...@@ -270,32 +273,34 @@ public class DeviceManager ...@@ -270,32 +273,34 @@ public class DeviceManager
270 checkNotNull(deviceId, DEVICE_ID_NULL); 273 checkNotNull(deviceId, DEVICE_ID_NULL);
271 checkValidity(); 274 checkValidity();
272 275
273 - // FIXME: only the MASTER should be marking off-line in normal cases,
274 - // but if I was the last STANDBY connection, etc. and no one else
275 - // was there to mark the device offline, this instance may need to
276 - // temporarily request for Master Role and mark offline.
277 - if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
278 - log.debug("Device {} disconnected, but I am not the master", deviceId);
279 - //let go of ability to be backup
280 - mastershipService.relinquishMastership(deviceId);
281 - return;
282 - }
283 276
284 DeviceEvent event = null; 277 DeviceEvent event = null;
285 try { 278 try {
286 event = store.markOffline(deviceId); 279 event = store.markOffline(deviceId);
287 } catch (IllegalStateException e) { 280 } catch (IllegalStateException e) {
281 + log.warn("Failed to mark {} offline", deviceId);
282 + // only the MASTER should be marking off-line in normal cases,
283 + // but if I was the last STANDBY connection, etc. and no one else
284 + // was there to mark the device offline, this instance may need to
285 + // temporarily request for Master Role and mark offline.
286 +
288 //there are times when this node will correctly have mastership, BUT 287 //there are times when this node will correctly have mastership, BUT
289 //that isn't reflected in the ClockManager before the device disconnects. 288 //that isn't reflected in the ClockManager before the device disconnects.
290 //we want to let go of the device anyways, so make sure this happens. 289 //we want to let go of the device anyways, so make sure this happens.
291 290
292 - // FIXME: Come up with workaround for above scenario. 291 + // FIXME: Store semantics leaking out as IllegalStateException.
292 + // Consider revising store API to handle this scenario.
293 +
294 + MastershipRole role = mastershipService.requestRoleFor(deviceId);
293 MastershipTerm term = termService.getMastershipTerm(deviceId); 295 MastershipTerm term = termService.getMastershipTerm(deviceId);
294 final NodeId myNodeId = clusterService.getLocalNode().id(); 296 final NodeId myNodeId = clusterService.getLocalNode().id();
295 // TODO: Move this type of check inside device clock manager, etc. 297 // TODO: Move this type of check inside device clock manager, etc.
296 if (myNodeId.equals(term.master())) { 298 if (myNodeId.equals(term.master())) {
299 + log.info("Marking {} offline", deviceId);
297 deviceClockProviderService.setMastershipTerm(deviceId, term); 300 deviceClockProviderService.setMastershipTerm(deviceId, term);
298 event = store.markOffline(deviceId); 301 event = store.markOffline(deviceId);
302 + } else {
303 + log.error("Failed again marking {} offline. {}", deviceId, role);
299 } 304 }
300 } finally { 305 } finally {
301 //relinquish master role and ability to be backup. 306 //relinquish master role and ability to be backup.
...@@ -315,14 +320,6 @@ public class DeviceManager ...@@ -315,14 +320,6 @@ public class DeviceManager
315 checkNotNull(portDescriptions, 320 checkNotNull(portDescriptions,
316 "Port descriptions list cannot be null"); 321 "Port descriptions list cannot be null");
317 checkValidity(); 322 checkValidity();
318 - //XXX what's this doing here?
319 - this.provider().id();
320 -
321 - if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
322 - // TODO If we become master, then we'll trigger something to update this
323 - // info to fix any inconsistencies that may result during the handoff.
324 - return;
325 - }
326 323
327 List<DeviceEvent> events = store.updatePorts(this.provider().id(), 324 List<DeviceEvent> events = store.updatePorts(this.provider().id(),
328 deviceId, portDescriptions); 325 deviceId, portDescriptions);
...@@ -338,13 +335,8 @@ public class DeviceManager ...@@ -338,13 +335,8 @@ public class DeviceManager
338 checkNotNull(portDescription, PORT_DESCRIPTION_NULL); 335 checkNotNull(portDescription, PORT_DESCRIPTION_NULL);
339 checkValidity(); 336 checkValidity();
340 337
341 - if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) { 338 + final DeviceEvent event = store.updatePortStatus(this.provider().id(),
342 - // TODO If we become master, then we'll trigger something to update this 339 + deviceId, portDescription);
343 - // info to fix any inconsistencies that may result during the handoff.
344 - return;
345 - }
346 - DeviceEvent event = store.updatePortStatus(this.provider().id(),
347 - deviceId, portDescription);
348 if (event != null) { 340 if (event != null) {
349 log.info("Device {} port {} status changed", deviceId, event 341 log.info("Device {} port {} status changed", deviceId, event
350 .port().number()); 342 .port().number());
...@@ -407,15 +399,16 @@ public class DeviceManager ...@@ -407,15 +399,16 @@ public class DeviceManager
407 Device device = getDevice(did); 399 Device device = getDevice(did);
408 if ((device != null) && !isAvailable(did)) { 400 if ((device != null) && !isAvailable(did)) {
409 //flag the device as online. Is there a better way to do this? 401 //flag the device as online. Is there a better way to do this?
410 - store.createOrUpdateDevice(device.providerId(), did, 402 + DeviceEvent devEvent = store.createOrUpdateDevice(device.providerId(), did,
411 new DefaultDeviceDescription( 403 new DefaultDeviceDescription(
412 did.uri(), device.type(), device.manufacturer(), 404 did.uri(), device.type(), device.manufacturer(),
413 device.hwVersion(), device.swVersion(), 405 device.hwVersion(), device.swVersion(),
414 device.serialNumber(), device.chassisId())); 406 device.serialNumber(), device.chassisId()));
407 + post(devEvent);
415 } 408 }
416 - //TODO re-collect device information to fix potential staleness
417 - queryPortInfo(did);
418 applyRole(did, MastershipRole.MASTER); 409 applyRole(did, MastershipRole.MASTER);
410 + // re-collect device information to fix potential staleness
411 + queryPortInfo(did);
419 } else if (event.roleInfo().backups().contains(myNodeId)) { 412 } else if (event.roleInfo().backups().contains(myNodeId)) {
420 applyRole(did, MastershipRole.STANDBY); 413 applyRole(did, MastershipRole.STANDBY);
421 } 414 }
......
...@@ -328,8 +328,8 @@ public class GossipDeviceStore ...@@ -328,8 +328,8 @@ public class GossipDeviceStore
328 final Timestamp timestamp = deviceClockService.getTimestamp(deviceId); 328 final Timestamp timestamp = deviceClockService.getTimestamp(deviceId);
329 final DeviceEvent event = markOfflineInternal(deviceId, timestamp); 329 final DeviceEvent event = markOfflineInternal(deviceId, timestamp);
330 if (event != null) { 330 if (event != null) {
331 - log.info("Notifying peers of a device offline topology event for deviceId: {}", 331 + log.info("Notifying peers of a device offline topology event for deviceId: {} {}",
332 - deviceId); 332 + deviceId, timestamp);
333 try { 333 try {
334 notifyPeers(new InternalDeviceOfflineEvent(deviceId, timestamp)); 334 notifyPeers(new InternalDeviceOfflineEvent(deviceId, timestamp));
335 } catch (IOException e) { 335 } catch (IOException e) {
...@@ -399,7 +399,24 @@ public class GossipDeviceStore ...@@ -399,7 +399,24 @@ public class GossipDeviceStore
399 DeviceId deviceId, 399 DeviceId deviceId,
400 List<PortDescription> portDescriptions) { 400 List<PortDescription> portDescriptions) {
401 401
402 - final Timestamp newTimestamp = deviceClockService.getTimestamp(deviceId); 402 + final Timestamp newTimestamp;
403 + try {
404 + newTimestamp = deviceClockService.getTimestamp(deviceId);
405 + } catch (IllegalStateException e) {
406 + log.info("Timestamp was not available for device {}", deviceId);
407 + log.debug(" discarding {}", portDescriptions);
408 + // Failed to generate timestamp.
409 +
410 + // Possible situation:
411 + // Device connected and became master for short period of time,
412 + // but lost mastership before this instance had the chance to
413 + // retrieve term information.
414 +
415 + // Information dropped here is expected to be recoverable by
416 + // device probing after mastership change
417 +
418 + return Collections.emptyList();
419 + }
403 log.info("timestamp for {} {}", deviceId, newTimestamp); 420 log.info("timestamp for {} {}", deviceId, newTimestamp);
404 421
405 final Timestamped<List<PortDescription>> timestampedInput 422 final Timestamped<List<PortDescription>> timestampedInput
...@@ -580,7 +597,16 @@ public class GossipDeviceStore ...@@ -580,7 +597,16 @@ public class GossipDeviceStore
580 DeviceId deviceId, 597 DeviceId deviceId,
581 PortDescription portDescription) { 598 PortDescription portDescription) {
582 599
583 - final Timestamp newTimestamp = deviceClockService.getTimestamp(deviceId); 600 + final Timestamp newTimestamp;
601 + try {
602 + newTimestamp = deviceClockService.getTimestamp(deviceId);
603 + } catch (IllegalStateException e) {
604 + log.info("Timestamp was not available for device {}", deviceId);
605 + log.debug(" discarding {}", portDescription);
606 + // Failed to generate timestamp. Ignoring.
607 + // See updatePorts comment
608 + return null;
609 + }
584 final Timestamped<PortDescription> deltaDesc 610 final Timestamped<PortDescription> deltaDesc
585 = new Timestamped<>(portDescription, newTimestamp); 611 = new Timestamped<>(portDescription, newTimestamp);
586 final DeviceEvent event; 612 final DeviceEvent event;
......
...@@ -111,11 +111,22 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider { ...@@ -111,11 +111,22 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider {
111 log.error("Device is null."); 111 log.error("Device is null.");
112 return; 112 return;
113 } 113 }
114 + log.trace("{} {} {}", event.type(), event.subject(), event);
114 switch (event.type()) { 115 switch (event.type()) {
115 case DEVICE_ADDED: 116 case DEVICE_ADDED:
116 - discoverers.put(device.id(), 117 + case DEVICE_UPDATED:
117 - new LinkDiscovery(device, packetSevice, masterService, 118 + ld = discoverers.get(device.id());
118 - providerService, useBDDP)); 119 + if (ld == null) {
120 + log.debug("Device added ({}) {}", event.type(), device.id());
121 + discoverers.put(device.id(),
122 + new LinkDiscovery(device, packetSevice, masterService,
123 + providerService, useBDDP));
124 + } else {
125 + if (ld.isStopped()) {
126 + log.debug("Device restarted ({}) {}", event.type(), device.id());
127 + ld.start();
128 + }
129 + }
119 break; 130 break;
120 case PORT_ADDED: 131 case PORT_ADDED:
121 case PORT_UPDATED: 132 case PORT_UPDATED:
...@@ -125,6 +136,7 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider { ...@@ -125,6 +136,7 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider {
125 return; 136 return;
126 } 137 }
127 if (!port.number().isLogical()) { 138 if (!port.number().isLogical()) {
139 + log.debug("Port added {}", port);
128 ld.addPort(port); 140 ld.addPort(port);
129 } 141 }
130 } else { 142 } else {
...@@ -134,12 +146,15 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider { ...@@ -134,12 +146,15 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider {
134 } 146 }
135 break; 147 break;
136 case PORT_REMOVED: 148 case PORT_REMOVED:
149 + log.debug("Port removed {}", port);
137 ConnectPoint point = new ConnectPoint(device.id(), 150 ConnectPoint point = new ConnectPoint(device.id(),
138 port.number()); 151 port.number());
139 providerService.linksVanished(point); 152 providerService.linksVanished(point);
153 + // TODO: Don't we need to removePort from ld?
140 break; 154 break;
141 case DEVICE_REMOVED: 155 case DEVICE_REMOVED:
142 case DEVICE_SUSPENDED: 156 case DEVICE_SUSPENDED:
157 + log.debug("Device removed {}", device.id());
143 ld = discoverers.get(device.id()); 158 ld = discoverers.get(device.id());
144 if (ld == null) { 159 if (ld == null) {
145 return; 160 return;
...@@ -153,15 +168,18 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider { ...@@ -153,15 +168,18 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider {
153 return; 168 return;
154 } 169 }
155 if (deviceService.isAvailable(device.id())) { 170 if (deviceService.isAvailable(device.id())) {
171 + log.debug("Device up {}", device.id());
156 ld.start(); 172 ld.start();
157 } else { 173 } else {
158 providerService.linksVanished(device.id()); 174 providerService.linksVanished(device.id());
175 + log.debug("Device down {}", device.id());
159 ld.stop(); 176 ld.stop();
160 } 177 }
161 break; 178 break;
162 - case DEVICE_UPDATED:
163 case DEVICE_MASTERSHIP_CHANGED: 179 case DEVICE_MASTERSHIP_CHANGED:
164 if (!discoverers.containsKey(device.id())) { 180 if (!discoverers.containsKey(device.id())) {
181 + // TODO: ideally, should never reach here
182 + log.debug("Device mastership changed ({}) {}", event.type(), device.id());
165 discoverers.put(device.id(), 183 discoverers.put(device.id(),
166 new LinkDiscovery(device, packetSevice, masterService, 184 new LinkDiscovery(device, packetSevice, masterService,
167 providerService, useBDDP)); 185 providerService, useBDDP));
......
...@@ -139,8 +139,8 @@ public class LinkDiscovery implements TimerTask { ...@@ -139,8 +139,8 @@ public class LinkDiscovery implements TimerTask {
139 * @param port the port 139 * @param port the port
140 */ 140 */
141 public void addPort(final Port port) { 141 public void addPort(final Port port) {
142 - this.log.debug("sending init probe to port {}", 142 + this.log.debug("sending init probe to port {}@{}",
143 - port.number().toLong()); 143 + port.number().toLong(), device.id());
144 144
145 sendProbes(port.number().toLong()); 145 sendProbes(port.number().toLong());
146 146
...@@ -245,7 +245,7 @@ public class LinkDiscovery implements TimerTask { ...@@ -245,7 +245,7 @@ public class LinkDiscovery implements TimerTask {
245 */ 245 */
246 @Override 246 @Override
247 public void run(final Timeout t) { 247 public void run(final Timeout t) {
248 - this.log.trace("sending probes"); 248 + this.log.trace("sending probes from {}", device.id());
249 synchronized (this) { 249 synchronized (this) {
250 final Iterator<Long> fastIterator = this.fastPorts.iterator(); 250 final Iterator<Long> fastIterator = this.fastPorts.iterator();
251 Long portNumber; 251 Long portNumber;
...@@ -256,7 +256,7 @@ public class LinkDiscovery implements TimerTask { ...@@ -256,7 +256,7 @@ public class LinkDiscovery implements TimerTask {
256 .getAndIncrement(); 256 .getAndIncrement();
257 257
258 if (probeCount < LinkDiscovery.MAX_PROBE_COUNT) { 258 if (probeCount < LinkDiscovery.MAX_PROBE_COUNT) {
259 - this.log.trace("sending fast probe to port"); 259 + this.log.trace("sending fast probe to port {}", portNumber);
260 sendProbes(portNumber); 260 sendProbes(portNumber);
261 } else { 261 } else {
262 // Update fast and slow ports 262 // Update fast and slow ports
...@@ -356,6 +356,7 @@ public class LinkDiscovery implements TimerTask { ...@@ -356,6 +356,7 @@ public class LinkDiscovery implements TimerTask {
356 if (device.type() != Device.Type.ROADM && 356 if (device.type() != Device.Type.ROADM &&
357 mastershipService.getLocalRole(this.device.id()) == 357 mastershipService.getLocalRole(this.device.id()) ==
358 MastershipRole.MASTER) { 358 MastershipRole.MASTER) {
359 + log.debug("sending probes out to {}@{}", portNumber, device.id());
359 OutboundPacket pkt = this.createOutBoundLLDP(portNumber); 360 OutboundPacket pkt = this.createOutBoundLLDP(portNumber);
360 pktService.emit(pkt); 361 pktService.emit(pkt);
361 if (useBDDP) { 362 if (useBDDP) {
......
...@@ -96,7 +96,9 @@ public class OpenFlowDeviceProvider extends AbstractProvider implements DevicePr ...@@ -96,7 +96,9 @@ public class OpenFlowDeviceProvider extends AbstractProvider implements DevicePr
96 // FIXME if possible, we might want this to be part of 96 // FIXME if possible, we might want this to be part of
97 // OpenFlowSwitch interface so the driver interface isn't misused. 97 // OpenFlowSwitch interface so the driver interface isn't misused.
98 OpenFlowSwitch sw = controller.getSwitch(dpid(device.id().uri())); 98 OpenFlowSwitch sw = controller.getSwitch(dpid(device.id().uri()));
99 - if (!((OpenFlowSwitchDriver) sw).isConnected()) { 99 + if (sw == null ||
100 + !((OpenFlowSwitchDriver) sw).isConnected()) {
101 + LOG.error("Failed to probe device {} on sw={}", device, sw);
100 providerService.deviceDisconnected(device.id()); 102 providerService.deviceDisconnected(device.id());
101 return; 103 return;
102 } 104 }
......