Attempt to fix Link vanishing after mastership change
- LLDPLinkProvider: handle DeviceUpdate event - DeviceManager: publish Device events caused by Mastership change - DeviceManager: Always try to markOffLine on deviceDisconnected - GossipDeviceStore: Silently ignore failure to get Timestamp on port events Change-Id: I51fbb3f1924007867512f20e62d6d53090c63640
Showing
5 changed files
with
89 additions
and
49 deletions
... | @@ -221,9 +221,15 @@ public class DeviceManager | ... | @@ -221,9 +221,15 @@ public class DeviceManager |
221 | log.info("Device {} connected", deviceId); | 221 | log.info("Device {} connected", deviceId); |
222 | // check my Role | 222 | // check my Role |
223 | MastershipRole role = mastershipService.requestRoleFor(deviceId); | 223 | MastershipRole role = mastershipService.requestRoleFor(deviceId); |
224 | + log.info("requestedRole, became {} for {}", role, deviceId); | ||
224 | if (role != MastershipRole.MASTER) { | 225 | if (role != MastershipRole.MASTER) { |
225 | // TODO: Do we need to explicitly tell the Provider that | 226 | // TODO: Do we need to explicitly tell the Provider that |
226 | // this instance is no longer the MASTER? probably not | 227 | // this instance is no longer the MASTER? probably not |
228 | +// Device device = getDevice(deviceId); | ||
229 | +// if (device != null) { | ||
230 | +// // FIXME roleChanged should take DeviceId instead of Device | ||
231 | +// provider().roleChanged(device, role); | ||
232 | +// } | ||
227 | return; | 233 | return; |
228 | } | 234 | } |
229 | MastershipTerm term = mastershipService.requestTermService() | 235 | MastershipTerm term = mastershipService.requestTermService() |
... | @@ -231,6 +237,7 @@ public class DeviceManager | ... | @@ -231,6 +237,7 @@ public class DeviceManager |
231 | 237 | ||
232 | if (!term.master().equals(clusterService.getLocalNode().id())) { | 238 | if (!term.master().equals(clusterService.getLocalNode().id())) { |
233 | // lost mastership after requestRole told this instance was MASTER. | 239 | // lost mastership after requestRole told this instance was MASTER. |
240 | + log.info("lost mastership before getting term info."); | ||
234 | return; | 241 | return; |
235 | } | 242 | } |
236 | 243 | ||
... | @@ -251,17 +258,13 @@ public class DeviceManager | ... | @@ -251,17 +258,13 @@ public class DeviceManager |
251 | // instance is the new Master, but | 258 | // instance is the new Master, but |
252 | // event returned from the store is null? | 259 | // event returned from the store is null? |
253 | 260 | ||
254 | - // TODO: Confirm: Mastership could be lost after requestRole | ||
255 | - // and createOrUpdateDevice call. | ||
256 | - // In that case STANDBY node can | ||
257 | - // claim itself to be master against the Device. | ||
258 | - // Will the Node, chosen by the MastershipService, retry | ||
259 | - // to get the MASTER role when that happen? | ||
260 | - | ||
261 | // FIXME: 1st argument should be deviceId, to allow setting | 261 | // FIXME: 1st argument should be deviceId, to allow setting |
262 | // certain roles even if the store returned null. | 262 | // certain roles even if the store returned null. |
263 | + log.info("event: {} {}", event.type(), event); | ||
263 | provider().roleChanged(event.subject(), role); | 264 | provider().roleChanged(event.subject(), role); |
264 | post(event); | 265 | post(event); |
266 | + } else { | ||
267 | + log.info("No event to publish"); | ||
265 | } | 268 | } |
266 | } | 269 | } |
267 | 270 | ||
... | @@ -270,32 +273,34 @@ public class DeviceManager | ... | @@ -270,32 +273,34 @@ public class DeviceManager |
270 | checkNotNull(deviceId, DEVICE_ID_NULL); | 273 | checkNotNull(deviceId, DEVICE_ID_NULL); |
271 | checkValidity(); | 274 | checkValidity(); |
272 | 275 | ||
273 | - // FIXME: only the MASTER should be marking off-line in normal cases, | ||
274 | - // but if I was the last STANDBY connection, etc. and no one else | ||
275 | - // was there to mark the device offline, this instance may need to | ||
276 | - // temporarily request for Master Role and mark offline. | ||
277 | - if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) { | ||
278 | - log.debug("Device {} disconnected, but I am not the master", deviceId); | ||
279 | - //let go of ability to be backup | ||
280 | - mastershipService.relinquishMastership(deviceId); | ||
281 | - return; | ||
282 | - } | ||
283 | 276 | ||
284 | DeviceEvent event = null; | 277 | DeviceEvent event = null; |
285 | try { | 278 | try { |
286 | event = store.markOffline(deviceId); | 279 | event = store.markOffline(deviceId); |
287 | } catch (IllegalStateException e) { | 280 | } catch (IllegalStateException e) { |
281 | + log.warn("Failed to mark {} offline", deviceId); | ||
282 | + // only the MASTER should be marking off-line in normal cases, | ||
283 | + // but if I was the last STANDBY connection, etc. and no one else | ||
284 | + // was there to mark the device offline, this instance may need to | ||
285 | + // temporarily request for Master Role and mark offline. | ||
286 | + | ||
288 | //there are times when this node will correctly have mastership, BUT | 287 | //there are times when this node will correctly have mastership, BUT |
289 | //that isn't reflected in the ClockManager before the device disconnects. | 288 | //that isn't reflected in the ClockManager before the device disconnects. |
290 | //we want to let go of the device anyways, so make sure this happens. | 289 | //we want to let go of the device anyways, so make sure this happens. |
291 | 290 | ||
292 | - // FIXME: Come up with workaround for above scenario. | 291 | + // FIXME: Store semantics leaking out as IllegalStateException. |
292 | + // Consider revising store API to handle this scenario. | ||
293 | + | ||
294 | + MastershipRole role = mastershipService.requestRoleFor(deviceId); | ||
293 | MastershipTerm term = termService.getMastershipTerm(deviceId); | 295 | MastershipTerm term = termService.getMastershipTerm(deviceId); |
294 | final NodeId myNodeId = clusterService.getLocalNode().id(); | 296 | final NodeId myNodeId = clusterService.getLocalNode().id(); |
295 | // TODO: Move this type of check inside device clock manager, etc. | 297 | // TODO: Move this type of check inside device clock manager, etc. |
296 | if (myNodeId.equals(term.master())) { | 298 | if (myNodeId.equals(term.master())) { |
299 | + log.info("Marking {} offline", deviceId); | ||
297 | deviceClockProviderService.setMastershipTerm(deviceId, term); | 300 | deviceClockProviderService.setMastershipTerm(deviceId, term); |
298 | event = store.markOffline(deviceId); | 301 | event = store.markOffline(deviceId); |
302 | + } else { | ||
303 | + log.error("Failed again marking {} offline. {}", deviceId, role); | ||
299 | } | 304 | } |
300 | } finally { | 305 | } finally { |
301 | //relinquish master role and ability to be backup. | 306 | //relinquish master role and ability to be backup. |
... | @@ -315,14 +320,6 @@ public class DeviceManager | ... | @@ -315,14 +320,6 @@ public class DeviceManager |
315 | checkNotNull(portDescriptions, | 320 | checkNotNull(portDescriptions, |
316 | "Port descriptions list cannot be null"); | 321 | "Port descriptions list cannot be null"); |
317 | checkValidity(); | 322 | checkValidity(); |
318 | - //XXX what's this doing here? | ||
319 | - this.provider().id(); | ||
320 | - | ||
321 | - if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) { | ||
322 | - // TODO If we become master, then we'll trigger something to update this | ||
323 | - // info to fix any inconsistencies that may result during the handoff. | ||
324 | - return; | ||
325 | - } | ||
326 | 323 | ||
327 | List<DeviceEvent> events = store.updatePorts(this.provider().id(), | 324 | List<DeviceEvent> events = store.updatePorts(this.provider().id(), |
328 | deviceId, portDescriptions); | 325 | deviceId, portDescriptions); |
... | @@ -338,13 +335,8 @@ public class DeviceManager | ... | @@ -338,13 +335,8 @@ public class DeviceManager |
338 | checkNotNull(portDescription, PORT_DESCRIPTION_NULL); | 335 | checkNotNull(portDescription, PORT_DESCRIPTION_NULL); |
339 | checkValidity(); | 336 | checkValidity(); |
340 | 337 | ||
341 | - if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) { | 338 | + final DeviceEvent event = store.updatePortStatus(this.provider().id(), |
342 | - // TODO If we become master, then we'll trigger something to update this | 339 | + deviceId, portDescription); |
343 | - // info to fix any inconsistencies that may result during the handoff. | ||
344 | - return; | ||
345 | - } | ||
346 | - DeviceEvent event = store.updatePortStatus(this.provider().id(), | ||
347 | - deviceId, portDescription); | ||
348 | if (event != null) { | 340 | if (event != null) { |
349 | log.info("Device {} port {} status changed", deviceId, event | 341 | log.info("Device {} port {} status changed", deviceId, event |
350 | .port().number()); | 342 | .port().number()); |
... | @@ -407,15 +399,16 @@ public class DeviceManager | ... | @@ -407,15 +399,16 @@ public class DeviceManager |
407 | Device device = getDevice(did); | 399 | Device device = getDevice(did); |
408 | if ((device != null) && !isAvailable(did)) { | 400 | if ((device != null) && !isAvailable(did)) { |
409 | //flag the device as online. Is there a better way to do this? | 401 | //flag the device as online. Is there a better way to do this? |
410 | - store.createOrUpdateDevice(device.providerId(), did, | 402 | + DeviceEvent devEvent = store.createOrUpdateDevice(device.providerId(), did, |
411 | new DefaultDeviceDescription( | 403 | new DefaultDeviceDescription( |
412 | did.uri(), device.type(), device.manufacturer(), | 404 | did.uri(), device.type(), device.manufacturer(), |
413 | device.hwVersion(), device.swVersion(), | 405 | device.hwVersion(), device.swVersion(), |
414 | device.serialNumber(), device.chassisId())); | 406 | device.serialNumber(), device.chassisId())); |
407 | + post(devEvent); | ||
415 | } | 408 | } |
416 | - //TODO re-collect device information to fix potential staleness | ||
417 | - queryPortInfo(did); | ||
418 | applyRole(did, MastershipRole.MASTER); | 409 | applyRole(did, MastershipRole.MASTER); |
410 | + // re-collect device information to fix potential staleness | ||
411 | + queryPortInfo(did); | ||
419 | } else if (event.roleInfo().backups().contains(myNodeId)) { | 412 | } else if (event.roleInfo().backups().contains(myNodeId)) { |
420 | applyRole(did, MastershipRole.STANDBY); | 413 | applyRole(did, MastershipRole.STANDBY); |
421 | } | 414 | } | ... | ... |
... | @@ -328,8 +328,8 @@ public class GossipDeviceStore | ... | @@ -328,8 +328,8 @@ public class GossipDeviceStore |
328 | final Timestamp timestamp = deviceClockService.getTimestamp(deviceId); | 328 | final Timestamp timestamp = deviceClockService.getTimestamp(deviceId); |
329 | final DeviceEvent event = markOfflineInternal(deviceId, timestamp); | 329 | final DeviceEvent event = markOfflineInternal(deviceId, timestamp); |
330 | if (event != null) { | 330 | if (event != null) { |
331 | - log.info("Notifying peers of a device offline topology event for deviceId: {}", | 331 | + log.info("Notifying peers of a device offline topology event for deviceId: {} {}", |
332 | - deviceId); | 332 | + deviceId, timestamp); |
333 | try { | 333 | try { |
334 | notifyPeers(new InternalDeviceOfflineEvent(deviceId, timestamp)); | 334 | notifyPeers(new InternalDeviceOfflineEvent(deviceId, timestamp)); |
335 | } catch (IOException e) { | 335 | } catch (IOException e) { |
... | @@ -399,7 +399,24 @@ public class GossipDeviceStore | ... | @@ -399,7 +399,24 @@ public class GossipDeviceStore |
399 | DeviceId deviceId, | 399 | DeviceId deviceId, |
400 | List<PortDescription> portDescriptions) { | 400 | List<PortDescription> portDescriptions) { |
401 | 401 | ||
402 | - final Timestamp newTimestamp = deviceClockService.getTimestamp(deviceId); | 402 | + final Timestamp newTimestamp; |
403 | + try { | ||
404 | + newTimestamp = deviceClockService.getTimestamp(deviceId); | ||
405 | + } catch (IllegalStateException e) { | ||
406 | + log.info("Timestamp was not available for device {}", deviceId); | ||
407 | + log.debug(" discarding {}", portDescriptions); | ||
408 | + // Failed to generate timestamp. | ||
409 | + | ||
410 | + // Possible situation: | ||
411 | + // Device connected and became master for short period of time, | ||
412 | + // but lost mastership before this instance had the chance to | ||
413 | + // retrieve term information. | ||
414 | + | ||
415 | + // Information dropped here is expected to be recoverable by | ||
416 | + // device probing after mastership change | ||
417 | + | ||
418 | + return Collections.emptyList(); | ||
419 | + } | ||
403 | log.info("timestamp for {} {}", deviceId, newTimestamp); | 420 | log.info("timestamp for {} {}", deviceId, newTimestamp); |
404 | 421 | ||
405 | final Timestamped<List<PortDescription>> timestampedInput | 422 | final Timestamped<List<PortDescription>> timestampedInput |
... | @@ -580,7 +597,16 @@ public class GossipDeviceStore | ... | @@ -580,7 +597,16 @@ public class GossipDeviceStore |
580 | DeviceId deviceId, | 597 | DeviceId deviceId, |
581 | PortDescription portDescription) { | 598 | PortDescription portDescription) { |
582 | 599 | ||
583 | - final Timestamp newTimestamp = deviceClockService.getTimestamp(deviceId); | 600 | + final Timestamp newTimestamp; |
601 | + try { | ||
602 | + newTimestamp = deviceClockService.getTimestamp(deviceId); | ||
603 | + } catch (IllegalStateException e) { | ||
604 | + log.info("Timestamp was not available for device {}", deviceId); | ||
605 | + log.debug(" discarding {}", portDescription); | ||
606 | + // Failed to generate timestamp. Ignoring. | ||
607 | + // See updatePorts comment | ||
608 | + return null; | ||
609 | + } | ||
584 | final Timestamped<PortDescription> deltaDesc | 610 | final Timestamped<PortDescription> deltaDesc |
585 | = new Timestamped<>(portDescription, newTimestamp); | 611 | = new Timestamped<>(portDescription, newTimestamp); |
586 | final DeviceEvent event; | 612 | final DeviceEvent event; | ... | ... |
... | @@ -111,11 +111,22 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider { | ... | @@ -111,11 +111,22 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider { |
111 | log.error("Device is null."); | 111 | log.error("Device is null."); |
112 | return; | 112 | return; |
113 | } | 113 | } |
114 | + log.trace("{} {} {}", event.type(), event.subject(), event); | ||
114 | switch (event.type()) { | 115 | switch (event.type()) { |
115 | case DEVICE_ADDED: | 116 | case DEVICE_ADDED: |
116 | - discoverers.put(device.id(), | 117 | + case DEVICE_UPDATED: |
117 | - new LinkDiscovery(device, packetSevice, masterService, | 118 | + ld = discoverers.get(device.id()); |
118 | - providerService, useBDDP)); | 119 | + if (ld == null) { |
120 | + log.debug("Device added ({}) {}", event.type(), device.id()); | ||
121 | + discoverers.put(device.id(), | ||
122 | + new LinkDiscovery(device, packetSevice, masterService, | ||
123 | + providerService, useBDDP)); | ||
124 | + } else { | ||
125 | + if (ld.isStopped()) { | ||
126 | + log.debug("Device restarted ({}) {}", event.type(), device.id()); | ||
127 | + ld.start(); | ||
128 | + } | ||
129 | + } | ||
119 | break; | 130 | break; |
120 | case PORT_ADDED: | 131 | case PORT_ADDED: |
121 | case PORT_UPDATED: | 132 | case PORT_UPDATED: |
... | @@ -125,6 +136,7 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider { | ... | @@ -125,6 +136,7 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider { |
125 | return; | 136 | return; |
126 | } | 137 | } |
127 | if (!port.number().isLogical()) { | 138 | if (!port.number().isLogical()) { |
139 | + log.debug("Port added {}", port); | ||
128 | ld.addPort(port); | 140 | ld.addPort(port); |
129 | } | 141 | } |
130 | } else { | 142 | } else { |
... | @@ -134,12 +146,15 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider { | ... | @@ -134,12 +146,15 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider { |
134 | } | 146 | } |
135 | break; | 147 | break; |
136 | case PORT_REMOVED: | 148 | case PORT_REMOVED: |
149 | + log.debug("Port removed {}", port); | ||
137 | ConnectPoint point = new ConnectPoint(device.id(), | 150 | ConnectPoint point = new ConnectPoint(device.id(), |
138 | port.number()); | 151 | port.number()); |
139 | providerService.linksVanished(point); | 152 | providerService.linksVanished(point); |
153 | + // TODO: Don't we need to removePort from ld? | ||
140 | break; | 154 | break; |
141 | case DEVICE_REMOVED: | 155 | case DEVICE_REMOVED: |
142 | case DEVICE_SUSPENDED: | 156 | case DEVICE_SUSPENDED: |
157 | + log.debug("Device removed {}", device.id()); | ||
143 | ld = discoverers.get(device.id()); | 158 | ld = discoverers.get(device.id()); |
144 | if (ld == null) { | 159 | if (ld == null) { |
145 | return; | 160 | return; |
... | @@ -153,15 +168,18 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider { | ... | @@ -153,15 +168,18 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider { |
153 | return; | 168 | return; |
154 | } | 169 | } |
155 | if (deviceService.isAvailable(device.id())) { | 170 | if (deviceService.isAvailable(device.id())) { |
171 | + log.debug("Device up {}", device.id()); | ||
156 | ld.start(); | 172 | ld.start(); |
157 | } else { | 173 | } else { |
158 | providerService.linksVanished(device.id()); | 174 | providerService.linksVanished(device.id()); |
175 | + log.debug("Device down {}", device.id()); | ||
159 | ld.stop(); | 176 | ld.stop(); |
160 | } | 177 | } |
161 | break; | 178 | break; |
162 | - case DEVICE_UPDATED: | ||
163 | case DEVICE_MASTERSHIP_CHANGED: | 179 | case DEVICE_MASTERSHIP_CHANGED: |
164 | if (!discoverers.containsKey(device.id())) { | 180 | if (!discoverers.containsKey(device.id())) { |
181 | + // TODO: ideally, should never reach here | ||
182 | + log.debug("Device mastership changed ({}) {}", event.type(), device.id()); | ||
165 | discoverers.put(device.id(), | 183 | discoverers.put(device.id(), |
166 | new LinkDiscovery(device, packetSevice, masterService, | 184 | new LinkDiscovery(device, packetSevice, masterService, |
167 | providerService, useBDDP)); | 185 | providerService, useBDDP)); | ... | ... |
... | @@ -139,8 +139,8 @@ public class LinkDiscovery implements TimerTask { | ... | @@ -139,8 +139,8 @@ public class LinkDiscovery implements TimerTask { |
139 | * @param port the port | 139 | * @param port the port |
140 | */ | 140 | */ |
141 | public void addPort(final Port port) { | 141 | public void addPort(final Port port) { |
142 | - this.log.debug("sending init probe to port {}", | 142 | + this.log.debug("sending init probe to port {}@{}", |
143 | - port.number().toLong()); | 143 | + port.number().toLong(), device.id()); |
144 | 144 | ||
145 | sendProbes(port.number().toLong()); | 145 | sendProbes(port.number().toLong()); |
146 | 146 | ||
... | @@ -245,7 +245,7 @@ public class LinkDiscovery implements TimerTask { | ... | @@ -245,7 +245,7 @@ public class LinkDiscovery implements TimerTask { |
245 | */ | 245 | */ |
246 | @Override | 246 | @Override |
247 | public void run(final Timeout t) { | 247 | public void run(final Timeout t) { |
248 | - this.log.trace("sending probes"); | 248 | + this.log.trace("sending probes from {}", device.id()); |
249 | synchronized (this) { | 249 | synchronized (this) { |
250 | final Iterator<Long> fastIterator = this.fastPorts.iterator(); | 250 | final Iterator<Long> fastIterator = this.fastPorts.iterator(); |
251 | Long portNumber; | 251 | Long portNumber; |
... | @@ -256,7 +256,7 @@ public class LinkDiscovery implements TimerTask { | ... | @@ -256,7 +256,7 @@ public class LinkDiscovery implements TimerTask { |
256 | .getAndIncrement(); | 256 | .getAndIncrement(); |
257 | 257 | ||
258 | if (probeCount < LinkDiscovery.MAX_PROBE_COUNT) { | 258 | if (probeCount < LinkDiscovery.MAX_PROBE_COUNT) { |
259 | - this.log.trace("sending fast probe to port"); | 259 | + this.log.trace("sending fast probe to port {}", portNumber); |
260 | sendProbes(portNumber); | 260 | sendProbes(portNumber); |
261 | } else { | 261 | } else { |
262 | // Update fast and slow ports | 262 | // Update fast and slow ports |
... | @@ -356,6 +356,7 @@ public class LinkDiscovery implements TimerTask { | ... | @@ -356,6 +356,7 @@ public class LinkDiscovery implements TimerTask { |
356 | if (device.type() != Device.Type.ROADM && | 356 | if (device.type() != Device.Type.ROADM && |
357 | mastershipService.getLocalRole(this.device.id()) == | 357 | mastershipService.getLocalRole(this.device.id()) == |
358 | MastershipRole.MASTER) { | 358 | MastershipRole.MASTER) { |
359 | + log.debug("sending probes out to {}@{}", portNumber, device.id()); | ||
359 | OutboundPacket pkt = this.createOutBoundLLDP(portNumber); | 360 | OutboundPacket pkt = this.createOutBoundLLDP(portNumber); |
360 | pktService.emit(pkt); | 361 | pktService.emit(pkt); |
361 | if (useBDDP) { | 362 | if (useBDDP) { | ... | ... |
... | @@ -96,7 +96,9 @@ public class OpenFlowDeviceProvider extends AbstractProvider implements DevicePr | ... | @@ -96,7 +96,9 @@ public class OpenFlowDeviceProvider extends AbstractProvider implements DevicePr |
96 | // FIXME if possible, we might want this to be part of | 96 | // FIXME if possible, we might want this to be part of |
97 | // OpenFlowSwitch interface so the driver interface isn't misused. | 97 | // OpenFlowSwitch interface so the driver interface isn't misused. |
98 | OpenFlowSwitch sw = controller.getSwitch(dpid(device.id().uri())); | 98 | OpenFlowSwitch sw = controller.getSwitch(dpid(device.id().uri())); |
99 | - if (!((OpenFlowSwitchDriver) sw).isConnected()) { | 99 | + if (sw == null || |
100 | + !((OpenFlowSwitchDriver) sw).isConnected()) { | ||
101 | + LOG.error("Failed to probe device {} on sw={}", device, sw); | ||
100 | providerService.deviceDisconnected(device.id()); | 102 | providerService.deviceDisconnected(device.id()); |
101 | return; | 103 | return; |
102 | } | 104 | } | ... | ... |
-
Please register or login to post a comment