Thomas Vachuska
Committed by Gerrit Code Review

Added a detection mechanism for long-running or dead-locked event sinks and listeners.

Change-Id: I21308b058902a94c31c34c2ec2878cd13213874e
......@@ -51,6 +51,7 @@ public class DefaultEventSinkRegistry implements EventSinkRegistry {
@Override
@SuppressWarnings("unchecked")
public <E extends Event> EventSink<E> getSink(Class<E> eventClass) {
checkNotNull(eventClass, "Event class cannot be null");
return (EventSink<E>) sinks.get(eventClass);
}
......
......@@ -27,4 +27,10 @@ public interface EventSink<E extends Event> {
*/
void process(E event);
/**
* Handles notification that event processing time limit has been exceeded.
*/
default void onProcessLimit() {
}
}
......
......@@ -35,6 +35,9 @@ public class ListenerRegistry<E extends Event, L extends EventListener<E>>
private volatile boolean shutdown = false;
private long lastStart;
private L lastListener;
/**
* Set of listeners that have registered.
*/
......@@ -67,13 +70,26 @@ public class ListenerRegistry<E extends Event, L extends EventListener<E>>
public void process(E event) {
for (L listener : listeners) {
try {
lastListener = listener;
lastStart = System.currentTimeMillis();
listener.event(event);
lastStart = 0;
} catch (Exception error) {
reportProblem(event, error);
}
}
}
@Override
public void onProcessLimit() {
if (lastStart > 0) {
log.error("Listener {} exceeded execution time limit: {} ms; ejected",
lastListener.getClass().getName(),
System.currentTimeMillis() - lastStart);
removeListener(lastListener);
}
}
/**
* Predicate indicating whether we should throw an exception if the
* argument to {@link #removeListener} is not in the current set of
......
......@@ -19,6 +19,7 @@ import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Service;
import org.onlab.util.SharedExecutors;
import org.onosproject.event.AbstractEvent;
import org.onosproject.event.DefaultEventSinkRegistry;
import org.onosproject.event.Event;
......@@ -26,8 +27,10 @@ import org.onosproject.event.EventDeliveryService;
import org.onosproject.event.EventSink;
import org.slf4j.Logger;
import java.util.TimerTask;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import static java.util.concurrent.Executors.newSingleThreadExecutor;
......@@ -42,8 +45,14 @@ import static org.slf4j.LoggerFactory.getLogger;
public class CoreEventDispatcher extends DefaultEventSinkRegistry
implements EventDeliveryService {
// Maximum number of millis a sink can take to process an event.
private static final long MAX_EXECUTE_MS = 1_000;
private static final long WATCHDOG_MS = MAX_EXECUTE_MS / 4;
private final Logger log = getLogger(getClass());
private final BlockingQueue<Event> events = new LinkedBlockingQueue<>();
private final ExecutorService executor =
newSingleThreadExecutor(groupedThreads("onos/event", "dispatch-%d"));
......@@ -51,34 +60,45 @@ public class CoreEventDispatcher extends DefaultEventSinkRegistry
private static final Event KILL_PILL = new AbstractEvent(null, 0) {
};
private final BlockingQueue<Event> events = new LinkedBlockingQueue<>();
private DispatchLoop dispatchLoop;
private volatile boolean stopped = false;
// Means to detect long-running sinks
private TimerTask watchdog;
private EventSink lastSink;
private long lastStart = 0;
private Future<?> dispatchFuture;
@Override
public void post(Event event) {
events.add(event);
if (!events.add(event)) {
log.error("Unable to post event {}", event);
}
}
@Activate
public void activate() {
stopped = false;
executor.execute(new DispatchLoop());
dispatchLoop = new DispatchLoop();
dispatchFuture = executor.submit(dispatchLoop);
watchdog = new Watchdog();
SharedExecutors.getTimer().schedule(watchdog, WATCHDOG_MS, WATCHDOG_MS);
log.info("Started");
}
@Deactivate
public void deactivate() {
stopped = true;
dispatchLoop.stop();
watchdog.cancel();
post(KILL_PILL);
log.info("Stopped");
}
// Auxiliary event dispatching loop that feeds off the events queue.
private class DispatchLoop implements Runnable {
private volatile boolean stopped;
@Override
@SuppressWarnings("unchecked")
public void run() {
stopped = false;
log.info("Dispatch loop initiated");
while (!stopped) {
try {
......@@ -87,22 +107,52 @@ public class CoreEventDispatcher extends DefaultEventSinkRegistry
if (event == KILL_PILL) {
break;
}
// Locate the sink for the event class and use it to
// process the event
EventSink sink = getSink(event.getClass());
if (sink != null) {
sink.process(event);
} else {
log.warn("No sink registered for event class {}",
event.getClass());
}
process(event);
} catch (Exception e) {
log.warn("Error encountered while dispatching event:", e);
}
}
log.info("Dispatch loop terminated");
}
// Locate the sink for the event class and use it to process the event
@SuppressWarnings("unchecked")
private void process(Event event) {
EventSink sink = getSink(event.getClass());
if (sink != null) {
lastSink = sink;
lastStart = System.currentTimeMillis();
sink.process(event);
lastStart = 0;
} else {
log.warn("No sink registered for event class {}",
event.getClass());
}
}
void stop() {
stopped = true;
}
}
// Monitors event sinks to make sure none take too long to execute.
private class Watchdog extends TimerTask {
@Override
public void run() {
long delta = System.currentTimeMillis() - lastStart;
if (lastStart > 0 && delta > MAX_EXECUTE_MS) {
log.error("Event sink {} exceeded execution time limit: {} ms",
lastSink.getClass().getName(), delta);
// Notify the sink that it has exceeded its time limit.
lastSink.onProcessLimit();
// Cancel the old dispatch loop and submit a new one.
dispatchLoop.stop();
dispatchLoop = new DispatchLoop();
dispatchFuture.cancel(true);
dispatchFuture = executor.submit(dispatchLoop);
}
}
}
}
......
......@@ -58,6 +58,7 @@ import org.slf4j.Logger;
import java.util.Collection;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
......@@ -423,8 +424,8 @@ public class DeviceManager
}
@Override
public void receivedRoleReply(
DeviceId deviceId, MastershipRole requested, MastershipRole response) {
public void receivedRoleReply(DeviceId deviceId, MastershipRole requested,
MastershipRole response) {
// Several things can happen here:
// 1. request and response match
// 2. request and response don't match
......@@ -436,7 +437,7 @@ public class DeviceManager
// FIXME: implement response to this notification
log.info("got reply to a role request for {}: asked for {}, and got {}",
deviceId, requested, response);
deviceId, requested, response);
if (requested == null && response == null) {
// something was off with DeviceProvider, maybe check channel too?
......@@ -445,9 +446,8 @@ public class DeviceManager
return;
}
if (requested.equals(response)) {
if (requested.equals(mastershipService.getLocalRole(deviceId))) {
if (Objects.equals(requested, response)) {
if (Objects.equals(requested, mastershipService.getLocalRole(deviceId))) {
return;
} else {
return;
......@@ -464,19 +464,16 @@ public class DeviceManager
//post(new DeviceEvent(DEVICE_MASTERSHIP_CHANGED, device));
}
}
}
@Override
public void updatePortStatistics(DeviceId deviceId, Collection<PortStatistics> portStatistics) {
checkNotNull(deviceId, DEVICE_ID_NULL);
checkNotNull(portStatistics,
"Port statistics list cannot be null");
checkNotNull(portStatistics, "Port statistics list cannot be null");
checkValidity();
DeviceEvent event = store.updatePortStatistics(this.provider().id(),
deviceId, portStatistics);
deviceId, portStatistics);
post(event);
}
}
......@@ -634,8 +631,7 @@ public class DeviceManager
}
// Store delegate to re-post events emitted from the store.
private class InternalStoreDelegate
implements DeviceStoreDelegate {
private class InternalStoreDelegate implements DeviceStoreDelegate {
@Override
public void notify(DeviceEvent event) {
post(event);
......