Jonathan Hart

Automatically rebalance intent key partitions on cluster change.

Also sort the output of the leaders command by leader IP.

Change-Id: Ie85896a4f6f50489ebd7994c905808ce34fca94c
......@@ -20,6 +20,7 @@ import org.onosproject.cli.AbstractShellCommand;
import org.onosproject.cluster.Leadership;
import org.onosproject.cluster.LeadershipService;
import java.util.Comparator;
import java.util.Map;
/**
......@@ -29,17 +30,32 @@ import java.util.Map;
description = "Finds the leader for particular topic.")
public class LeaderCommand extends AbstractShellCommand {
private static final String FMT = "%-20s: %15s %5s";
private static final String FMT = "%-20s: %15s %15s";
@Override
protected void execute() {
LeadershipService leaderService = get(LeadershipService.class);
Map<String, Leadership> leaderBoard = leaderService.getLeaderBoard();
print(FMT, "Topic", "Leader", "Epoch");
for (String topic : leaderBoard.keySet()) {
Leadership leadership = leaderBoard.get(topic);
print(FMT, topic, leadership.leader(), leadership.epoch());
Comparator<Leadership> leadershipComparator =
(e1, e2) -> {
if (e1.leader() == null && e2.leader() == null) {
return 0;
}
if (e1.leader() == null) {
return 1;
}
if (e2.leader() == null) {
return -1;
}
return e1.leader().toString().compareTo(e2.leader().toString());
};
leaderBoard.values()
.stream()
.sorted(leadershipComparator)
.forEach(l -> print(FMT, l.topic(), l.leader(), l.epoch()));
}
}
......
......@@ -24,17 +24,26 @@ import java.util.Objects;
* processed by a single ONOS instance at a time.
*/
public class PartitionId {
// Numeric identifier of this partition; an int so that value() can return it directly.
private final int id;

/**
 * Creates a new partition ID.
 *
 * @param id the partition ID
 */
PartitionId(int id) {
    this.id = id;
}
/**
 * Exposes the numeric value backing this partition ID.
 *
 * @return the partition ID as an {@code int}
 */
public int value() {
    return this.id;
}
@Override
public boolean equals(Object o) {
if (!(o instanceof PartitionId)) {
......
......@@ -21,7 +21,10 @@ import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.ReferenceCardinality;
import org.apache.felix.scr.annotations.Service;
import org.onosproject.cluster.ClusterEvent;
import org.onosproject.cluster.ClusterEventListener;
import org.onosproject.cluster.ClusterService;
import org.onosproject.cluster.ControllerNode;
import org.onosproject.cluster.Leadership;
import org.onosproject.cluster.LeadershipEvent;
import org.onosproject.cluster.LeadershipEventListener;
......@@ -31,8 +34,12 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Collections;
import java.util.Iterator;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
/**
* Manages the assignment of intent keyspace partitions to instances.
......@@ -49,35 +56,48 @@ public class PartitionManager implements PartitionService {
@Reference(cardinality = ReferenceCardinality.MANDATORY_UNARY)
protected ClusterService clusterService;
// TODO make configurable
// Number of partitions; one leadership topic is contested per partition.
private static final int NUM_PARTITIONS = 32;

// Delay, in seconds, before re-running for a partition that was relinquished.
private static final int BACKOFF_TIME = 2;

// Interval, in seconds, between periodic fair-share checks.
private static final int CHECK_PERIOD = 10;

// Prefix of the leadership topic name used for each partition.
private static final String ELECTION_PREFIX = "intent-partition-";

private LeadershipEventListener leaderListener = new InternalLeadershipListener();
private ClusterEventListener clusterListener = new InternalClusterEventListener();

// Partitions currently recorded as owned by this instance. The set is
// backed by a ConcurrentHashMap and initialised once, so it is never null.
private final Set<PartitionId> myPartitions
        = Collections.newSetFromMap(new ConcurrentHashMap<>());

// Single-threaded scheduler for the periodic fair-share check and for
// delayed recontest tasks.
private ScheduledExecutorService executor = Executors
        .newScheduledThreadPool(1);
/**
 * Registers the leadership and cluster listeners, enters the election for
 * every partition, and starts the periodic fair-share check.
 */
@Activate
public void activate() {
    leadershipService.addListener(leaderListener);
    clusterService.addListener(clusterListener);

    // Contest each partition topic exactly once.
    for (int i = 0; i < NUM_PARTITIONS; i++) {
        leadershipService.runForLeadership(getPartitionPath(i));
    }

    // First check runs immediately, then every CHECK_PERIOD seconds.
    executor.scheduleAtFixedRate(this::doRelinquish, 0,
                                 CHECK_PERIOD, TimeUnit.SECONDS);
}
/**
 * Stops the background scheduler and unregisters the leadership and
 * cluster listeners.
 */
@Deactivate
public void deactivate() {
    // Cancel the periodic relinquish check and any pending recontest tasks.
    // Without this the pool thread outlives the component and keeps
    // calling into the leadership service after deactivation.
    executor.shutdownNow();
    leadershipService.removeListener(leaderListener);
    clusterService.removeListener(clusterListener);
}
/**
 * Builds the leadership topic name for a partition.
 *
 * @param i partition number
 * @return the election prefix followed by the partition number
 */
private String getPartitionPath(int i) {
    return ELECTION_PREFIX.concat(Integer.toString(i));
}
private PartitionId getPartitionForKey(Key intentKey) {
log.debug("Getting partition for {}: {}", intentKey,
new PartitionId(Math.abs(intentKey.hash()) % NUM_PARTITIONS));
return new PartitionId(Math.abs(intentKey.hash()) % NUM_PARTITIONS);
new PartitionId((int) Math.abs(intentKey.hash()) % NUM_PARTITIONS));
return new PartitionId((int) Math.abs(intentKey.hash()) % NUM_PARTITIONS);
}
@Override
......@@ -85,6 +105,58 @@ public class PartitionManager implements PartitionService {
return myPartitions.contains(getPartitionForKey(intentKey));
}
/**
 * Runs {@link #relinquish()} and logs, rather than propagates, any
 * exception it throws.
 *
 * This is the task handed to the scheduled executor. Under the
 * ScheduledExecutorService contract a periodic task that throws is not
 * run again, so failures are contained here.
 */
private void doRelinquish() {
    try {
        relinquish();
    } catch (Exception ex) {
        log.warn("Exception caught during relinquish task", ex);
    }
}
/**
 * Gives up leadership of surplus partitions.
 *
 * The fair share is NUM_PARTITIONS divided by the number of ACTIVE cluster
 * nodes, rounded up. If this instance holds more than that, it withdraws
 * from the excess partitions and schedules itself to run for each of them
 * again after BACKOFF_TIME seconds, giving other instances a window to
 * take over.
 */
private void relinquish() {
    long activeCount = clusterService.getNodes()
            .stream()
            .map(node -> clusterService.getState(node.id()))
            .filter(state -> state == ControllerNode.State.ACTIVE)
            .count();
    int activeNodes = (int) activeCount;
    int fairShare = (int) Math.ceil((double) NUM_PARTITIONS / activeNodes);

    synchronized (myPartitions) {
        int surplus = myPartitions.size() - fairShare;
        if (surplus > 0) {
            Iterator<PartitionId> owned = myPartitions.iterator();
            while (surplus-- > 0) {
                // Drop the partition locally before withdrawing from its election.
                PartitionId partition = owned.next();
                owned.remove();

                String topic = getPartitionPath(partition.value());
                leadershipService.withdraw(topic);
                executor.schedule(() -> recontest(topic),
                                  BACKOFF_TIME, TimeUnit.SECONDS);
            }
        }
    }
}
/**
 * Re-enters the leadership election for a partition topic.
 *
 * @param path name of the leadership topic to run for
 */
private void recontest(String path) {
    leadershipService.runForLeadership(path);
}
private final class InternalLeadershipListener implements LeadershipEventListener {
@Override
......@@ -109,12 +181,26 @@ public class PartitionManager implements PartitionService {
return;
}
synchronized (myPartitions) {
if (event.type() == LeadershipEvent.Type.LEADER_ELECTED) {
myPartitions.add(new PartitionId(partitionId));
} else if (event.type() == LeadershipEvent.Type.LEADER_BOOTED) {
myPartitions.remove(new PartitionId(partitionId));
}
}
// See if we need to let some partitions go
relinquish();
}
}
}
/**
 * Re-evaluates this instance's share of partitions whenever a cluster
 * event is delivered.
 */
private final class InternalClusterEventListener
        implements ClusterEventListener {

    @Override
    public void event(ClusterEvent event) {
        // Every cluster event triggers the check, whatever its type.
        PartitionManager.this.relinquish();
    }
}
}
......