Skip to content

Commit 2e5073b

Browse files
author
Andrew Choi
authored
Kafka 2.4 + KIP-455 for Xinfra Monitor Multie Cluster Topic Management Service (#317)
Kafka 2.4 + KIP-455 for Xinfra Monitor Multi Cluster Topic Management Service Total extends CumulativeSum : A non-sampled cumulative total maintained over all time. This is a non-sampled version of {@link WindowedSum}. Total is deprecated since 2.4. Recommended to use {@link CumulativeSum} instead. (New) CumulativeSum: A non-sampled cumulative total maintained over all time. This is a non-sampled version of {@link WindowedSum}.
1 parent ae646d6 commit 2e5073b

File tree

4 files changed

+100
-38
lines changed

4 files changed

+100
-38
lines changed

build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ allprojects {
3838
compile 'net.savantly:graphite-client:1.1.0-RELEASE'
3939
compile 'com.timgroup:java-statsd-client:3.0.1'
4040
compile 'com.signalfx.public:signalfx-codahale:0.0.47'
41-
compile group: 'org.apache.kafka', name: 'kafka_2.12', version: '2.3.1'
41+
compile group: 'org.apache.kafka', name: 'kafka_2.12', version: '2.4.0'
4242
compile group: 'org.apache.kafka', name: 'kafka-clients', version: '2.3.1'
4343
testCompile 'org.mockito:mockito-core:2.24.0'
4444
testCompile 'org.testng:testng:6.8.8'

src/main/java/com/linkedin/xinfra/monitor/common/Utils.java

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
import java.util.Random;
2626
import java.util.Set;
2727
import java.util.concurrent.ExecutionException;
28+
import java.util.concurrent.TimeUnit;
29+
import java.util.concurrent.TimeoutException;
2830
import javax.management.MBeanAttributeInfo;
2931
import javax.management.MBeanInfo;
3032
import javax.management.MBeanServer;
@@ -37,7 +39,10 @@
3739
import org.apache.avro.io.JsonEncoder;
3840
import org.apache.kafka.clients.admin.AdminClient;
3941
import org.apache.kafka.clients.admin.CreateTopicsResult;
42+
import org.apache.kafka.clients.admin.ListPartitionReassignmentsResult;
4043
import org.apache.kafka.clients.admin.NewTopic;
44+
import org.apache.kafka.clients.admin.PartitionReassignment;
45+
import org.apache.kafka.common.TopicPartition;
4146
import org.apache.kafka.common.errors.TopicExistsException;
4247
import org.json.JSONObject;
4348
import org.slf4j.Logger;
@@ -51,6 +56,10 @@ public class Utils {
5156
private static final Logger LOG = LoggerFactory.getLogger(Utils.class);
5257
public static final int ZK_CONNECTION_TIMEOUT_MS = 30_000;
5358
public static final int ZK_SESSION_TIMEOUT_MS = 30_000;
59+
private static final long LIST_PARTITION_REASSIGNMENTS_TIMEOUT_MS = 60000L;
60+
private static final int LIST_PARTITION_REASSIGNMENTS_MAX_ATTEMPTS = 3;
61+
private static final String LIST_PARTITION_REASSIGNMENTS_TIMEOUT_MS_CONFIG = "list.partition.reassignment.timeout.ms";
62+
private static final int DEFAULT_RETRY_BACKOFF_BASE = 2;
5463

5564
public static String prettyPrint(Object value) throws JsonProcessingException {
5665
ObjectMapper objectMapper = new ObjectMapper();
@@ -61,6 +70,41 @@ public static String prettyPrint(Object value) throws JsonProcessingException {
6170
return written;
6271
}
6372

73+
/**
74+
* Retrieve the map of {@link PartitionReassignment reassignment} by {@link TopicPartition partitions}.
75+
*
76+
* If the response times out, the method retries up to {@link #LIST_PARTITION_REASSIGNMENTS_MAX_ATTEMPTS} times.
77+
* The max time to wait for the {@link AdminClient adminClient} response is computed.
78+
*
79+
* @param adminClient The {@link AdminClient adminClient} to ask for ongoing partition reassignments
80+
* @return The map of {@link PartitionReassignment reassignment} by {@link TopicPartition partitions}
81+
*/
82+
public static Map<TopicPartition, PartitionReassignment> ongoingPartitionReassignments(AdminClient adminClient)
83+
throws InterruptedException, ExecutionException, TimeoutException {
84+
Map<TopicPartition, PartitionReassignment> partitionReassignments = null;
85+
int attempts = 0;
86+
long timeoutMs = LIST_PARTITION_REASSIGNMENTS_TIMEOUT_MS;
87+
do {
88+
ListPartitionReassignmentsResult responseResult = adminClient.listPartitionReassignments();
89+
try {
90+
// A successful response is expected to be non-null.
91+
partitionReassignments = responseResult.reassignments().get(timeoutMs, TimeUnit.MILLISECONDS);
92+
} catch (TimeoutException timeoutException) {
93+
LOG.info(
94+
"Xinfra Monitor has failed to list partition reassignments in {}ms (attempt={}). "
95+
+ "Please consider increasing the value of {} config.",
96+
timeoutMs, 1 + attempts, LIST_PARTITION_REASSIGNMENTS_TIMEOUT_MS_CONFIG);
97+
attempts++;
98+
if (attempts == LIST_PARTITION_REASSIGNMENTS_MAX_ATTEMPTS) {
99+
throw timeoutException;
100+
}
101+
timeoutMs *= DEFAULT_RETRY_BACKOFF_BASE;
102+
}
103+
} while (partitionReassignments == null);
104+
105+
return partitionReassignments;
106+
}
107+
64108
public static List<Integer> replicaIdentifiers(Set<BrokerMetadata> brokers) {
65109
if (brokers == null || brokers.size() == 0) {
66110
throw new IllegalArgumentException("brokers are either null or empty.");

src/main/java/com/linkedin/xinfra/monitor/services/ConsumeService.java

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,12 @@
1111
package com.linkedin.xinfra.monitor.services;
1212

1313
import com.linkedin.xinfra.monitor.common.DefaultTopicSchema;
14-
import com.linkedin.xinfra.monitor.services.metrics.CommitAvailabilityMetrics;
15-
import com.linkedin.xinfra.monitor.services.metrics.CommitLatencyMetrics;
16-
import com.linkedin.xinfra.monitor.services.metrics.ConsumeMetrics;
1714
import com.linkedin.xinfra.monitor.common.Utils;
1815
import com.linkedin.xinfra.monitor.consumer.BaseConsumerRecord;
1916
import com.linkedin.xinfra.monitor.consumer.KMBaseConsumer;
17+
import com.linkedin.xinfra.monitor.services.metrics.CommitAvailabilityMetrics;
18+
import com.linkedin.xinfra.monitor.services.metrics.CommitLatencyMetrics;
19+
import com.linkedin.xinfra.monitor.services.metrics.ConsumeMetrics;
2020
import java.util.ArrayList;
2121
import java.util.Collections;
2222
import java.util.HashMap;
@@ -40,7 +40,7 @@
4040
import org.apache.kafka.common.metrics.Metrics;
4141
import org.apache.kafka.common.metrics.MetricsReporter;
4242
import org.apache.kafka.common.metrics.Sensor;
43-
import org.apache.kafka.common.metrics.stats.Total;
43+
import org.apache.kafka.common.metrics.stats.CumulativeSum;
4444
import org.apache.kafka.common.utils.SystemTime;
4545
import org.slf4j.Logger;
4646
import org.slf4j.LoggerFactory;
@@ -244,7 +244,8 @@ public synchronized void start() {
244244
@SuppressWarnings("ConstantConditions")
245245
double partitionCount = topicDescription.partitions().size();
246246
topicPartitionCount.add(
247-
new MetricName("topic-partitions-count", METRIC_GROUP_NAME, "The total number of partitions for the topic.", tags), new Total(partitionCount));
247+
new MetricName("topic-partitions-count", METRIC_GROUP_NAME, "The total number of partitions for the topic.",
248+
tags), new CumulativeSum(partitionCount));
248249
}
249250
}
250251

src/main/java/com/linkedin/xinfra/monitor/services/MultiClusterTopicManagementService.java

Lines changed: 49 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@
1111
package com.linkedin.xinfra.monitor.services;
1212

1313
import com.linkedin.xinfra.monitor.common.Utils;
14-
import com.linkedin.xinfra.monitor.services.configs.MultiClusterTopicManagementServiceConfig;
15-
import com.linkedin.xinfra.monitor.topicfactory.TopicFactory;
1614
import com.linkedin.xinfra.monitor.services.configs.CommonServiceConfig;
15+
import com.linkedin.xinfra.monitor.services.configs.MultiClusterTopicManagementServiceConfig;
1716
import com.linkedin.xinfra.monitor.services.configs.TopicManagementServiceConfig;
17+
import com.linkedin.xinfra.monitor.topicfactory.TopicFactory;
1818
import java.io.IOException;
1919
import java.util.ArrayList;
2020
import java.util.Collection;
@@ -23,6 +23,7 @@
2323
import java.util.HashSet;
2424
import java.util.List;
2525
import java.util.Map;
26+
import java.util.Optional;
2627
import java.util.Properties;
2728
import java.util.Random;
2829
import java.util.Set;
@@ -31,15 +32,18 @@
3132
import java.util.concurrent.Executors;
3233
import java.util.concurrent.ScheduledExecutorService;
3334
import java.util.concurrent.TimeUnit;
35+
import java.util.concurrent.TimeoutException;
3436
import java.util.concurrent.atomic.AtomicBoolean;
3537
import kafka.admin.AdminUtils;
3638
import kafka.admin.BrokerMetadata;
3739
import kafka.server.ConfigType;
3840
import kafka.zk.KafkaZkClient;
3941
import org.apache.kafka.clients.admin.AdminClient;
4042
import org.apache.kafka.clients.admin.AdminClientConfig;
43+
import org.apache.kafka.clients.admin.AlterPartitionReassignmentsResult;
4144
import org.apache.kafka.clients.admin.CreatePartitionsResult;
4245
import org.apache.kafka.clients.admin.ElectPreferredLeadersResult;
46+
import org.apache.kafka.clients.admin.NewPartitionReassignment;
4347
import org.apache.kafka.clients.admin.NewPartitions;
4448
import org.apache.kafka.clients.admin.NewTopic;
4549
import org.apache.kafka.clients.admin.TopicDescription;
@@ -54,6 +58,7 @@
5458
import org.slf4j.Logger;
5559
import org.slf4j.LoggerFactory;
5660
import scala.Option$;
61+
import scala.collection.JavaConverters;
5762
import scala.collection.Seq;
5863

5964

@@ -184,7 +189,7 @@ public void run() {
184189
TopicManagementHelper helper = entry.getValue();
185190
try {
186191
helper.maybeReassignPartitionAndElectLeader();
187-
} catch (IOException | KafkaException e) {
192+
} catch (KafkaException e) {
188193
LOGGER.warn(_serviceName + "/MultiClusterTopicManagementService will retry later in cluster " + clusterName, e);
189194
}
190195
}
@@ -405,7 +410,7 @@ private Set<Node> getAvailableBrokers() throws ExecutionException, InterruptedEx
405410
return brokers;
406411
}
407412

408-
void maybeReassignPartitionAndElectLeader() throws Exception {
413+
void maybeReassignPartitionAndElectLeader() throws ExecutionException, InterruptedException, TimeoutException {
409414
try (KafkaZkClient zkClient = KafkaZkClient.apply(_zkConnect, JaasUtils.isZkSecurityEnabled(),
410415
Utils.ZK_SESSION_TIMEOUT_MS, Utils.ZK_CONNECTION_TIMEOUT_MS,
411416
Integer.MAX_VALUE, Time.SYSTEM, METRIC_GROUP_NAME, "SessionExpireListener", null)) {
@@ -426,13 +431,12 @@ void maybeReassignPartitionAndElectLeader() throws Exception {
426431
"Configured replication factor {} is smaller than the current replication factor {} of the topic {} in cluster.",
427432
_replicationFactor, currentReplicationFactor, _topic);
428433

429-
if (expectedReplicationFactor > currentReplicationFactor && !zkClient
430-
.reassignPartitionsInProgress()) {
434+
if (expectedReplicationFactor > currentReplicationFactor
435+
&& Utils.ongoingPartitionReassignments(_adminClient).isEmpty()) {
431436
LOGGER.info(
432437
"MultiClusterTopicManagementService will increase the replication factor of the topic {} in cluster"
433438
+ "from {} to {}", _topic, currentReplicationFactor, expectedReplicationFactor);
434-
reassignPartitions(zkClient, brokers, _topic, partitionInfoList.size(),
435-
expectedReplicationFactor);
439+
reassignPartitions(_adminClient, brokers, _topic, partitionInfoList.size(), expectedReplicationFactor);
436440
partitionReassigned = true;
437441
}
438442

@@ -450,19 +454,16 @@ void maybeReassignPartitionAndElectLeader() throws Exception {
450454
zkClient.setOrCreateEntityConfigs(ConfigType.Topic(), _topic, expectedProperties);
451455
}
452456

453-
if (partitionInfoList.size() >= brokers.size() &&
454-
someBrokerNotPreferredLeader(partitionInfoList, brokers) && !zkClient
455-
.reassignPartitionsInProgress()) {
456-
LOGGER.info("{} will reassign partitions of the topic {} in cluster.",
457-
this.getClass().toString(), _topic);
458-
reassignPartitions(zkClient, brokers, _topic, partitionInfoList.size(),
459-
expectedReplicationFactor);
457+
if (partitionInfoList.size() >= brokers.size() && someBrokerNotPreferredLeader(partitionInfoList, brokers)
458+
&& Utils.ongoingPartitionReassignments(_adminClient).isEmpty()) {
459+
LOGGER.info("{} will reassign partitions of the topic {} in cluster.", this.getClass().toString(), _topic);
460+
reassignPartitions(_adminClient, brokers, _topic, partitionInfoList.size(), expectedReplicationFactor);
460461
partitionReassigned = true;
461462
}
462463

463464
if (partitionInfoList.size() >= brokers.size() &&
464465
someBrokerNotElectedLeader(partitionInfoList, brokers)) {
465-
if (!partitionReassigned || !zkClient.reassignPartitionsInProgress()) {
466+
if (!partitionReassigned || Utils.ongoingPartitionReassignments(_adminClient).isEmpty()) {
466467
LOGGER.info(
467468
"MultiClusterTopicManagementService will trigger preferred leader election for the topic {} in "
468469
+ "cluster.", _topic
@@ -483,7 +484,7 @@ void maybeElectLeader() throws Exception {
483484

484485
try (KafkaZkClient zkClient = KafkaZkClient.apply(_zkConnect, JaasUtils.isZkSecurityEnabled(), Utils.ZK_SESSION_TIMEOUT_MS,
485486
Utils.ZK_CONNECTION_TIMEOUT_MS, Integer.MAX_VALUE, Time.SYSTEM, METRIC_GROUP_NAME, "SessionExpireListener", null)) {
486-
if (!zkClient.reassignPartitionsInProgress()) {
487+
if (Utils.ongoingPartitionReassignments(_adminClient).isEmpty()) {
487488
List<TopicPartitionInfo> partitionInfoList = _adminClient
488489
.describeTopics(Collections.singleton(_topic)).all().get().get(_topic).partitions();
489490
LOGGER.info(
@@ -495,8 +496,7 @@ void maybeElectLeader() throws Exception {
495496
}
496497
}
497498

498-
private void triggerPreferredLeaderElection(List<TopicPartitionInfo> partitionInfoList, String partitionTopic)
499-
throws ExecutionException, InterruptedException {
499+
private void triggerPreferredLeaderElection(List<TopicPartitionInfo> partitionInfoList, String partitionTopic) {
500500
Collection<TopicPartition> partitions = new HashSet<>();
501501
for (TopicPartitionInfo javaPartitionInfo : partitionInfoList) {
502502
partitions.add(new TopicPartition(partitionTopic, javaPartitionInfo.partition()));
@@ -506,37 +506,54 @@ private void triggerPreferredLeaderElection(List<TopicPartitionInfo> partitionIn
506506
LOGGER.info("{}: triggerPreferredLeaderElection - {}", this.getClass().toString(), electPreferredLeadersResult.all());
507507
}
508508

509-
private static void reassignPartitions(KafkaZkClient zkClient, Collection<Node> brokers, String topic, int partitionCount, int replicationFactor) {
510-
scala.collection.mutable.ArrayBuffer<BrokerMetadata> brokersMetadata = new scala.collection.mutable.ArrayBuffer<>(brokers.size());
509+
private static void reassignPartitions(AdminClient adminClient, Collection<Node> brokers, String topic,
510+
int partitionCount, int replicationFactor) {
511+
512+
scala.collection.mutable.ArrayBuffer<BrokerMetadata> brokersMetadata =
513+
new scala.collection.mutable.ArrayBuffer<>(brokers.size());
511514
for (Node broker : brokers) {
512515
brokersMetadata.$plus$eq(new BrokerMetadata(broker.id(), Option$.MODULE$.apply(broker.rack())));
513516
}
514517
scala.collection.Map<Object, Seq<Object>> assignedReplicas =
515518
AdminUtils.assignReplicasToBrokers(brokersMetadata, partitionCount, replicationFactor, 0, 0);
516-
517-
scala.collection.immutable.Map<TopicPartition, Seq<Object>> newAssignment = new scala.collection.immutable.HashMap<>();
519+
scala.collection.immutable.Map<TopicPartition, Seq<Object>> newAssignment =
520+
new scala.collection.immutable.HashMap<>();
518521
scala.collection.Iterator<scala.Tuple2<Object, scala.collection.Seq<Object>>> it = assignedReplicas.iterator();
519522
while (it.hasNext()) {
520523
scala.Tuple2<Object, scala.collection.Seq<Object>> scalaTuple = it.next();
521524
TopicPartition tp = new TopicPartition(topic, (Integer) scalaTuple._1);
522525
newAssignment = newAssignment.$plus(new scala.Tuple2<>(tp, scalaTuple._2));
523526
}
524527

525-
scala.collection.immutable.Set<String> topicList = new scala.collection.immutable.Set.Set1<>(topic);
526-
scala.collection.Map<Object, scala.collection.Seq<Object>> currentAssignment =
527-
zkClient.getPartitionAssignmentForTopics(topicList).apply(topic);
528-
String currentAssignmentJson = formatAsNewReassignmentJson(topic, currentAssignment);
529528
String newAssignmentJson = formatAsNewReassignmentJson(topic, assignedReplicas);
530-
531529
LOGGER.info("Reassign partitions for topic " + topic);
532-
LOGGER.info("Current partition replica assignment " + currentAssignmentJson);
533-
LOGGER.info("New partition replica assignment " + newAssignmentJson);
534-
zkClient.createPartitionReassignment(newAssignment);
530+
LOGGER.info("New topic partition replica assignments: {}", newAssignmentJson);
531+
532+
Set<Map.Entry<TopicPartition, Seq<Object>>> newAssignmentMap =
533+
scala.collection.JavaConverters.mapAsJavaMap(newAssignment).entrySet();
534+
Map<TopicPartition, Optional<NewPartitionReassignment>> reassignments = new HashMap<>();
535+
for (Map.Entry<TopicPartition, Seq<Object>> topicPartitionSeqEntry : newAssignmentMap) {
536+
List<Integer> targetReplicas = new ArrayList<>();
537+
List<Object> replicas = JavaConverters.seqAsJavaList(topicPartitionSeqEntry.getValue());
538+
for (Object replica : replicas) {
539+
targetReplicas.add((int) replica);
540+
}
541+
NewPartitionReassignment newPartitionReassignment = new NewPartitionReassignment(targetReplicas);
542+
reassignments.put(topicPartitionSeqEntry.getKey(), Optional.of(newPartitionReassignment));
543+
}
544+
AlterPartitionReassignmentsResult alterPartitionReassignmentsResult =
545+
adminClient.alterPartitionReassignments(reassignments);
546+
try {
547+
alterPartitionReassignmentsResult.all().get();
548+
} catch (InterruptedException | ExecutionException e) {
549+
LOGGER.error("An exception occurred while altering the partition reassignments for {}", topic, e);
550+
}
535551
}
536552

537553
static int getReplicationFactor(List<TopicPartitionInfo> partitionInfoList) {
538-
if (partitionInfoList.isEmpty())
554+
if (partitionInfoList.isEmpty()) {
539555
throw new RuntimeException("Partition list is empty.");
556+
}
540557

541558
int replicationFactor = partitionInfoList.get(0).replicas().size();
542559
for (TopicPartitionInfo partitionInfo : partitionInfoList) {

0 commit comments

Comments
 (0)