Commit 86aecc9

feat: Spark micro batch processing (#426)
1 parent ff05b1a commit 86aecc9

File tree: 11 files changed (+669 −37 lines)

google-cloud-pubsublite/src/main/java/com/google/cloud/pubsublite/internal/wire/SubscriberFactory.java

Lines changed: 2 additions & 1 deletion
@@ -19,9 +19,10 @@
 import com.google.api.gax.rpc.ApiException;
 import com.google.cloud.pubsublite.SequencedMessage;
 import com.google.common.collect.ImmutableList;
+import java.io.Serializable;
 import java.util.function.Consumer;
 
-public interface SubscriberFactory {
+public interface SubscriberFactory extends Serializable {
   Subscriber newSubscriber(Consumer<ImmutableList<SequencedMessage>> message_consumer)
       throws ApiException;
 }
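Note: Spark ships InputPartition instances (and everything they carry, including the SubscriberFactory held by PslMicroBatchInputPartition below) from the driver to executors via Java serialization, which is presumably why the factory interface now extends Serializable.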
pubsublite-spark-sql-streaming/src/main/java/com/google/cloud/pubsublite/spark/HeadOffsetReader.java

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.cloud.pubsublite.spark;
+
+import com.google.cloud.pubsublite.TopicPath;
+import com.google.cloud.pubsublite.internal.CheckedApiException;
+import java.io.Closeable;
+
+public interface HeadOffsetReader extends Closeable {
+
+  // Gets the head offsets for all partitions in a topic. Blocks.
+  PslSourceOffset getHeadOffset(TopicPath topic) throws CheckedApiException;
+
+  @Override
+  void close();
+}
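A caller would typically convert the returned PslSourceOffset into Spark's offset type. A minimal usage sketch, assuming some concrete HeadOffsetReader implementation exists (createHeadOffsetReader() is a hypothetical factory, not part of this commit):

// Fetch head offsets for every partition of the topic and convert them into
// the Spark-side offset representation used by the micro-batch reader.
try (HeadOffsetReader reader = createHeadOffsetReader()) { // hypothetical factory
  PslSourceOffset head = reader.getHeadOffset(topicPath);
  SparkSourceOffset end = PslSparkUtils.toSparkSourceOffset(head);
  // ... use `end` as the batch end offset
} catch (CheckedApiException e) {
  throw new IllegalStateException("Unable to read head offset for " + topicPath, e);
}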

pubsublite-spark-sql-streaming/src/main/java/com/google/cloud/pubsublite/spark/PslContinuousReader.java

Lines changed: 6 additions & 25 deletions
@@ -16,17 +16,13 @@
 
 package com.google.cloud.pubsublite.spark;
 
-import com.google.cloud.pubsublite.Partition;
 import com.google.cloud.pubsublite.SubscriptionPath;
 import com.google.cloud.pubsublite.cloudpubsub.FlowControlSettings;
 import com.google.cloud.pubsublite.internal.CursorClient;
 import com.google.common.annotations.VisibleForTesting;
 import java.util.Arrays;
-import java.util.HashMap;
 import java.util.List;
-import java.util.Map;
 import java.util.Optional;
-import java.util.concurrent.ExecutionException;
 import java.util.stream.Collectors;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.sources.v2.reader.InputPartition;
@@ -84,23 +80,8 @@ public void setStartOffset(Optional<Offset> start) {
       startOffset = (SparkSourceOffset) start.get();
       return;
     }
-    try {
-      Map<Partition, com.google.cloud.pubsublite.Offset> pslSourceOffsetMap = new HashMap<>();
-      for (int i = 0; i < topicPartitionCount; i++) {
-        pslSourceOffsetMap.put(Partition.of(i), com.google.cloud.pubsublite.Offset.of(0));
-      }
-      cursorClient
-          .listPartitionCursors(subscriptionPath)
-          .get()
-          .entrySet()
-          .forEach((e) -> pslSourceOffsetMap.replace(e.getKey(), e.getValue()));
-      startOffset =
-          PslSparkUtils.toSparkSourceOffset(
-              PslSourceOffset.builder().partitionOffsetMap(pslSourceOffsetMap).build());
-    } catch (InterruptedException | ExecutionException e) {
-      throw new IllegalStateException(
-          "Failed to get information from PSL and construct startOffset", e);
-    }
+    startOffset =
+        PslSparkUtils.getSparkStartOffset(cursorClient, subscriptionPath, topicPartitionCount);
   }
 
   @Override
@@ -123,13 +104,13 @@ public StructType readSchema() {
 
   @Override
   public List<InputPartition<InternalRow>> planInputPartitions() {
-    return startOffset.getPartitionOffsetMap().entrySet().stream()
+    return startOffset.getPartitionOffsetMap().values().stream()
         .map(
-            e ->
+            v ->
                 new PslContinuousInputPartition(
                     SparkPartitionOffset.builder()
-                        .partition(e.getKey())
-                        .offset(e.getValue().offset())
+                        .partition(v.partition())
+                        .offset(v.offset())
                         .build(),
                     subscriptionPath,
                     flowControlSettings))
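The start-offset construction that used to live inline here was extracted into PslSparkUtils.getSparkStartOffset. Judging from the removed block, that helper presumably looks roughly like the following sketch (an assumption based on the deleted code, not the actual implementation from this commit):

// Sketch: default every partition to offset 0, then overlay any committed cursors.
static SparkSourceOffset getSparkStartOffset(
    CursorClient cursorClient, SubscriptionPath subscriptionPath, long topicPartitionCount) {
  try {
    Map<Partition, com.google.cloud.pubsublite.Offset> pslSourceOffsetMap = new HashMap<>();
    for (int i = 0; i < topicPartitionCount; i++) {
      pslSourceOffsetMap.put(Partition.of(i), com.google.cloud.pubsublite.Offset.of(0));
    }
    // listPartitionCursors returns the committed cursor per partition; only
    // partitions with a committed cursor get overwritten.
    cursorClient
        .listPartitionCursors(subscriptionPath)
        .get()
        .forEach(pslSourceOffsetMap::replace);
    return PslSparkUtils.toSparkSourceOffset(
        PslSourceOffset.builder().partitionOffsetMap(pslSourceOffsetMap).build());
  } catch (InterruptedException | ExecutionException e) {
    throw new IllegalStateException(
        "Failed to get information from PSL and construct startOffset", e);
  }
}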

pubsublite-spark-sql-streaming/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java

Lines changed: 71 additions & 11 deletions
@@ -17,22 +17,27 @@
 package com.google.cloud.pubsublite.spark;
 
 import com.google.cloud.pubsublite.AdminClient;
+import com.google.cloud.pubsublite.Partition;
+import com.google.cloud.pubsublite.PartitionLookupUtils;
 import com.google.cloud.pubsublite.SubscriptionPath;
 import com.google.cloud.pubsublite.TopicPath;
+import com.google.cloud.pubsublite.internal.CheckedApiException;
 import com.google.cloud.pubsublite.internal.CursorClient;
 import com.google.cloud.pubsublite.internal.wire.CommitterBuilder;
-import com.google.cloud.pubsublite.proto.Subscription;
+import com.google.common.collect.ImmutableMap;
 import java.util.Objects;
 import java.util.Optional;
-import java.util.concurrent.ExecutionException;
 import org.apache.spark.sql.sources.DataSourceRegister;
 import org.apache.spark.sql.sources.v2.ContinuousReadSupport;
 import org.apache.spark.sql.sources.v2.DataSourceOptions;
 import org.apache.spark.sql.sources.v2.DataSourceV2;
+import org.apache.spark.sql.sources.v2.MicroBatchReadSupport;
 import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousReader;
+import org.apache.spark.sql.sources.v2.reader.streaming.MicroBatchReader;
 import org.apache.spark.sql.types.StructType;
 
-public class PslDataSource implements DataSourceV2, ContinuousReadSupport, DataSourceRegister {
+public class PslDataSource
+    implements DataSourceV2, ContinuousReadSupport, MicroBatchReadSupport, DataSourceRegister {
 
   @Override
   public String shortName() {
@@ -52,16 +57,45 @@ public ContinuousReader createContinuousReader(
     CursorClient cursorClient = pslDataSourceOptions.newCursorClient();
     AdminClient adminClient = pslDataSourceOptions.newAdminClient();
     SubscriptionPath subscriptionPath = pslDataSourceOptions.subscriptionPath();
-    long topicPartitionCount;
+    long topicPartitionCount = PartitionLookupUtils.numPartitions(subscriptionPath, adminClient);
+    MultiPartitionCommitter committer =
+        new MultiPartitionCommitterImpl(
+            topicPartitionCount,
+            (partition) ->
+                CommitterBuilder.newBuilder()
+                    .setSubscriptionPath(subscriptionPath)
+                    .setPartition(partition)
+                    .setServiceClient(pslDataSourceOptions.newCursorServiceClient())
+                    .build());
+    return new PslContinuousReader(
+        cursorClient,
+        committer,
+        subscriptionPath,
+        Objects.requireNonNull(pslDataSourceOptions.flowControlSettings()),
+        topicPartitionCount);
+  }
+
+  @Override
+  public MicroBatchReader createMicroBatchReader(
+      Optional<StructType> schema, String checkpointLocation, DataSourceOptions options) {
+    if (schema.isPresent()) {
+      throw new IllegalArgumentException(
+          "PubSub Lite uses fixed schema and custom schema is not allowed");
+    }
+
+    PslDataSourceOptions pslDataSourceOptions =
+        PslDataSourceOptions.fromSparkDataSourceOptions(options);
+    CursorClient cursorClient = pslDataSourceOptions.newCursorClient();
+    AdminClient adminClient = pslDataSourceOptions.newAdminClient();
+    SubscriptionPath subscriptionPath = pslDataSourceOptions.subscriptionPath();
+    TopicPath topicPath;
     try {
-      Subscription sub = adminClient.getSubscription(subscriptionPath).get();
-      topicPartitionCount =
-          adminClient.getTopicPartitionCount(TopicPath.parse(sub.getTopic())).get();
-    } catch (InterruptedException | ExecutionException e) {
+      topicPath = TopicPath.parse(adminClient.getSubscription(subscriptionPath).get().getTopic());
+    } catch (Throwable t) {
       throw new IllegalStateException(
-          "Failed to get information of subscription " + pslDataSourceOptions.subscriptionPath(),
-          e);
+          "Unable to get topic for subscription " + subscriptionPath, t);
     }
+    long topicPartitionCount = PartitionLookupUtils.numPartitions(topicPath, adminClient);
     MultiPartitionCommitter committer =
         new MultiPartitionCommitterImpl(
             topicPartitionCount,
@@ -71,11 +105,37 @@ public ContinuousReader createContinuousReader(
                     .setPartition(partition)
                     .setServiceClient(pslDataSourceOptions.newCursorServiceClient())
                     .build());
-    return new PslContinuousReader(
+
+    return new PslMicroBatchReader(
         cursorClient,
         committer,
         subscriptionPath,
+        PslSparkUtils.toSparkSourceOffset(getHeadOffset(topicPath)),
         Objects.requireNonNull(pslDataSourceOptions.flowControlSettings()),
         topicPartitionCount);
   }
+
+  private static PslSourceOffset getHeadOffset(TopicPath topicPath) {
+    // TODO(jiangmichael): Replace it with real implementation.
+    HeadOffsetReader headOffsetReader =
+        new HeadOffsetReader() {
+          @Override
+          public PslSourceOffset getHeadOffset(TopicPath topic) {
+            return PslSourceOffset.builder()
+                .partitionOffsetMap(
+                    ImmutableMap.of(
+                        Partition.of(0), com.google.cloud.pubsublite.Offset.of(50),
+                        Partition.of(1), com.google.cloud.pubsublite.Offset.of(50)))
+                .build();
+          }
+
+          @Override
+          public void close() {}
+        };
+    try {
+      return headOffsetReader.getHeadOffset(topicPath);
+    } catch (CheckedApiException e) {
+      throw new IllegalStateException("Unable to get head offset for topic " + topicPath, e);
+    }
+  }
 }
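With MicroBatchReadSupport implemented, a structured streaming query can now consume Pub/Sub Lite in micro-batch mode (Spark's default trigger). A hypothetical invocation; the format name and option key below are assumptions for illustration, not taken from this diff:

// Hypothetical: "pubsublite" is assumed to match the value returned by
// shortName(); the subscription option key is likewise an assumption.
Dataset<Row> df =
    spark
        .readStream()
        .format("pubsublite")
        .option(
            "pubsublite.subscription",
            "projects/my-project/locations/us-central1-a/subscriptions/my-subscription")
        .load();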
pubsublite-spark-sql-streaming/src/main/java/com/google/cloud/pubsublite/spark/PslMicroBatchInputPartition.java

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.cloud.pubsublite.spark;
+
+import com.google.cloud.pubsublite.SubscriptionPath;
+import com.google.cloud.pubsublite.cloudpubsub.FlowControlSettings;
+import com.google.cloud.pubsublite.internal.BlockingPullSubscriber;
+import com.google.cloud.pubsublite.internal.BlockingPullSubscriberImpl;
+import com.google.cloud.pubsublite.internal.CheckedApiException;
+import com.google.cloud.pubsublite.internal.wire.SubscriberFactory;
+import com.google.cloud.pubsublite.proto.Cursor;
+import com.google.cloud.pubsublite.proto.SeekRequest;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.sources.v2.reader.InputPartition;
+import org.apache.spark.sql.sources.v2.reader.InputPartitionReader;
+
+public class PslMicroBatchInputPartition implements InputPartition<InternalRow> {
+
+  private final SubscriberFactory subscriberFactory;
+  private final SparkPartitionOffset endOffset;
+  private final SubscriptionPath subscriptionPath;
+  private final FlowControlSettings flowControlSettings;
+
+  public PslMicroBatchInputPartition(
+      SubscriptionPath subscriptionPath,
+      FlowControlSettings flowControlSettings,
+      SparkPartitionOffset endOffset,
+      SubscriberFactory subscriberFactory) {
+    this.endOffset = endOffset;
+    this.subscriptionPath = subscriptionPath;
+    this.flowControlSettings = flowControlSettings;
+    this.subscriberFactory = subscriberFactory;
+  }
+
+  @Override
+  public InputPartitionReader<InternalRow> createPartitionReader() {
+    BlockingPullSubscriber subscriber;
+    try {
+      subscriber =
+          new BlockingPullSubscriberImpl(
+              subscriberFactory,
+              flowControlSettings,
+              SeekRequest.newBuilder()
+                  .setCursor(Cursor.newBuilder().setOffset(endOffset.offset()).build())
+                  .build());
+    } catch (CheckedApiException e) {
+      throw new IllegalStateException(
+          "Unable to create PSL subscriber for " + endOffset.partition(), e);
+    }
+    return new PslMicroBatchInputPartitionReader(subscriptionPath, endOffset, subscriber);
+  }
+}
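PslMicroBatchReader itself is among this commit's 11 files but is not shown in this capture; presumably it plans one such input partition per topic partition of the batch's end offset, along the lines of this hypothetical sketch (mirroring PslContinuousReader above; the per-partition factory helper is an assumption):

// Hypothetical sketch of the planning side; subscriberFactoryFor() is invented
// here to stand in for however the real reader builds a per-partition factory.
@Override
public List<InputPartition<InternalRow>> planInputPartitions() {
  return endOffset.getPartitionOffsetMap().values().stream()
      .map(
          v ->
              new PslMicroBatchInputPartition(
                  subscriptionPath,
                  flowControlSettings,
                  SparkPartitionOffset.builder()
                      .partition(v.partition())
                      .offset(v.offset())
                      .build(),
                  subscriberFactoryFor(v.partition()))) // hypothetical helper
      .collect(Collectors.toList());
}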
pubsublite-spark-sql-streaming/src/main/java/com/google/cloud/pubsublite/spark/PslMicroBatchInputPartitionReader.java

Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.cloud.pubsublite.spark;
+
+import com.google.cloud.pubsublite.SequencedMessage;
+import com.google.cloud.pubsublite.SubscriptionPath;
+import com.google.cloud.pubsublite.internal.BlockingPullSubscriber;
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.flogger.GoogleLogger;
+import java.time.Duration;
+import java.util.Optional;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import javax.annotation.Nullable;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.sources.v2.reader.InputPartitionReader;
+
+public class PslMicroBatchInputPartitionReader implements InputPartitionReader<InternalRow> {
+  private static final GoogleLogger log = GoogleLogger.forEnclosingClass();
+
+  private static final Duration SUBSCRIBER_PULL_TIMEOUT = Duration.ofSeconds(10);
+
+  private final SubscriptionPath subscriptionPath;
+  private final SparkPartitionOffset endOffset;
+  private final BlockingPullSubscriber subscriber;
+  @Nullable private SequencedMessage currentMsg = null;
+  private boolean batchFulfilled = false;
+
+  @VisibleForTesting
+  PslMicroBatchInputPartitionReader(
+      SubscriptionPath subscriptionPath,
+      SparkPartitionOffset endOffset,
+      BlockingPullSubscriber subscriber) {
+    this.subscriptionPath = subscriptionPath;
+    this.subscriber = subscriber;
+    this.endOffset = endOffset;
+  }
+
+  @Override
+  public boolean next() {
+    if (batchFulfilled) {
+      return false;
+    }
+    Optional<SequencedMessage> msg;
+    while (true) {
+      try {
+        subscriber.onData().get(SUBSCRIBER_PULL_TIMEOUT.getSeconds(), TimeUnit.SECONDS);
+        msg = subscriber.messageIfAvailable();
+        break;
+      } catch (TimeoutException e) {
+        log.atWarning().log("Unable to get any messages in last " + SUBSCRIBER_PULL_TIMEOUT);
+      } catch (Throwable t) {
+        throw new IllegalStateException("Failed to retrieve messages.", t);
+      }
+    }
+    // since next() is only called on one thread at a time, we are sure that the message is
+    // available to this thread.
+    assert msg.isPresent();
+    currentMsg = msg.get();
+    if (currentMsg.offset().value() == endOffset.offset()) {
+      // this is the last msg for the batch.
+      batchFulfilled = true;
+    } else if (currentMsg.offset().value() > endOffset.offset()) {
+      batchFulfilled = true;
+      return false;
+    }
+    return true;
+  }
+
+  @Override
+  public InternalRow get() {
+    assert currentMsg != null;
+    return PslSparkUtils.toInternalRow(currentMsg, subscriptionPath, endOffset.partition());
+  }
+
+  @Override
+  public void close() {
+    try {
+      subscriber.close();
+    } catch (Exception e) {
+      log.atWarning().log("Subscriber failed to close.");
+    }
+  }
+}
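For context, Spark drives an InputPartitionReader on the executor with a simple pull loop, which is the contract that next(), get(), and close() above implement; roughly:

// Executor-side contract (simplified): read rows until the batch is fulfilled.
InputPartitionReader<InternalRow> reader = partition.createPartitionReader();
try {
  while (reader.next()) { // blocks until a message arrives or the batch ends
    InternalRow row = reader.get(); // valid until the next call to next()
    // ... hand the row to the query engine
  }
} finally {
  reader.close(); // releases the underlying PSL subscriber
}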
