Skip to content

Commit dbfd6ca

Browse files
authored
Source MongoDB v2: Fixed nested document parsing (#7160)
* Fixed nested document parsing: added parsing of nested documents and arrays, added data types test
1 parent 27df558 commit dbfd6ca

File tree

6 files changed

+298
-62
lines changed

6 files changed

+298
-62
lines changed

airbyte-config/init/src/main/resources/config/STANDARD_SOURCE_DEFINITION/b2e713cd-cc36-4c0a-b5bd-b47cb8a0561e.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"sourceDefinitionId": "b2e713cd-cc36-4c0a-b5bd-b47cb8a0561e",
33
"name": "MongoDb",
44
"dockerRepository": "airbyte/source-mongodb-v2",
5-
"dockerImageTag": "0.1.2",
5+
"dockerImageTag": "0.1.3",
66
"documentationUrl": "https://docs.airbyte.io/integrations/sources/mongodb-v2",
77
"icon": "mongodb.svg"
88
}

airbyte-config/init/src/main/resources/seed/source_definitions.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -550,7 +550,7 @@
550550
- sourceDefinitionId: b2e713cd-cc36-4c0a-b5bd-b47cb8a0561e
551551
name: MongoDb
552552
dockerRepository: airbyte/source-mongodb-v2
553-
dockerImageTag: 0.1.2
553+
dockerImageTag: 0.1.3
554554
documentationUrl: https://docs.airbyte.io/integrations/sources/mongodb-v2
555555
icon: mongodb.svg
556556
sourceType: database

airbyte-db/lib/src/main/java/io/airbyte/db/mongodb/MongoUtils.java

Lines changed: 89 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,18 @@
44

55
package io.airbyte.db.mongodb;
66

7+
import static org.bson.BsonType.ARRAY;
8+
import static org.bson.BsonType.DOCUMENT;
9+
710
import com.fasterxml.jackson.databind.JsonNode;
811
import com.fasterxml.jackson.databind.node.ObjectNode;
912
import com.google.api.client.util.DateTime;
13+
import com.google.common.collect.ImmutableMap;
14+
import com.google.common.collect.Lists;
1015
import com.mongodb.client.MongoCollection;
1116
import com.mongodb.client.MongoCursor;
1217
import io.airbyte.commons.json.Jsons;
18+
import io.airbyte.commons.util.MoreIterators;
1319
import io.airbyte.db.DataTypeUtils;
1420
import io.airbyte.protocol.models.JsonSchemaPrimitive;
1521
import java.util.Collections;
@@ -28,7 +34,6 @@
2834
import org.bson.BsonTimestamp;
2935
import org.bson.BsonType;
3036
import org.bson.Document;
31-
import org.bson.conversions.Bson;
3237
import org.bson.types.Decimal128;
3338
import org.bson.types.ObjectId;
3439
import org.bson.types.Symbol;
@@ -46,16 +51,16 @@ public static JsonSchemaPrimitive getType(final BsonType dataType) {
4651
return switch (dataType) {
4752
case BOOLEAN -> JsonSchemaPrimitive.BOOLEAN;
4853
case INT32, INT64, DOUBLE, DECIMAL128 -> JsonSchemaPrimitive.NUMBER;
49-
case STRING, SYMBOL, BINARY, DATE_TIME, TIMESTAMP, OBJECT_ID, REGULAR_EXPRESSION, JAVASCRIPT, JAVASCRIPT_WITH_SCOPE -> JsonSchemaPrimitive.STRING;
54+
case STRING, SYMBOL, BINARY, DATE_TIME, TIMESTAMP, OBJECT_ID, REGULAR_EXPRESSION, JAVASCRIPT -> JsonSchemaPrimitive.STRING;
5055
case ARRAY -> JsonSchemaPrimitive.ARRAY;
51-
case DOCUMENT -> JsonSchemaPrimitive.OBJECT;
56+
case DOCUMENT, JAVASCRIPT_WITH_SCOPE -> JsonSchemaPrimitive.OBJECT;
5257
default -> JsonSchemaPrimitive.STRING;
5358
};
5459
}
5560

5661
public static JsonNode toJsonNode(final Document document, final List<String> columnNames) {
5762
final ObjectNode objectNode = (ObjectNode) Jsons.jsonNode(Collections.emptyMap());
58-
readBson(document, objectNode, columnNames);
63+
formatDocument(document, objectNode, columnNames);
5964
return objectNode;
6065
}
6166

@@ -74,50 +79,93 @@ public static Object getBsonValue(final BsonType type, final String value) {
7479
default -> value;
7580
};
7681
} catch (final Exception e) {
77-
LOGGER.error("Failed to get BsonValue for field type " + type, e.getMessage());
82+
LOGGER.error(String.format("Failed to get BsonValue for field type %s", type), e.getMessage());
7883
return value;
7984
}
8085
}
8186

82-
private static void readBson(final Document document, final ObjectNode o, final List<String> columnNames) {
87+
private static void formatDocument(final Document document, final ObjectNode objectNode, final List<String> columnNames) {
8388
final BsonDocument bsonDocument = toBsonDocument(document);
8489
try (final BsonReader reader = new BsonDocumentReader(bsonDocument)) {
85-
reader.readStartDocument();
86-
while (reader.readBsonType() != BsonType.END_OF_DOCUMENT) {
87-
final var fieldName = reader.readName();
88-
final var fieldType = reader.getCurrentBsonType();
89-
90-
switch (fieldType) {
91-
case BOOLEAN -> o.put(fieldName, reader.readBoolean());
92-
case INT32 -> o.put(fieldName, reader.readInt32());
93-
case INT64 -> o.put(fieldName, reader.readInt64());
94-
case DOUBLE -> o.put(fieldName, reader.readDouble());
95-
case DECIMAL128 -> o.put(fieldName, toDouble(reader.readDecimal128()));
96-
case TIMESTAMP -> o.put(fieldName, toString(reader.readTimestamp()));
97-
case DATE_TIME -> o.put(fieldName, DataTypeUtils.toISO8601String(reader.readDateTime()));
98-
case BINARY -> o.put(fieldName, toByteArray(reader.readBinaryData()));
99-
case SYMBOL -> o.put(fieldName, reader.readSymbol());
100-
case STRING -> o.put(fieldName, reader.readString());
101-
case OBJECT_ID -> o.put(fieldName, toString(reader.readObjectId()));
102-
case JAVASCRIPT -> o.put(fieldName, reader.readJavaScript());
103-
case JAVASCRIPT_WITH_SCOPE -> o.put(fieldName, reader.readJavaScriptWithScope());
104-
case REGULAR_EXPRESSION -> o.put(fieldName, toString(reader.readRegularExpression()));
105-
case DOCUMENT -> o.put(fieldName, documentToString(document.get(fieldName), reader));
106-
case ARRAY -> o.put(fieldName, arrayToString(document.get(fieldName), reader));
107-
default -> reader.skipValue();
108-
}
109-
110-
if (columnNames.contains(fieldName + AIRBYTE_SUFFIX)) {
111-
o.put(fieldName, o.get(fieldName).asText());
112-
}
113-
}
114-
reader.readEndDocument();
90+
readDocument(reader, objectNode, columnNames);
11591
} catch (final Exception e) {
11692
LOGGER.error("Exception while parsing BsonDocument: ", e.getMessage());
11793
throw new RuntimeException(e);
11894
}
11995
}
12096

97+
private static ObjectNode readDocument(final BsonReader reader, final ObjectNode jsonNodes, final List<String> columnNames) {
98+
reader.readStartDocument();
99+
while (reader.readBsonType() != BsonType.END_OF_DOCUMENT) {
100+
final var fieldName = reader.readName();
101+
final var fieldType = reader.getCurrentBsonType();
102+
if (DOCUMENT.equals(fieldType)) {
103+
// recursion in used to parse inner documents
104+
jsonNodes.set(fieldName, readDocument(reader, (ObjectNode) Jsons.jsonNode(Collections.emptyMap()), columnNames));
105+
} else if (ARRAY.equals(fieldType)) {
106+
jsonNodes.set(fieldName, readArray(reader, columnNames, fieldName));
107+
} else {
108+
readField(reader, jsonNodes, columnNames, fieldName, fieldType);
109+
}
110+
transformToStringIfMarked(jsonNodes, columnNames, fieldName);
111+
}
112+
reader.readEndDocument();
113+
114+
return jsonNodes;
115+
}
116+
117+
private static void transformToStringIfMarked(final ObjectNode jsonNodes, final List<String> columnNames, final String fieldName) {
118+
if (columnNames.contains(fieldName + AIRBYTE_SUFFIX)) {
119+
jsonNodes.put(fieldName, jsonNodes.get(fieldName).asText());
120+
}
121+
}
122+
123+
private static JsonNode readArray(final BsonReader reader, final List<String> columnNames, final String fieldName) {
124+
reader.readStartArray();
125+
final var elements = Lists.newArrayList();
126+
127+
while (reader.readBsonType() != BsonType.END_OF_DOCUMENT) {
128+
final var arrayFieldType = reader.getCurrentBsonType();
129+
if (DOCUMENT.equals(arrayFieldType)) {
130+
// recursion is used to read inner doc
131+
elements.add(readDocument(reader, (ObjectNode) Jsons.jsonNode(Collections.emptyMap()), columnNames));
132+
} else if (ARRAY.equals(arrayFieldType)) {
133+
// recursion is used to read inner array
134+
elements.add(readArray(reader, columnNames, fieldName));
135+
} else {
136+
final var element = readField(reader, (ObjectNode) Jsons.jsonNode(Collections.emptyMap()), columnNames, fieldName, arrayFieldType);
137+
elements.add(element.get(fieldName));
138+
}
139+
}
140+
reader.readEndArray();
141+
return Jsons.jsonNode(MoreIterators.toList(elements.iterator()));
142+
}
143+
144+
private static ObjectNode readField(final BsonReader reader,
145+
final ObjectNode o,
146+
final List<String> columnNames,
147+
final String fieldName,
148+
final BsonType fieldType) {
149+
switch (fieldType) {
150+
case BOOLEAN -> o.put(fieldName, reader.readBoolean());
151+
case INT32 -> o.put(fieldName, reader.readInt32());
152+
case INT64 -> o.put(fieldName, reader.readInt64());
153+
case DOUBLE -> o.put(fieldName, reader.readDouble());
154+
case DECIMAL128 -> o.put(fieldName, toDouble(reader.readDecimal128()));
155+
case TIMESTAMP -> o.put(fieldName, DataTypeUtils.toISO8601String(reader.readTimestamp().getValue()));
156+
case DATE_TIME -> o.put(fieldName, DataTypeUtils.toISO8601String(reader.readDateTime()));
157+
case BINARY -> o.put(fieldName, toByteArray(reader.readBinaryData()));
158+
case SYMBOL -> o.put(fieldName, reader.readSymbol());
159+
case STRING -> o.put(fieldName, reader.readString());
160+
case OBJECT_ID -> o.put(fieldName, toString(reader.readObjectId()));
161+
case JAVASCRIPT -> o.put(fieldName, reader.readJavaScript());
162+
case JAVASCRIPT_WITH_SCOPE -> readJavaScriptWithScope(o, reader, fieldName, columnNames);
163+
case REGULAR_EXPRESSION -> toString(reader.readRegularExpression());
164+
default -> reader.skipValue();
165+
}
166+
return o;
167+
}
168+
121169
/**
122170
* Gets 10,000 documents from the collection and gathers all unique fields and their types. When one
123171
* field has different types in two or more documents, the type is set to String.
@@ -151,7 +199,7 @@ public static Map<String, BsonType> getUniqueFields(final MongoCollection<Docume
151199

152200
private static BsonDocument toBsonDocument(final Document document) {
153201
try {
154-
return document.toBsonDocument(BsonDocument.class, Bson.DEFAULT_CODEC_REGISTRY);
202+
return document.toBsonDocument();
155203
} catch (final Exception e) {
156204
LOGGER.error("Exception while converting Document to BsonDocument: ", e.getMessage());
157205
throw new RuntimeException(e);
@@ -170,27 +218,10 @@ private static byte[] toByteArray(final BsonBinary value) {
170218
return value == null ? null : value.getData();
171219
}
172220

173-
// temporary method for MVP
174-
private static String documentToString(final Object obj, final BsonReader reader) {
175-
try {
176-
reader.skipValue();
177-
final Document document = (Document) obj;
178-
return document.toJson();
179-
} catch (final Exception e) {
180-
LOGGER.error("Failed to convert document to a String: ", e.getMessage());
181-
return null;
182-
}
183-
}
184-
185-
// temporary method for MVP
186-
private static String arrayToString(final Object obj, final BsonReader reader) {
187-
try {
188-
reader.skipValue();
189-
return obj.toString();
190-
} catch (final Exception e) {
191-
LOGGER.error("Failed to convert array to a String: ", e.getMessage());
192-
return null;
193-
}
221+
private static void readJavaScriptWithScope(ObjectNode o, BsonReader reader, String fieldName, List<String> columnNames) {
222+
var code = reader.readJavaScriptWithScope();
223+
var scope = readDocument(reader, (ObjectNode) Jsons.jsonNode(Collections.emptyMap()), columnNames);
224+
o.set(fieldName, Jsons.jsonNode(ImmutableMap.of("code", code, "scope", scope)));
194225
}
195226

196227
public enum MongoInstanceType {

airbyte-integrations/connectors/source-mongodb-v2/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,5 @@ COPY build/distributions/${APPLICATION}*.tar ${APPLICATION}.tar
88

99
RUN tar xf ${APPLICATION}.tar --strip-components=1
1010

11-
LABEL io.airbyte.version=0.1.2
11+
LABEL io.airbyte.version=0.1.3
1212
LABEL io.airbyte.name=airbyte/source-mongodb-v2

0 commit comments

Comments
 (0)