44
55package io .airbyte .db .mongodb ;
66
7+ import static org .bson .BsonType .ARRAY ;
8+ import static org .bson .BsonType .DOCUMENT ;
9+
710import com .fasterxml .jackson .databind .JsonNode ;
811import com .fasterxml .jackson .databind .node .ObjectNode ;
912import com .google .api .client .util .DateTime ;
13+ import com .google .common .collect .ImmutableMap ;
14+ import com .google .common .collect .Lists ;
1015import com .mongodb .client .MongoCollection ;
1116import com .mongodb .client .MongoCursor ;
1217import io .airbyte .commons .json .Jsons ;
18+ import io .airbyte .commons .util .MoreIterators ;
1319import io .airbyte .db .DataTypeUtils ;
1420import io .airbyte .protocol .models .JsonSchemaPrimitive ;
1521import java .util .Collections ;
2834import org .bson .BsonTimestamp ;
2935import org .bson .BsonType ;
3036import org .bson .Document ;
31- import org .bson .conversions .Bson ;
3237import org .bson .types .Decimal128 ;
3338import org .bson .types .ObjectId ;
3439import org .bson .types .Symbol ;
@@ -46,16 +51,16 @@ public static JsonSchemaPrimitive getType(final BsonType dataType) {
4651 return switch (dataType ) {
4752 case BOOLEAN -> JsonSchemaPrimitive .BOOLEAN ;
4853 case INT32 , INT64 , DOUBLE , DECIMAL128 -> JsonSchemaPrimitive .NUMBER ;
49- case STRING , SYMBOL , BINARY , DATE_TIME , TIMESTAMP , OBJECT_ID , REGULAR_EXPRESSION , JAVASCRIPT , JAVASCRIPT_WITH_SCOPE -> JsonSchemaPrimitive .STRING ;
54+ case STRING , SYMBOL , BINARY , DATE_TIME , TIMESTAMP , OBJECT_ID , REGULAR_EXPRESSION , JAVASCRIPT -> JsonSchemaPrimitive .STRING ;
5055 case ARRAY -> JsonSchemaPrimitive .ARRAY ;
51- case DOCUMENT -> JsonSchemaPrimitive .OBJECT ;
56+ case DOCUMENT , JAVASCRIPT_WITH_SCOPE -> JsonSchemaPrimitive .OBJECT ;
5257 default -> JsonSchemaPrimitive .STRING ;
5358 };
5459 }
5560
5661 public static JsonNode toJsonNode (final Document document , final List <String > columnNames ) {
5762 final ObjectNode objectNode = (ObjectNode ) Jsons .jsonNode (Collections .emptyMap ());
58- readBson (document , objectNode , columnNames );
63+ formatDocument (document , objectNode , columnNames );
5964 return objectNode ;
6065 }
6166
@@ -74,50 +79,93 @@ public static Object getBsonValue(final BsonType type, final String value) {
7479 default -> value ;
7580 };
7681 } catch (final Exception e ) {
77- LOGGER .error ("Failed to get BsonValue for field type " + type , e .getMessage ());
82+ LOGGER .error (String . format ( "Failed to get BsonValue for field type %s" , type ) , e .getMessage ());
7883 return value ;
7984 }
8085 }
8186
82- private static void readBson (final Document document , final ObjectNode o , final List <String > columnNames ) {
87+ private static void formatDocument (final Document document , final ObjectNode objectNode , final List <String > columnNames ) {
8388 final BsonDocument bsonDocument = toBsonDocument (document );
8489 try (final BsonReader reader = new BsonDocumentReader (bsonDocument )) {
85- reader .readStartDocument ();
86- while (reader .readBsonType () != BsonType .END_OF_DOCUMENT ) {
87- final var fieldName = reader .readName ();
88- final var fieldType = reader .getCurrentBsonType ();
89-
90- switch (fieldType ) {
91- case BOOLEAN -> o .put (fieldName , reader .readBoolean ());
92- case INT32 -> o .put (fieldName , reader .readInt32 ());
93- case INT64 -> o .put (fieldName , reader .readInt64 ());
94- case DOUBLE -> o .put (fieldName , reader .readDouble ());
95- case DECIMAL128 -> o .put (fieldName , toDouble (reader .readDecimal128 ()));
96- case TIMESTAMP -> o .put (fieldName , toString (reader .readTimestamp ()));
97- case DATE_TIME -> o .put (fieldName , DataTypeUtils .toISO8601String (reader .readDateTime ()));
98- case BINARY -> o .put (fieldName , toByteArray (reader .readBinaryData ()));
99- case SYMBOL -> o .put (fieldName , reader .readSymbol ());
100- case STRING -> o .put (fieldName , reader .readString ());
101- case OBJECT_ID -> o .put (fieldName , toString (reader .readObjectId ()));
102- case JAVASCRIPT -> o .put (fieldName , reader .readJavaScript ());
103- case JAVASCRIPT_WITH_SCOPE -> o .put (fieldName , reader .readJavaScriptWithScope ());
104- case REGULAR_EXPRESSION -> o .put (fieldName , toString (reader .readRegularExpression ()));
105- case DOCUMENT -> o .put (fieldName , documentToString (document .get (fieldName ), reader ));
106- case ARRAY -> o .put (fieldName , arrayToString (document .get (fieldName ), reader ));
107- default -> reader .skipValue ();
108- }
109-
110- if (columnNames .contains (fieldName + AIRBYTE_SUFFIX )) {
111- o .put (fieldName , o .get (fieldName ).asText ());
112- }
113- }
114- reader .readEndDocument ();
90+ readDocument (reader , objectNode , columnNames );
11591 } catch (final Exception e ) {
11692 LOGGER .error ("Exception while parsing BsonDocument: " , e .getMessage ());
11793 throw new RuntimeException (e );
11894 }
11995 }
12096
97+ private static ObjectNode readDocument (final BsonReader reader , final ObjectNode jsonNodes , final List <String > columnNames ) {
98+ reader .readStartDocument ();
99+ while (reader .readBsonType () != BsonType .END_OF_DOCUMENT ) {
100+ final var fieldName = reader .readName ();
101+ final var fieldType = reader .getCurrentBsonType ();
102+ if (DOCUMENT .equals (fieldType )) {
103+ // recursion in used to parse inner documents
104+ jsonNodes .set (fieldName , readDocument (reader , (ObjectNode ) Jsons .jsonNode (Collections .emptyMap ()), columnNames ));
105+ } else if (ARRAY .equals (fieldType )) {
106+ jsonNodes .set (fieldName , readArray (reader , columnNames , fieldName ));
107+ } else {
108+ readField (reader , jsonNodes , columnNames , fieldName , fieldType );
109+ }
110+ transformToStringIfMarked (jsonNodes , columnNames , fieldName );
111+ }
112+ reader .readEndDocument ();
113+
114+ return jsonNodes ;
115+ }
116+
117+ private static void transformToStringIfMarked (final ObjectNode jsonNodes , final List <String > columnNames , final String fieldName ) {
118+ if (columnNames .contains (fieldName + AIRBYTE_SUFFIX )) {
119+ jsonNodes .put (fieldName , jsonNodes .get (fieldName ).asText ());
120+ }
121+ }
122+
123+ private static JsonNode readArray (final BsonReader reader , final List <String > columnNames , final String fieldName ) {
124+ reader .readStartArray ();
125+ final var elements = Lists .newArrayList ();
126+
127+ while (reader .readBsonType () != BsonType .END_OF_DOCUMENT ) {
128+ final var arrayFieldType = reader .getCurrentBsonType ();
129+ if (DOCUMENT .equals (arrayFieldType )) {
130+ // recursion is used to read inner doc
131+ elements .add (readDocument (reader , (ObjectNode ) Jsons .jsonNode (Collections .emptyMap ()), columnNames ));
132+ } else if (ARRAY .equals (arrayFieldType )) {
133+ // recursion is used to read inner array
134+ elements .add (readArray (reader , columnNames , fieldName ));
135+ } else {
136+ final var element = readField (reader , (ObjectNode ) Jsons .jsonNode (Collections .emptyMap ()), columnNames , fieldName , arrayFieldType );
137+ elements .add (element .get (fieldName ));
138+ }
139+ }
140+ reader .readEndArray ();
141+ return Jsons .jsonNode (MoreIterators .toList (elements .iterator ()));
142+ }
143+
144+ private static ObjectNode readField (final BsonReader reader ,
145+ final ObjectNode o ,
146+ final List <String > columnNames ,
147+ final String fieldName ,
148+ final BsonType fieldType ) {
149+ switch (fieldType ) {
150+ case BOOLEAN -> o .put (fieldName , reader .readBoolean ());
151+ case INT32 -> o .put (fieldName , reader .readInt32 ());
152+ case INT64 -> o .put (fieldName , reader .readInt64 ());
153+ case DOUBLE -> o .put (fieldName , reader .readDouble ());
154+ case DECIMAL128 -> o .put (fieldName , toDouble (reader .readDecimal128 ()));
155+ case TIMESTAMP -> o .put (fieldName , DataTypeUtils .toISO8601String (reader .readTimestamp ().getValue ()));
156+ case DATE_TIME -> o .put (fieldName , DataTypeUtils .toISO8601String (reader .readDateTime ()));
157+ case BINARY -> o .put (fieldName , toByteArray (reader .readBinaryData ()));
158+ case SYMBOL -> o .put (fieldName , reader .readSymbol ());
159+ case STRING -> o .put (fieldName , reader .readString ());
160+ case OBJECT_ID -> o .put (fieldName , toString (reader .readObjectId ()));
161+ case JAVASCRIPT -> o .put (fieldName , reader .readJavaScript ());
162+ case JAVASCRIPT_WITH_SCOPE -> readJavaScriptWithScope (o , reader , fieldName , columnNames );
163+ case REGULAR_EXPRESSION -> toString (reader .readRegularExpression ());
164+ default -> reader .skipValue ();
165+ }
166+ return o ;
167+ }
168+
121169 /**
122170 * Gets 10.000 documents from collection, gathers all unique fields and its type. In case when one
123171 * field has different types in 2 and more documents, the type is set to String.
@@ -151,7 +199,7 @@ public static Map<String, BsonType> getUniqueFields(final MongoCollection<Docume
151199
152200 private static BsonDocument toBsonDocument (final Document document ) {
153201 try {
154- return document .toBsonDocument (BsonDocument . class , Bson . DEFAULT_CODEC_REGISTRY );
202+ return document .toBsonDocument ();
155203 } catch (final Exception e ) {
156204 LOGGER .error ("Exception while converting Document to BsonDocument: " , e .getMessage ());
157205 throw new RuntimeException (e );
@@ -170,27 +218,10 @@ private static byte[] toByteArray(final BsonBinary value) {
170218 return value == null ? null : value .getData ();
171219 }
172220
173- // temporary method for MVP
174- private static String documentToString (final Object obj , final BsonReader reader ) {
175- try {
176- reader .skipValue ();
177- final Document document = (Document ) obj ;
178- return document .toJson ();
179- } catch (final Exception e ) {
180- LOGGER .error ("Failed to convert document to a String: " , e .getMessage ());
181- return null ;
182- }
183- }
184-
185- // temporary method for MVP
186- private static String arrayToString (final Object obj , final BsonReader reader ) {
187- try {
188- reader .skipValue ();
189- return obj .toString ();
190- } catch (final Exception e ) {
191- LOGGER .error ("Failed to convert array to a String: " , e .getMessage ());
192- return null ;
193- }
221+ private static void readJavaScriptWithScope (ObjectNode o , BsonReader reader , String fieldName , List <String > columnNames ) {
222+ var code = reader .readJavaScriptWithScope ();
223+ var scope = readDocument (reader , (ObjectNode ) Jsons .jsonNode (Collections .emptyMap ()), columnNames );
224+ o .set (fieldName , Jsons .jsonNode (ImmutableMap .of ("code" , code , "scope" , scope )));
194225 }
195226
196227 public enum MongoInstanceType {
0 commit comments