Skip to content

Commit 39308c4

Browse files
committed
PYTHON-721 - Add unicode_decode_error_handler to CodecOptions.
1 parent 70be0d8 commit 39308c4

File tree

5 files changed

+147
-25
lines changed

5 files changed

+147
-25
lines changed

bson/__init__.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -98,10 +98,11 @@ def _get_int(data, position, dummy0, dummy1):
9898
return _UNPACK_INT(data[position:end])[0], end
9999

100100

101-
def _get_c_string(data, position):
101+
def _get_c_string(data, position, opts):
102102
"""Decode a BSON 'C' string to python unicode string."""
103103
end = data.index(b"\x00", position)
104-
return _utf_8_decode(data[position:end], None, True)[0], end + 1
104+
return _utf_8_decode(data[position:end],
105+
opts.unicode_decode_error_handler, True)[0], end + 1
105106

106107

107108
def _get_float(data, position, dummy0, dummy1):
@@ -110,7 +111,7 @@ def _get_float(data, position, dummy0, dummy1):
110111
return _UNPACK_FLOAT(data[position:end])[0], end
111112

112113

113-
def _get_string(data, position, obj_end, dummy):
114+
def _get_string(data, position, obj_end, opts):
114115
"""Decode a BSON string to python unicode string."""
115116
length = _UNPACK_INT(data[position:position + 4])[0]
116117
position += 4
@@ -119,7 +120,8 @@ def _get_string(data, position, obj_end, dummy):
119120
end = position + length - 1
120121
if data[end:end + 1] != b"\x00":
121122
raise InvalidBSON("invalid end of string")
122-
return _utf_8_decode(data[position:end], None, True)[0], end + 1
123+
return _utf_8_decode(data[position:end],
124+
opts.unicode_decode_error_handler, True)[0], end + 1
123125

124126

125127
def _get_object(data, position, obj_end, opts):
@@ -235,10 +237,10 @@ def _get_code_w_scope(data, position, obj_end, opts):
235237
return Code(code, scope), position
236238

237239

238-
def _get_regex(data, position, dummy0, dummy1):
240+
def _get_regex(data, position, dummy0, opts):
239241
"""Decode a BSON regex to bson.regex.Regex or a python pattern object."""
240-
pattern, position = _get_c_string(data, position)
241-
bson_flags, position = _get_c_string(data, position)
242+
pattern, position = _get_c_string(data, position, opts)
243+
bson_flags, position = _get_c_string(data, position, opts)
242244
bson_re = Regex(pattern, bson_flags)
243245
return bson_re, position
244246

@@ -295,7 +297,7 @@ def _element_to_dict(data, position, obj_end, opts):
295297
"""Decode a single key, value pair."""
296298
element_type = data[position:position + 1]
297299
position += 1
298-
element_name, position = _get_c_string(data, position)
300+
element_name, position = _get_c_string(data, position, opts)
299301
value, position = _ELEMENT_GETTER[element_type](data,
300302
position, obj_end, opts)
301303
return element_name, value, position

bson/_cbsonmodule.c

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -118,10 +118,12 @@ _downcast_and_check(Py_ssize_t size, int extra) {
118118
*/
119119
int convert_codec_options(PyObject* options_obj, void* p) {
120120
codec_options_t* options = (codec_options_t*)p;
121-
if (!PyArg_ParseTuple(options_obj, "Obb",
121+
options->unicode_decode_error_handler = NULL;
122+
if (!PyArg_ParseTuple(options_obj, "Obbz",
122123
&options->document_class,
123124
&options->tz_aware,
124-
&options->uuid_rep)) {
125+
&options->uuid_rep,
126+
&options->unicode_decode_error_handler)) {
125127
return 0;
126128
}
127129

@@ -137,6 +139,7 @@ void default_codec_options(codec_options_t* options) {
137139
// TODO: set to "1". PYTHON-526, setting tz_aware=True by default.
138140
options->tz_aware = 0;
139141
options->uuid_rep = PYTHON_LEGACY;
142+
options->unicode_decode_error_handler = NULL;
140143
}
141144

142145
void destroy_codec_options(codec_options_t* options) {
@@ -1560,7 +1563,9 @@ static PyObject* get_value(PyObject* self, const char* buffer,
15601563
if (buffer[*position + value_length - 1]) {
15611564
goto invalid;
15621565
}
1563-
value = PyUnicode_DecodeUTF8(buffer + *position, value_length - 1, "strict");
1566+
value = PyUnicode_DecodeUTF8(
1567+
buffer + *position, value_length - 1,
1568+
options->unicode_decode_error_handler);
15641569
if (!value) {
15651570
goto invalid;
15661571
}
@@ -1916,7 +1921,9 @@ static PyObject* get_value(PyObject* self, const char* buffer,
19161921
if (pattern_length > BSON_MAX_SIZE || max < pattern_length) {
19171922
goto invalid;
19181923
}
1919-
pattern = PyUnicode_DecodeUTF8(buffer + *position, pattern_length, "strict");
1924+
pattern = PyUnicode_DecodeUTF8(
1925+
buffer + *position, pattern_length,
1926+
options->unicode_decode_error_handler);
19201927
if (!pattern) {
19211928
goto invalid;
19221929
}
@@ -1980,8 +1987,9 @@ static PyObject* get_value(PyObject* self, const char* buffer,
19801987
goto invalid;
19811988
}
19821989

1983-
collection = PyUnicode_DecodeUTF8(buffer + *position,
1984-
coll_length - 1, "strict");
1990+
collection = PyUnicode_DecodeUTF8(
1991+
buffer + *position, coll_length - 1,
1992+
options->unicode_decode_error_handler);
19851993
if (!collection) {
19861994
goto invalid;
19871995
}
@@ -2026,7 +2034,9 @@ static PyObject* get_value(PyObject* self, const char* buffer,
20262034
if (buffer[*position + value_length - 1]) {
20272035
goto invalid;
20282036
}
2029-
code = PyUnicode_DecodeUTF8(buffer + *position, value_length - 1, "strict");
2037+
code = PyUnicode_DecodeUTF8(
2038+
buffer + *position, value_length - 1,
2039+
options->unicode_decode_error_handler);
20302040
if (!code) {
20312041
goto invalid;
20322042
}
@@ -2068,7 +2078,9 @@ static PyObject* get_value(PyObject* self, const char* buffer,
20682078
if (buffer[*position + code_size - 1]) {
20692079
goto invalid;
20702080
}
2071-
code = PyUnicode_DecodeUTF8(buffer + *position, code_size - 1, "strict");
2081+
code = PyUnicode_DecodeUTF8(
2082+
buffer + *position, code_size - 1,
2083+
options->unicode_decode_error_handler);
20722084
if (!code) {
20732085
goto invalid;
20742086
}
@@ -2261,8 +2273,29 @@ static PyObject* _elements_to_dict(PyObject* self, const char* string,
22612273
Py_DECREF(dict);
22622274
return NULL;
22632275
}
2264-
name = PyUnicode_DecodeUTF8(string + position, name_length, "strict");
2276+
name = PyUnicode_DecodeUTF8(
2277+
string + position, name_length,
2278+
options->unicode_decode_error_handler);
22652279
if (!name) {
2280+
/* If NULL is returned then wrap the UnicodeDecodeError
2281+
in an InvalidBSON error */
2282+
PyObject *etype, *evalue, *etrace;
2283+
PyObject *InvalidBSON;
2284+
2285+
PyErr_Fetch(&etype, &evalue, &etrace);
2286+
InvalidBSON = _error("InvalidBSON");
2287+
if (InvalidBSON) {
2288+
Py_DECREF(etype);
2289+
etype = InvalidBSON;
2290+
2291+
if (evalue) {
2292+
PyObject *msg = PyObject_Str(evalue);
2293+
Py_DECREF(evalue);
2294+
evalue = msg;
2295+
}
2296+
PyErr_NormalizeException(&etype, &evalue, &etrace);
2297+
}
2298+
PyErr_Restore(etype, evalue, etrace);
22662299
Py_DECREF(dict);
22672300
return NULL;
22682301
}

bson/_cbsonmodule.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ typedef struct codec_options_t {
5656
PyObject* document_class;
5757
unsigned char tz_aware;
5858
unsigned char uuid_rep;
59+
char* unicode_decode_error_handler;
5960
} codec_options_t;
6061

6162
/* C API functions */

bson/codec_options.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,15 @@
1616

1717
from collections import MutableMapping, namedtuple
1818

19+
from bson.py3compat import string_type
1920
from bson.binary import (ALL_UUID_REPRESENTATIONS,
2021
PYTHON_LEGACY,
2122
UUID_REPRESENTATION_NAMES)
2223

2324

2425
_options_base = namedtuple(
25-
'CodecOptions', ('document_class', 'tz_aware', 'uuid_representation'))
26+
'CodecOptions', ('document_class', 'tz_aware', 'uuid_representation',
27+
'unicode_decode_error_handler'))
2628

2729

2830
class CodecOptions(_options_base):
@@ -38,10 +40,21 @@ class CodecOptions(_options_base):
3840
- `uuid_representation`: The BSON representation to use when encoding
3941
and decoding instances of :class:`~uuid.UUID`. Defaults to
4042
:data:`~bson.binary.PYTHON_LEGACY`.
43+
- `unicode_decode_error_handler`: The error handler to use when decoding
44+
an invalid BSON string. Valid options include 'strict', 'replace', and
45+
'ignore'. Defaults to 'strict'.
46+
47+
.. warning:: Care must be taken when changing
48+
`unicode_decode_error_handler` from its default value ('strict').
49+
The 'replace' and 'ignore' modes should not be used when documents
50+
retrieved from the server will be modified in the client application
51+
and stored back to the server.
52+
4153
"""
4254

4355
def __new__(cls, document_class=dict,
44-
tz_aware=False, uuid_representation=PYTHON_LEGACY):
56+
tz_aware=False, uuid_representation=PYTHON_LEGACY,
57+
unicode_decode_error_handler="strict"):
4558
if not issubclass(document_class, MutableMapping):
4659
raise TypeError("document_class must be dict, bson.son.SON, or "
4760
"another subclass of collections.MutableMapping")
@@ -50,9 +63,12 @@ def __new__(cls, document_class=dict,
5063
if uuid_representation not in ALL_UUID_REPRESENTATIONS:
5164
raise ValueError("uuid_representation must be a value "
5265
"from bson.binary.ALL_UUID_REPRESENTATIONS")
53-
66+
if not isinstance(unicode_decode_error_handler, (string_type, None)):
67+
raise ValueError("unicode_decode_error_handler must be a string "
68+
"or None")
5469
return tuple.__new__(
55-
cls, (document_class, tz_aware, uuid_representation))
70+
cls, (document_class, tz_aware, uuid_representation,
71+
unicode_decode_error_handler))
5672

5773
def __repr__(self):
5874
document_class_repr = (
@@ -64,7 +80,9 @@ def __repr__(self):
6480

6581
return (
6682
'CodecOptions(document_class=%s, tz_aware=%r, uuid_representation='
67-
'%s)' % (document_class_repr, self.tz_aware, uuid_rep_repr))
83+
'%s, unicode_decode_error_handler=%r)' %
84+
(document_class_repr, self.tz_aware, uuid_rep_repr,
85+
self.unicode_decode_error_handler))
6886

6987

7088
DEFAULT_CODEC_OPTIONS = CodecOptions()
@@ -78,4 +96,6 @@ def _parse_codec_options(options):
7896
tz_aware=options.get(
7997
'tz_aware', DEFAULT_CODEC_OPTIONS.tz_aware),
8098
uuid_representation=options.get(
81-
'uuidrepresentation', DEFAULT_CODEC_OPTIONS.uuid_representation))
99+
'uuidrepresentation', DEFAULT_CODEC_OPTIONS.uuid_representation),
100+
unicode_decode_error_handler=options.get(
101+
'unicode_decode_error_handler', "strict"))

test/test_bson.py

Lines changed: 68 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -785,8 +785,9 @@ def test_uuid_representation(self):
785785
self.assertRaises(ValueError, CodecOptions, uuid_representation=2)
786786

787787
def test_codec_options_repr(self):
788-
r = ('CodecOptions(document_class=dict, tz_aware=False, '
789-
'uuid_representation=PYTHON_LEGACY)')
788+
r = ("CodecOptions(document_class=dict, tz_aware=False, "
789+
"uuid_representation=PYTHON_LEGACY, "
790+
"unicode_decode_error_handler='strict')")
790791
self.assertEqual(r, repr(CodecOptions()))
791792

792793
def test_decode_all_defaults(self):
@@ -803,6 +804,71 @@ def test_decode_all_defaults(self):
803804
self.assertEqual(decoded['uuid'], doc['uuid'])
804805
self.assertIsNone(decoded['dt'].tzinfo)
805806

807+
def test_unicode_decode_error_handler(self):
808+
enc = BSON.encode({"keystr": "foobar"})
809+
810+
# Test handling of bad key value.
811+
invalid_key = BSON(enc[:7] + b'\xe9' + enc[8:])
812+
replaced_key = b'ke\xef\xbf\xbdstr'.decode('utf-8')
813+
814+
dec = BSON.decode(invalid_key, CodecOptions(
815+
unicode_decode_error_handler="replace"))
816+
self.assertEqual(dec, {replaced_key: u("foobar")})
817+
818+
dec = BSON.decode(invalid_key, CodecOptions(
819+
unicode_decode_error_handler="ignore"))
820+
self.assertEqual(dec, {"kestr": "foobar"})
821+
822+
self.assertRaises(InvalidBSON, BSON.decode, invalid_key, CodecOptions(
823+
unicode_decode_error_handler="strict"))
824+
self.assertRaises(InvalidBSON, BSON.decode, invalid_key,
825+
CodecOptions())
826+
self.assertRaises(InvalidBSON, BSON.decode, invalid_key)
827+
828+
# Test handling of bad string value.
829+
invalid_val = BSON(enc[:18] + b'\xe9' + enc[19:])
830+
replaced_val = b'fo\xef\xbf\xbdbar'.decode('utf-8')
831+
832+
dec = BSON.decode(invalid_val, CodecOptions(
833+
unicode_decode_error_handler="replace"))
834+
self.assertEqual(dec, {"keystr": replaced_val})
835+
836+
dec = BSON.decode(invalid_val, CodecOptions(
837+
unicode_decode_error_handler="ignore"))
838+
self.assertEqual(dec, {"keystr": "fobar"})
839+
840+
self.assertRaises(InvalidBSON, BSON.decode, invalid_val, CodecOptions(
841+
unicode_decode_error_handler="strict"))
842+
self.assertRaises(InvalidBSON, BSON.decode, invalid_val,
843+
CodecOptions())
844+
self.assertRaises(InvalidBSON, BSON.decode, invalid_val)
845+
846+
# Test handling bad key + bad value.
847+
invalid_both = BSON(
848+
enc[:7] + b'\xe9' + enc[8:18] + b'\xe9' + enc[19:])
849+
850+
dec = BSON.decode(invalid_both, CodecOptions(
851+
unicode_decode_error_handler="replace"))
852+
self.assertEqual(dec, {replaced_key: replaced_val})
853+
854+
dec = BSON.decode(invalid_both, CodecOptions(
855+
unicode_decode_error_handler="ignore"))
856+
self.assertEqual(dec, {"kestr": "fobar"})
857+
858+
self.assertRaises(InvalidBSON, BSON.decode, invalid_both, CodecOptions(
859+
unicode_decode_error_handler="strict"))
860+
self.assertRaises(InvalidBSON, BSON.decode, invalid_both,
861+
CodecOptions())
862+
self.assertRaises(InvalidBSON, BSON.decode, invalid_both)
863+
864+
# Test handling bad error mode.
865+
dec = BSON.decode(enc, CodecOptions(
866+
unicode_decode_error_handler="junk"))
867+
self.assertEqual(dec, {"keystr": "foobar"})
868+
869+
self.assertRaises(InvalidBSON, BSON.decode, invalid_both,
870+
CodecOptions(unicode_decode_error_handler="junk"))
871+
806872

807873
if __name__ == "__main__":
808874
unittest.main()

0 commit comments

Comments
 (0)