Skip to content

Commit e113a33

Browse files
stephan-hof authored and
behackett committed
Improve the check for legal utf8 in the bson module.
Now python and bson have the same understanding of legal utf8.
1 parent cb34e31 commit e113a33

File tree

2 files changed

+84
-1
lines changed

2 files changed

+84
-1
lines changed

bson/encoding_helpers.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ static unsigned char isLegalUTF8(const unsigned char* source, int length) {
7878
/* no fall-through in this inner switch */
7979
case 0xE0: if (a < 0xA0) return 0; break;
8080
case 0xF0: if (a < 0x90) return 0; break;
81-
case 0xF4: if (a > 0x8F) return 0; break;
81+
case 0xF4: if ((a > 0x8F) || (a < 0x80)) return 0; break;
8282
default: if (a < 0x80) return 0;
8383
}
8484
case 1: if (*source >= 0x80 && *source < 0xC2) return 0;

test/test_bson.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -650,6 +650,89 @@ def test_utf8(self):
650650
z = {iso8859_bytes: "hello"}
651651
self.assertRaises(InvalidStringData, BSON.encode, z)
652652

653+
# Verify that python and bson have the same understanding of
654+
# legal utf-8 if the first byte is 0xf4 (244)
655+
@staticmethod
def _py_is_legal_utf8(x):
    """Tell whether python itself accepts the byte string *x* as utf-8.

    Returns True when ``x.decode('utf-8')`` succeeds and False when it
    raises UnicodeDecodeError.
    """
    try:
        x.decode('utf-8')
    except UnicodeDecodeError:
        return False
    return True
662+
663+
@staticmethod
def _bson_is_legal_utf8(x):
    """Tell whether the bson encoder accepts *x* as a string value.

    *x* is encoded as the value stored under the key ``'x'``. Returns
    False when BSON.encode raises InvalidStringData, True otherwise.
    """
    try:
        BSON.encode({'x': x})
    except InvalidStringData:
        return False
    else:
        return True
670+
671+
@unittest.skipIf(PY3, "python3 has strong separation between bytes/unicode")
def test_legal_utf8_full_coverage(self):
    """Exhaustively compare python and bson on strings starting with 0xf4.

    Every 2, 3 and 4 byte string whose first byte is 0xf4 (244) is passed
    to both validators, and both must give the same verdict.

    This test takes about 400 seconds (per the original author), which is
    too long to run each time. However it is the only one which covers
    all possible bit combinations in the 244 space.
    """
    lead = chr(0xf4)

    # range(256), not range(255): 0xff is a possible byte value too, and
    # leaving it out would break the "all combinations" promise above.
    # Built once so the inner loops do not recreate it on every pass.
    all_bytes = [chr(value) for value in range(256)]

    for b2 in all_bytes:
        m2 = lead + b2
        self.assertEqual(
            self._py_is_legal_utf8(m2),
            self._bson_is_legal_utf8(m2),
            repr(m2)
        )

        for b3 in all_bytes:
            m3 = m2 + b3
            self.assertEqual(
                self._py_is_legal_utf8(m3),
                self._bson_is_legal_utf8(m3),
                repr(m3)
            )

            for b4 in all_bytes:
                m4 = m3 + b4
                self.assertEqual(
                    self._py_is_legal_utf8(m4),
                    self._bson_is_legal_utf8(m4),
                    repr(m4)
                )
700+
701+
# In python3:
702+
# - 'bytes' are not checked with isLegalUTF8
703+
# - 'unicode': I cannot create unicode objects from invalid utf8, since that
704+
# would result in invalid code points.
705+
@unittest.skipIf(PY3, "python3 has strong separation between bytes/unicode")
def test_legal_utf8_few_samples(self):
    """Spot-check a few 4 byte strings whose first byte is 0xf4.

    Each sample is checked against its expected verdict with both
    validators. Comparing the two validators only with each other would
    also pass if both misjudged a sample in the same way.
    """
    # Second byte inside 0x80..0x8f: legal after a 0xf4 lead byte.
    good_samples = [
        '\xf4\x80\x80\x80',
        '\xf4\x8a\x80\x80',
        '\xf4\x8e\x80\x80',
        '\xf4\x81\x80\x80',
    ]

    for data in good_samples:
        self.assertTrue(self._py_is_legal_utf8(data), repr(data))
        self.assertTrue(self._bson_is_legal_utf8(data), repr(data))

    # Second byte below 0x80 or above 0x8f: illegal after 0xf4.
    bad_samples = [
        '\xf4\x00\x80\x80',
        '\xf4\x3a\x80\x80',
        '\xf4\x7f\x80\x80',
        '\xf4\x90\x80\x80',
        '\xf4\xff\x80\x80',
    ]

    for data in bad_samples:
        self.assertFalse(self._py_is_legal_utf8(data), repr(data))
        self.assertFalse(self._bson_is_legal_utf8(data), repr(data))
734+
735+
653736
def test_null_character(self):
654737
doc = {"a": "\x00"}
655738
self.assertEqual(doc, BSON.encode(doc).decode())

0 commit comments

Comments
 (0)