Skip to content

Commit 8573099

Browse files
committed
Move time consuming utf8 tests out of the main suite
1 parent 9b632c7 commit 8573099

File tree

2 files changed

+76
-63
lines changed

2 files changed

+76
-63
lines changed

test/test_bson.py

Lines changed: 0 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -650,69 +650,6 @@ def test_utf8(self):
650650
z = {iso8859_bytes: "hello"}
651651
self.assertRaises(InvalidStringData, BSON.encode, z)
652652

653-
# Verify that python and bson have the same understanding of
654-
# legal utf-8 if the first byte is 0xf4 (244)
655-
def _assert_same_utf8_validation(self, data):
    """Assert that Python and BSON agree on whether ``data`` is legal UTF-8.

    ``data`` is decoded with the 'utf-8' codec and, independently, encoded
    with ``BSON.encode`` as the value of a one-key document. The check
    fails when exactly one of the two rejects the input.
    """
    try:
        data.decode('utf-8')
    except UnicodeDecodeError:
        py_is_legal = False
    else:
        py_is_legal = True

    try:
        BSON.encode({'x': data})
    except InvalidStringData:
        bson_is_legal = False
    else:
        bson_is_legal = True

    # The raw input is passed as the failure message so the offending
    # byte sequence shows up in the test report.
    self.assertEqual(py_is_legal, bson_is_legal, data)
669-
670-
@unittest.skipIf(PY3, "python3 has strong separation between bytes/unicode")
def test_legal_utf8_full_coverage(self):
    """Compare Python and BSON on every 2-, 3- and 4-byte string led by 0xf4.

    This test takes roughly 400 seconds, which is too long to run each
    time. However it is the only one which covers all possible bit
    combinations in the 244 space.
    """
    b1 = chr(0xf4)

    # range(256), not range(255): a byte can be 0x00..0xff inclusive, and
    # stopping at 255 would silently leave 0xff out of the "full" coverage.
    # The list is built once because it is identical for every position.
    all_bytes = [chr(value) for value in range(256)]

    for b2 in all_bytes:
        m2 = b1 + b2
        self._assert_same_utf8_validation(m2)

        for b3 in all_bytes:
            m3 = m2 + b3
            self._assert_same_utf8_validation(m3)

            for b4 in all_bytes:
                m4 = m3 + b4
                self._assert_same_utf8_validation(m4)
688-
689-
# In python3:
690-
# - 'bytes' are not checked with isLegalutf
691-
# - 'unicode' I cannot create unicode objects with invalid utf8, since it
692-
# would result in non valid code-points.
693-
@unittest.skipIf(PY3, "python3 has strong separation between bytes/unicode")
def test_legal_utf8_few_samples(self):
    """Spot-check a few four-byte sequences whose first byte is 0xf4.

    The samples are grouped by whether they are expected to be accepted
    or rejected, but the assertion itself only requires that Python and
    BSON reach the same verdict for each one.
    """
    expected_legal = [
        '\xf4\x80\x80\x80',
        '\xf4\x8a\x80\x80',
        '\xf4\x8e\x80\x80',
        '\xf4\x81\x80\x80',
    ]
    expected_illegal = [
        '\xf4\x00\x80\x80',
        '\xf4\x3a\x80\x80',
        '\xf4\x7f\x80\x80',
        '\xf4\x90\x80\x80',
        '\xf4\xff\x80\x80',
    ]

    # Same order as two separate loops: accepted samples, then rejected.
    for sample in expected_legal + expected_illegal:
        self._assert_same_utf8_validation(sample)
715-
716653
def test_null_character(self):
717654
doc = {"a": "\x00"}
718655
self.assertEqual(doc, BSON.encode(doc).decode())

test/unicode/test_utf8.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import sys
2+
3+
sys.path[0:0] = [""]
4+
5+
from bson import BSON
6+
from bson.errors import InvalidStringData
7+
from bson.py3compat import PY3
8+
from test import unittest
9+
10+
class TestUTF8(unittest.TestCase):
    """Check that BSON's UTF-8 validation agrees with Python's decoder.

    All tests look at byte strings whose first byte is 0xf4 (244).
    """

    def _assert_same_utf8_validation(self, data):
        """Assert that Python and BSON agree on whether ``data`` is legal UTF-8.

        ``data`` is decoded with the 'utf-8' codec and, independently,
        encoded with ``BSON.encode`` as the value of a one-key document.
        The check fails when exactly one of the two rejects the input.
        """
        try:
            data.decode('utf-8')
            py_is_legal = True
        except UnicodeDecodeError:
            py_is_legal = False

        try:
            BSON.encode({'x': data})
            bson_is_legal = True
        except InvalidStringData:
            bson_is_legal = False

        # The raw input is passed as the failure message so the offending
        # byte sequence shows up in the test report.
        self.assertEqual(py_is_legal, bson_is_legal, data)

    @unittest.skipIf(PY3, "python3 has strong separation between bytes/unicode")
    def test_legal_utf8_full_coverage(self):
        """Compare Python and BSON on every 2-, 3- and 4-byte string led by 0xf4.

        This test takes roughly 400 seconds, which is too long to run each
        time. However it is the only one which covers all possible bit
        combinations in the 244 space.
        """
        b1 = chr(0xf4)

        # range(256), not range(255): a byte can be 0x00..0xff inclusive,
        # and stopping at 255 would silently leave 0xff out of the "full"
        # coverage. The list is built once because it is identical for
        # every position.
        all_bytes = [chr(value) for value in range(256)]

        for b2 in all_bytes:
            m2 = b1 + b2
            self._assert_same_utf8_validation(m2)

            for b3 in all_bytes:
                m3 = m2 + b3
                self._assert_same_utf8_validation(m3)

                for b4 in all_bytes:
                    m4 = m3 + b4
                    self._assert_same_utf8_validation(m4)

    # In python3:
    # - 'bytes' are not checked with isLegalutf
    # - 'unicode': we cannot create unicode objects with invalid utf8,
    #   since it would result in non valid code-points.
    @unittest.skipIf(PY3, "python3 has strong separation between bytes/unicode")
    def test_legal_utf8_few_samples(self):
        """Spot-check a few four-byte sequences whose first byte is 0xf4.

        The samples are grouped by whether they are expected to be accepted
        or rejected, but the assertion itself only requires that Python and
        BSON reach the same verdict for each one.
        """
        good_samples = [
            '\xf4\x80\x80\x80',
            '\xf4\x8a\x80\x80',
            '\xf4\x8e\x80\x80',
            '\xf4\x81\x80\x80',
        ]

        for data in good_samples:
            self._assert_same_utf8_validation(data)

        bad_samples = [
            '\xf4\x00\x80\x80',
            '\xf4\x3a\x80\x80',
            '\xf4\x7f\x80\x80',
            '\xf4\x90\x80\x80',
            '\xf4\xff\x80\x80',
        ]

        for data in bad_samples:
            self._assert_same_utf8_validation(data)
74+
75+
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
76+
unittest.main()

0 commit comments

Comments
 (0)