@@ -650,6 +650,89 @@ def test_utf8(self):
650
650
z = {iso8859_bytes : "hello" }
651
651
self .assertRaises (InvalidStringData , BSON .encode , z )
652
652
653
+ # Verify that python and bson have the same understanding of
654
+ # legal utf-8 if the first byte is 0xf4 (244)
655
@staticmethod
def _py_is_legal_utf8(x):
    """Report whether Python itself can decode ``x`` as UTF-8.

    Returns True when ``x.decode('utf-8')`` succeeds and False when it
    raises UnicodeDecodeError. Any other exception propagates.
    """
    try:
        x.decode('utf-8')
    except UnicodeDecodeError:
        return False
    else:
        return True
662
+
663
@staticmethod
def _bson_is_legal_utf8(x):
    """Report whether the BSON encoder accepts ``x`` as a string value.

    ``x`` is encoded as the value of a one-key document. Returns True
    when ``BSON.encode`` succeeds and False when it raises
    InvalidStringData. Any other exception propagates.
    """
    try:
        BSON.encode({'x': x})
    except InvalidStringData:
        return False
    else:
        return True
670
+
671
@unittest.skipIf(PY3, "python3 has strong separation between bytes/unicode")
def test_legal_utf8_full_coverage(self):
    """Exhaustively compare Python's and BSON's UTF-8 checks for lead byte 0xf4.

    Every 2-, 3- and 4-byte sequence starting with 0xf4 is passed to both
    ``_py_is_legal_utf8`` and ``_bson_is_legal_utf8``; the two verdicts
    must agree. Only agreement is asserted, not which sequences are legal.

    NOTE: the original author reported a runtime of about 400 seconds,
    which is too long to run on every test pass. It is nevertheless the
    only test that covers every byte combination in the 0xf4 (244) space.
    """
    # All 256 byte values, built once. range(256) is required so that
    # 0xff is included; range(255) would stop at 0xfe.
    all_bytes = [chr(value) for value in range(256)]
    lead = chr(0xf4)

    def check(candidate):
        # repr() keeps unprintable bytes readable in the failure message.
        self.assertEqual(
            self._py_is_legal_utf8(candidate),
            self._bson_is_legal_utf8(candidate),
            repr(candidate),
        )

    for b2 in all_bytes:
        m2 = lead + b2
        check(m2)

        for b3 in all_bytes:
            m3 = m2 + b3
            check(m3)

            for b4 in all_bytes:
                check(m3 + b4)
700
+
701
# This comparison is skipped on Python 3 because:
#  - 'bytes' values are not checked with isLegalutf
#  - 'unicode' objects holding invalid utf-8 cannot be created, since
#    they would contain invalid code points.
@unittest.skipIf(PY3, "python3 has strong separation between bytes/unicode")
def test_legal_utf8_few_samples(self):
    """Spot-check that Python and BSON agree on a few 0xf4-led sequences.

    Fast counterpart to the exhaustive coverage test. For each sample the
    verdict of ``_py_is_legal_utf8`` must equal that of
    ``_bson_is_legal_utf8``. Only agreement is asserted; the good/bad
    grouping records the expected legality for the reader.
    """
    # Second byte in 0x80..0x8f: expected to be legal after 0xf4.
    good_samples = [
        '\xf4\x80\x80\x80',
        '\xf4\x8a\x80\x80',
        '\xf4\x8e\x80\x80',
        '\xf4\x81\x80\x80',
    ]

    # Second byte outside 0x80..0x8f: expected to be illegal after 0xf4.
    bad_samples = [
        '\xf4\x00\x80\x80',
        '\xf4\x3a\x80\x80',
        '\xf4\x7f\x80\x80',
        '\xf4\x90\x80\x80',
        '\xf4\xff\x80\x80',
    ]

    for data in good_samples + bad_samples:
        # repr() keeps unprintable bytes readable in the failure message.
        self.assertEqual(
            self._py_is_legal_utf8(data),
            self._bson_is_legal_utf8(data),
            repr(data),
        )
734
+
735
+
653
736
def test_null_character (self ):
654
737
doc = {"a" : "\x00 " }
655
738
self .assertEqual (doc , BSON .encode (doc ).decode ())
0 commit comments