1919#include "ucnhash.h"
2020#include "structmember.h"
2121
/* Static identifiers for the four normalization form names; the `form`
   argument is matched against them with _PyUnicode_EqualToASCIIId(). */
22+ _Py_IDENTIFIER (NFC );
23+ _Py_IDENTIFIER (NFD );
24+ _Py_IDENTIFIER (NFKC );
25+ _Py_IDENTIFIER (NFKD );
26+
2227/*[clinic input]
2328module unicodedata
2429class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
@@ -770,8 +775,10 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
770775 return result ;
771776}
772777
773- /* Return 1 if the input is certainly normalized, 0 if it might not be. */
774- static int
/* Tri-state answer produced by is_normalized().  YES is the first
   enumerator and therefore has the value 0, so a result must be compared
   explicitly (e.g. `== YES`) rather than used as a truth value. */
778+ typedef enum {YES , NO , MAYBE } NormalMode ;
779+
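780+ /* Return YES if the input is certainly normalized, NO if it is not (or if the quick check is disabled for an older database version), and MAYBE if only a full normalization can tell. */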
781+ static NormalMode
775782is_normalized (PyObject * self , PyObject * input , int nfc , int k )
776783{
777784 Py_ssize_t i , len ;
@@ -782,7 +789,7 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
782789 /* An older version of the database is requested, quickchecks must be
783790 disabled. */
784791 if (self && UCD_Check (self ))
785- return 0 ;
792+ return NO ;
786793
787794 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
788795 as described in http://unicode.org/reports/tr15/#Annex8. */
@@ -799,19 +806,92 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
799806 unsigned char quickcheck = record -> normalization_quick_check ;
800807
801808 if (quickcheck & quickcheck_mask )
802- return 0 ; /* this string might need normalization */
809+ return MAYBE ; /* this string might need normalization */
803810 if (combining && prev_combining > combining )
804- return 0 ; /* non-canonical sort order, not normalized */
811+ return NO ; /* non-canonical sort order, not normalized */
805812 prev_combining = combining ;
806813 }
807- return 1 ; /* certainly normalized */
814+ return YES ; /* certainly normalized */
815+ }
816+
817+ /*[clinic input]
818+ unicodedata.UCD.is_normalized
819+
820+ self: self
821+ form: unicode
822+ unistr as input: unicode
823+ /
824+
825+ Return whether the Unicode string unistr is in the normal form 'form'.
826+
827+ Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
828+ [clinic start generated code]*/
829+
830+ static PyObject *
831+ unicodedata_UCD_is_normalized_impl (PyObject * self , PyObject * form ,
832+ PyObject * input )
833+ /*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
834+ {
835+ if (PyUnicode_READY (input ) == -1 ) {
836+ return NULL ;
837+ }
838+
839+ if (PyUnicode_GET_LENGTH (input ) == 0 ) {
840+ /* special case empty input strings. */
841+ Py_RETURN_TRUE ;
842+ }
843+
844+ PyObject * result ;
845+ int nfc = 0 ;
846+ int k = 0 ;
847+ NormalMode m ;
848+
849+ PyObject * cmp ;
850+ int match = 0 ;
851+
852+ if (_PyUnicode_EqualToASCIIId (form , & PyId_NFC )) {
853+ nfc = 1 ;
854+ }
855+ else if (_PyUnicode_EqualToASCIIId (form , & PyId_NFKC )) {
856+ nfc = 1 ;
857+ k = 1 ;
858+ }
859+ else if (_PyUnicode_EqualToASCIIId (form , & PyId_NFD )) {
860+ /* matches default values for `nfc` and `k` */
861+ }
862+ else if (_PyUnicode_EqualToASCIIId (form , & PyId_NFKD )) {
863+ k = 1 ;
864+ }
865+ else {
866+ PyErr_SetString (PyExc_ValueError , "invalid normalization form" );
867+ return NULL ;
868+ }
869+
870+ m = is_normalized (self , input , nfc , k );
871+
872+ if (m == MAYBE ) {
873+ cmp = (nfc ? nfc_nfkc : nfd_nfkd )(self , input , k );
874+ if (cmp == NULL ) {
875+ return NULL ;
876+ }
877+ match = PyUnicode_Compare (input , cmp );
878+ Py_DECREF (cmp );
879+ result = (match == 0 ) ? Py_True : Py_False ;
880+ }
881+ else {
882+ result = (m == YES ) ? Py_True : Py_False ;
883+ }
884+
885+ Py_INCREF (result );
886+ return result ;
808887}
809888
889+
810890/*[clinic input]
811891unicodedata.UCD.normalize
812892
813893 self: self
814- form: str
894+ form: unicode
815895 unistr as input: unicode
816896 /
817897
@@ -821,9 +901,9 @@ Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
821901[clinic start generated code]*/
822902
823903static PyObject *
824- unicodedata_UCD_normalize_impl (PyObject * self , const char * form ,
904+ unicodedata_UCD_normalize_impl (PyObject * self , PyObject * form ,
825905 PyObject * input )
826- /*[clinic end generated code: output=62d1f8870027efdc input=1744c55f4ab79bf0 ]*/
906+ /*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb ]*/
827907{
828908 if (PyUnicode_GET_LENGTH (input ) == 0 ) {
829909 /* Special case empty input strings, since resizing
@@ -832,29 +912,29 @@ unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
832912 return input ;
833913 }
834914
835- if (strcmp (form , "NFC" ) == 0 ) {
836- if (is_normalized (self , input , 1 , 0 )) {
915+ if (_PyUnicode_EqualToASCIIId (form , & PyId_NFC ) ) {
916+ if (is_normalized (self , input , 1 , 0 ) == YES ) {
837917 Py_INCREF (input );
838918 return input ;
839919 }
840920 return nfc_nfkc (self , input , 0 );
841921 }
842- if (strcmp (form , "NFKC" ) == 0 ) {
843- if (is_normalized (self , input , 1 , 1 )) {
922+ if (_PyUnicode_EqualToASCIIId (form , & PyId_NFKC ) ) {
923+ if (is_normalized (self , input , 1 , 1 ) == YES ) {
844924 Py_INCREF (input );
845925 return input ;
846926 }
847927 return nfc_nfkc (self , input , 1 );
848928 }
849- if (strcmp (form , "NFD" ) == 0 ) {
850- if (is_normalized (self , input , 0 , 0 )) {
929+ if (_PyUnicode_EqualToASCIIId (form , & PyId_NFD ) ) {
930+ if (is_normalized (self , input , 0 , 0 ) == YES ) {
851931 Py_INCREF (input );
852932 return input ;
853933 }
854934 return nfd_nfkd (self , input , 0 );
855935 }
856- if (strcmp (form , "NFKD" ) == 0 ) {
857- if (is_normalized (self , input , 0 , 1 )) {
936+ if (_PyUnicode_EqualToASCIIId (form , & PyId_NFKD ) ) {
937+ if (is_normalized (self , input , 0 , 1 ) == YES ) {
858938 Py_INCREF (input );
859939 return input ;
860940 }
@@ -1271,6 +1351,7 @@ static PyMethodDef unicodedata_functions[] = {
12711351 UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
12721352 UNICODEDATA_UCD_NAME_METHODDEF
12731353 UNICODEDATA_UCD_LOOKUP_METHODDEF
1354+ UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
12741355 UNICODEDATA_UCD_NORMALIZE_METHODDEF
12751356 {NULL , NULL } /* sentinel */
12761357};
0 commit comments