Skip to content
7 changes: 6 additions & 1 deletion Lib/test/test_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from http.client import HTTPException
import sys
from unicodedata import normalize, unidata_version
from unicodedata import normalize, is_normalized, unidata_version

TESTDATAFILE = "NormalizationTest.txt"
TESTDATAURL = "http://www.pythontest.net/unicode/" + unidata_version + "/" + TESTDATAFILE
Expand Down Expand Up @@ -88,6 +88,11 @@ def run_normalization_tests(self, testdata):
NFKD(c3) == NFKD(c4) == NFKD(c5),
line)

self.assertTrue(is_normalized("NFC", c2))
self.assertTrue(is_normalized("NFD", c3))
self.assertTrue(is_normalized("NFKC", c4))
self.assertTrue(is_normalized("NFKD", c5))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There should be some negative cases, too. Make sure the MAYBE case is being exercised.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Increased coverage + confirmed that this is exercising the MAYBE path.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe add also tests when it returns False. If the function always returns True, the test still pass ;-)


# Record part 1 data
if part == "@Part1":
part1_data[c1] = 1
Expand Down
34 changes: 33 additions & 1 deletion Modules/clinic/unicodedata.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

92 changes: 82 additions & 10 deletions Modules/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -766,8 +766,10 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
return result;
}

/* Return 1 if the input is certainly normalized, 0 if it might not be. */
static int
typedef enum {YES, NO, MAYBE} NormalMode;

/* Return YES if the input is certainly normalized, NO or MAYBE if it might not be. */
static NormalMode
is_normalized(PyObject *self, PyObject *input, int nfc, int k)
{
Py_ssize_t i, len;
Expand All @@ -778,7 +780,7 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
/* An older version of the database is requested, quickchecks must be
disabled. */
if (self && UCD_Check(self))
return 0;
return NO;

/* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
as described in http://unicode.org/reports/tr15/#Annex8. */
Expand All @@ -795,14 +797,83 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
unsigned char quickcheck = record->normalization_quick_check;

if (quickcheck & quickcheck_mask)
return 0; /* this string might need normalization */
return MAYBE; /* this string might need normalization */
if (combining && prev_combining > combining)
return 0; /* non-canonical sort order, not normalized */
return NO; /* non-canonical sort order, not normalized */
prev_combining = combining;
}
return 1; /* certainly normalized */
return YES; /* certainly normalized */
}

/*[clinic input]
unicodedata.UCD.is_normalized

self: self
form: str
unistr as input: object(subclass_of='&PyUnicode_Type')
/

Return whether the Unicode string unistr is in the normal form 'form'.

Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
[clinic start generated code]*/

static PyObject *
unicodedata_UCD_is_normalized_impl(PyObject *self, const char *form,
PyObject *input)
/*[clinic end generated code: output=52d03aaa5b7cfe48 input=c80b54140a0af1ec]*/
{
if (PyUnicode_READY(input) == -1) {
return NULL;
}

if (PyUnicode_GET_LENGTH(input) == 0) {
/* Special case empty input strings. */
Py_INCREF(Py_True);
return Py_True;
}

PyObject *result;
int nfc = 0;
int k = 0;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These could be bool.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is meant to conform to the existing implementation of is_normalized, which takes in ints. Could change is_normalized, but I preferred to avoid making changes outside the scope of my own.

NormalMode m;

PyObject *cmp;
int match = 0;

if (strcmp(form, "NFC") == 0) {
nfc = 1;
} else if (strcmp(form, "NFKC") == 0) {
nfc = 1;
k = 1;
} else if (strcmp(form, "NFD") == 0) {
/* Nothing to do. */
} else if (strcmp(form, "NFKD") == 0) {
k = 1;
} else {
PyErr_SetString(PyExc_ValueError, "invalid normalization form");
return NULL;
}

m = is_normalized(self, input, nfc, k);

if (m == MAYBE) {
cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
if (cmp == NULL) {
return NULL;
}
match = PyUnicode_Compare(input, cmp);
Py_DECREF(cmp);
result = (match == 0) ? Py_True : Py_False;
} else {
result = (m == YES) ? Py_True : Py_False;
}

Py_INCREF(result);
return result;
}


/*[clinic input]
unicodedata.UCD.normalize

Expand All @@ -829,28 +900,28 @@ unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
}

if (strcmp(form, "NFC") == 0) {
if (is_normalized(self, input, 1, 0)) {
if (is_normalized(self, input, 1, 0) == YES) {
Py_INCREF(input);
return input;
}
return nfc_nfkc(self, input, 0);
}
if (strcmp(form, "NFKC") == 0) {
if (is_normalized(self, input, 1, 1)) {
if (is_normalized(self, input, 1, 1) == YES) {
Py_INCREF(input);
return input;
}
return nfc_nfkc(self, input, 1);
}
if (strcmp(form, "NFD") == 0) {
if (is_normalized(self, input, 0, 0)) {
if (is_normalized(self, input, 0, 0) == YES) {
Py_INCREF(input);
return input;
}
return nfd_nfkd(self, input, 0);
}
if (strcmp(form, "NFKD") == 0) {
if (is_normalized(self, input, 0, 1)) {
if (is_normalized(self, input, 0, 1) == YES) {
Py_INCREF(input);
return input;
}
Expand Down Expand Up @@ -1267,6 +1338,7 @@ static PyMethodDef unicodedata_functions[] = {
UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
UNICODEDATA_UCD_NAME_METHODDEF
UNICODEDATA_UCD_LOOKUP_METHODDEF
UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
UNICODEDATA_UCD_NORMALIZE_METHODDEF
{NULL, NULL} /* sentinel */
};
Expand Down