python · benjaminp · Nov 4, 2018 · Dec 12, 2017 · Dec 12, 2017 · Dec 12, 2017
diff --git a/Lib/test/test_normalization.py b/Lib/test/test_normalization.py
@@ -3,7 +3,7 @@
 
 from http.client import HTTPException
 import sys
-from unicodedata import normalize, unidata_version
+from unicodedata import normalize, is_normalized, unidata_version
 
 TESTDATAFILE = "NormalizationTest.txt"
 TESTDATAURL = "http://www.pythontest.net/unicode/" + unidata_version + "/" + TESTDATAFILE
@@ -88,6 +88,11 @@ def run_normalization_tests(self, testdata):
  NFKD(c3) == NFKD(c4) == NFKD(c5),
  line)
 
+ self.assertTrue(is_normalized("NFC", c2))
+ self.assertTrue(is_normalized("NFD", c3))
+ self.assertTrue(is_normalized("NFKC", c4))
+ self.assertTrue(is_normalized("NFKD", c5))
+
  # Record part 1 data
  if part == "@Part1":
  part1_data[c1] = 1

diff --git a/Modules/clinic/unicodedata.c.h b/Modules/clinic/unicodedata.c.h
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
@@ -766,8 +766,10 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
  return result;
 }
 
-/* Return 1 if the input is certainly normalized, 0 if it might not be. */
-static int
+typedef enum {YES, NO, MAYBE} NormalMode;
+
+/* Return YES if the input is certainly normalized, NO or MAYBE if it might not be. */
+static NormalMode
 is_normalized(PyObject *self, PyObject *input, int nfc, int k)
 {
  Py_ssize_t i, len;
@@ -778,7 +780,7 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
  /* An older version of the database is requested, quickchecks must be
  disabled. */
  if (self && UCD_Check(self))
- return 0;
+ return NO;
 
  /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
  as described in http://unicode.org/reports/tr15/#Annex8. */
@@ -795,14 +797,83 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
  unsigned char quickcheck = record->normalization_quick_check;
 
  if (quickcheck & quickcheck_mask)
- return 0; /* this string might need normalization */
+ return MAYBE; /* this string might need normalization */
  if (combining && prev_combining > combining)
- return 0; /* non-canonical sort order, not normalized */
+ return NO; /* non-canonical sort order, not normalized */
  prev_combining = combining;
  }
- return 1; /* certainly normalized */
+ return YES; /* certainly normalized */
 }
 
+/*[clinic input]
+unicodedata.UCD.is_normalized
+
+ self: self
+ form: str
+ unistr as input: object(subclass_of='&PyUnicode_Type')
+ /
+
+Return whether the Unicode string unistr is in the normal form 'form'.
+
+Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
+[clinic start generated code]*/
+
+static PyObject *
+unicodedata_UCD_is_normalized_impl(PyObject *self, const char *form,
+ PyObject *input)
+/*[clinic end generated code: output=52d03aaa5b7cfe48 input=c80b54140a0af1ec]*/
+{
+ if (PyUnicode_READY(input) == -1) {
+ return NULL;
+ }
+
+ if (PyUnicode_GET_LENGTH(input) == 0) {
+ /* Special case empty input strings. */
+ Py_INCREF(Py_True);
+ return Py_True;
+ }
+
+ PyObject *result;
+ int nfc = 0;
+ int k = 0;
+ NormalMode m;
+
+ PyObject *cmp;
+ int match = 0;
+
+ if (strcmp(form, "NFC") == 0) {
+ nfc = 1;
+ } else if (strcmp(form, "NFKC") == 0) {
+ nfc = 1;
+ k = 1;
+ } else if (strcmp(form, "NFD") == 0) {
+ /* Nothing to do. */
+ } else if (strcmp(form, "NFKD") == 0) {
+ k = 1;
+ } else {
+ PyErr_SetString(PyExc_ValueError, "invalid normalization form");
+ return NULL;
+ }
+
+ m = is_normalized(self, input, nfc, k);
+
+ if (m == MAYBE) {
+ cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
+ if (cmp == NULL) {
+ return NULL;
+ }
+ match = PyUnicode_Compare(input, cmp);
+ Py_DECREF(cmp);
+ result = (match == 0) ? Py_True : Py_False;
+ } else {
+ result = (m == YES) ? Py_True : Py_False;
+ }
+
+ Py_INCREF(result);
+ return result;
+}
+
+
 /*[clinic input]
 unicodedata.UCD.normalize
 
@@ -829,28 +900,28 @@ unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
  }
 
  if (strcmp(form, "NFC") == 0) {
- if (is_normalized(self, input, 1, 0)) {
+ if (is_normalized(self, input, 1, 0) == YES) {
  Py_INCREF(input);
  return input;
  }
  return nfc_nfkc(self, input, 0);
  }
  if (strcmp(form, "NFKC") == 0) {
- if (is_normalized(self, input, 1, 1)) {
+ if (is_normalized(self, input, 1, 1) == YES) {
  Py_INCREF(input);
  return input;
  }
  return nfc_nfkc(self, input, 1);
  }
  if (strcmp(form, "NFD") == 0) {
- if (is_normalized(self, input, 0, 0)) {
+ if (is_normalized(self, input, 0, 0) == YES) {
  Py_INCREF(input);
  return input;
  }
  return nfd_nfkd(self, input, 0);
  }
  if (strcmp(form, "NFKD") == 0) {
- if (is_normalized(self, input, 0, 1)) {
+ if (is_normalized(self, input, 0, 1) == YES) {
  Py_INCREF(input);
  return input;
  }
@@ -1267,6 +1338,7 @@ static PyMethodDef unicodedata_functions[] = {
  UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
  UNICODEDATA_UCD_NAME_METHODDEF
  UNICODEDATA_UCD_LOOKUP_METHODDEF
+ UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
  UNICODEDATA_UCD_NORMALIZE_METHODDEF
  {NULL, NULL} /* sentinel */
 };