Skip to content

Commit 2810dd7

Browse files
maxbelanger authored and benjaminp committed
closes bpo-32285: Add unicodedata.is_normalized. (GH-4806)
1 parent 5d236ca commit 2810dd7

File tree

6 files changed

+160
-22
lines changed

6 files changed

+160
-22
lines changed

Doc/library/unicodedata.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,13 @@ following functions:
133133
a human reader, if one has combining characters and the other
134134
doesn't, they may not compare equal.
135135

136+
.. function:: is_normalized(form, unistr)
137+
138+
Return whether the Unicode string *unistr* is in the normal form *form*. Valid
139+
values for *form* are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
140+
141+
.. versionadded:: 3.8
142+
136143

137144
In addition, the module exposes the following constant:
138145

Doc/whatsnew/3.8.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,13 @@ Added method :meth:`~tkinter.Canvas.moveto`
204204
in the :class:`tkinter.Canvas` class.
205205
(Contributed by Juliette Monsel in :issue:`23831`.)
206206

207+
unicodedata
208+
-----------
209+
210+
* New function :func:`~unicodedata.is_normalized` can be used to verify a string
211+
is in a specific normal form. (Contributed by Max Belanger and David Euresti in
212+
:issue:`32285`.)
213+
207214
venv
208215
----
209216

Lib/test/test_normalization.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
from http.client import HTTPException
55
import sys
6-
from unicodedata import normalize, unidata_version
6+
from unicodedata import normalize, is_normalized, unidata_version
77

88
TESTDATAFILE = "NormalizationTest.txt"
99
TESTDATAURL = "http://www.pythontest.net/unicode/" + unidata_version + "/" + TESTDATAFILE
@@ -88,6 +88,15 @@ def run_normalization_tests(self, testdata):
8888
NFKD(c3) == NFKD(c4) == NFKD(c5),
8989
line)
9090

91+
self.assertTrue(is_normalized("NFC", c2))
92+
self.assertTrue(is_normalized("NFC", c4))
93+
94+
self.assertTrue(is_normalized("NFD", c3))
95+
self.assertTrue(is_normalized("NFD", c5))
96+
97+
self.assertTrue(is_normalized("NFKC", c4))
98+
self.assertTrue(is_normalized("NFKD", c5))
99+
91100
# Record part 1 data
92101
if part == "@Part1":
93102
part1_data[c1] = 1
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
New function unicodedata.is_normalized, which can check whether a string is
2+
in a specific normal form.

Modules/clinic/unicodedata.c.h

Lines changed: 36 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Modules/unicodedata.c

Lines changed: 98 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,11 @@
1919
#include "ucnhash.h"
2020
#include "structmember.h"
2121

22+
_Py_IDENTIFIER(NFC);
23+
_Py_IDENTIFIER(NFD);
24+
_Py_IDENTIFIER(NFKC);
25+
_Py_IDENTIFIER(NFKD);
26+
2227
/*[clinic input]
2328
module unicodedata
2429
class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
@@ -770,8 +775,10 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
770775
return result;
771776
}
772777

773-
/* Return 1 if the input is certainly normalized, 0 if it might not be. */
774-
static int
778+
typedef enum {YES, NO, MAYBE} NormalMode;
779+
780+
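/* Return YES if the input is certainly normalized, NO if it is certainly not normalized (or if quickchecks are disabled because an older database version is requested), and MAYBE if the quickcheck cannot decide. */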
781+
static NormalMode
775782
is_normalized(PyObject *self, PyObject *input, int nfc, int k)
776783
{
777784
Py_ssize_t i, len;
@@ -782,7 +789,7 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
782789
/* An older version of the database is requested, quickchecks must be
783790
disabled. */
784791
if (self && UCD_Check(self))
785-
return 0;
792+
return NO;
786793

787794
/* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
788795
as described in http://unicode.org/reports/tr15/#Annex8. */
@@ -799,19 +806,92 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
799806
unsigned char quickcheck = record->normalization_quick_check;
800807

801808
if (quickcheck & quickcheck_mask)
802-
return 0; /* this string might need normalization */
809+
return MAYBE; /* this string might need normalization */
803810
if (combining && prev_combining > combining)
804-
return 0; /* non-canonical sort order, not normalized */
811+
return NO; /* non-canonical sort order, not normalized */
805812
prev_combining = combining;
806813
}
807-
return 1; /* certainly normalized */
814+
return YES; /* certainly normalized */
815+
}
816+
817+
/*[clinic input]
818+
unicodedata.UCD.is_normalized
819+
820+
self: self
821+
form: unicode
822+
unistr as input: unicode
823+
/
824+
825+
Return whether the Unicode string unistr is in the normal form 'form'.
826+
827+
Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
828+
[clinic start generated code]*/
829+
830+
static PyObject *
831+
unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
832+
PyObject *input)
833+
/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
834+
{
/* Report whether `input` is already in the normalization form named by
 * `form` ('NFC', 'NFKC', 'NFD' or 'NFKD').
 *
 * Returns a new reference to Py_True or Py_False. Returns NULL with an
 * exception set on failure; an unrecognised `form` raises ValueError.
 * Note that the empty-string shortcut below runs before `form` is
 * validated, so an empty input yields True for any `form` value. */
835+
if (PyUnicode_READY(input) == -1) {
836+
return NULL;
837+
}
838+
839+
if (PyUnicode_GET_LENGTH(input) == 0) {
840+
/* special case empty input strings. */
841+
Py_RETURN_TRUE;
842+
}
843+
844+
PyObject *result;
845+
/* `nfc` selects composed (1) vs. decomposed (0) forms; `k` selects the
 * compatibility variants. Both are passed to the helpers below. */
int nfc = 0;
846+
int k = 0;
847+
NormalMode m;
848+
849+
PyObject *cmp;
850+
int match = 0;
851+
852+
/* Translate the form name into the (nfc, k) flag pair. */
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
853+
nfc = 1;
854+
}
855+
else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
856+
nfc = 1;
857+
k = 1;
858+
}
859+
else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
860+
/* matches default values for `nfc` and `k` */
861+
}
862+
else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
863+
k = 1;
864+
}
865+
else {
866+
PyErr_SetString(PyExc_ValueError, "invalid normalization form");
867+
return NULL;
868+
}
869+
870+
/* Run the quickcheck first; it answers YES, NO or MAYBE. */
m = is_normalized(self, input, nfc, k);
871+
872+
if (m == MAYBE) {
873+
/* Quickcheck was inconclusive: normalize the input with the helper
 * matching the requested form and compare the result with the
 * original string. */
cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
874+
if (cmp == NULL) {
875+
return NULL;
876+
}
877+
match = PyUnicode_Compare(input, cmp);
878+
/* The normalized copy is only needed for the comparison. */
Py_DECREF(cmp);
879+
result = (match == 0) ? Py_True : Py_False;
880+
}
881+
else {
882+
/* NOTE(review): is_normalized() also returns NO when an older database
 * version is requested (quickchecks disabled). In that case this branch
 * reports False without normalizing the input -- confirm that this is
 * the intended result for such callers. */
result = (m == YES) ? Py_True : Py_False;
883+
}
884+
885+
/* Py_True / Py_False are used here as borrowed pointers; take a reference
 * before handing the result to the caller. */
Py_INCREF(result);
886+
return result;
808887
}
809888

889+
810890
/*[clinic input]
811891
unicodedata.UCD.normalize
812892
813893
self: self
814-
form: str
894+
form: unicode
815895
unistr as input: unicode
816896
/
817897
@@ -821,9 +901,9 @@ Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
821901
[clinic start generated code]*/
822902

823903
static PyObject *
824-
unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
904+
unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
825905
PyObject *input)
826-
/*[clinic end generated code: output=62d1f8870027efdc input=1744c55f4ab79bf0]*/
906+
/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
827907
{
828908
if (PyUnicode_GET_LENGTH(input) == 0) {
829909
/* Special case empty input strings, since resizing
@@ -832,29 +912,29 @@ unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
832912
return input;
833913
}
834914

835-
if (strcmp(form, "NFC") == 0) {
836-
if (is_normalized(self, input, 1, 0)) {
915+
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
916+
if (is_normalized(self, input, 1, 0) == YES) {
837917
Py_INCREF(input);
838918
return input;
839919
}
840920
return nfc_nfkc(self, input, 0);
841921
}
842-
if (strcmp(form, "NFKC") == 0) {
843-
if (is_normalized(self, input, 1, 1)) {
922+
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
923+
if (is_normalized(self, input, 1, 1) == YES) {
844924
Py_INCREF(input);
845925
return input;
846926
}
847927
return nfc_nfkc(self, input, 1);
848928
}
849-
if (strcmp(form, "NFD") == 0) {
850-
if (is_normalized(self, input, 0, 0)) {
929+
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
930+
if (is_normalized(self, input, 0, 0) == YES) {
851931
Py_INCREF(input);
852932
return input;
853933
}
854934
return nfd_nfkd(self, input, 0);
855935
}
856-
if (strcmp(form, "NFKD") == 0) {
857-
if (is_normalized(self, input, 0, 1)) {
936+
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
937+
if (is_normalized(self, input, 0, 1) == YES) {
858938
Py_INCREF(input);
859939
return input;
860940
}
@@ -1271,6 +1351,7 @@ static PyMethodDef unicodedata_functions[] = {
12711351
UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
12721352
UNICODEDATA_UCD_NAME_METHODDEF
12731353
UNICODEDATA_UCD_LOOKUP_METHODDEF
1354+
UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
12741355
UNICODEDATA_UCD_NORMALIZE_METHODDEF
12751356
{NULL, NULL} /* sentinel */
12761357
};

0 commit comments

Comments
 (0)