-
- Notifications
You must be signed in to change notification settings - Fork 33.4k
bpo-32285: Add unicodedata.is_normalized to check the current norma… #4806
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
eea1543 7a4076c 591abc0 1ecc284 8db1a3c fb401d5 cf2a177 15be04f 697e35b 25f0623 bd823e5 File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -766,8 +766,10 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) | |
| return result; | ||
| } | ||
| | ||
| /* Return 1 if the input is certainly normalized, 0 if it might not be. */ | ||
| static int | ||
| typedef enum {YES, NO, MAYBE} NormalMode; | ||
| | ||
| /* Return YES if the input is certainly normalized, NO or MAYBE if it might not be. */ | ||
| static NormalMode | ||
| is_normalized(PyObject *self, PyObject *input, int nfc, int k) | ||
| { | ||
| Py_ssize_t i, len; | ||
| | @@ -778,7 +780,7 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k) | |
| /* An older version of the database is requested, quickchecks must be | ||
| disabled. */ | ||
| if (self && UCD_Check(self)) | ||
| return 0; | ||
| return NO; | ||
| | ||
| /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No, | ||
| as described in http://unicode.org/reports/tr15/#Annex8. */ | ||
| | @@ -795,14 +797,83 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k) | |
| unsigned char quickcheck = record->normalization_quick_check; | ||
| | ||
| if (quickcheck & quickcheck_mask) | ||
| return 0; /* this string might need normalization */ | ||
| return MAYBE; /* this string might need normalization */ | ||
| if (combining && prev_combining > combining) | ||
| return 0; /* non-canonical sort order, not normalized */ | ||
| return NO; /* non-canonical sort order, not normalized */ | ||
| prev_combining = combining; | ||
| } | ||
| return 1; /* certainly normalized */ | ||
| return YES; /* certainly normalized */ | ||
| } | ||
| | ||
| /*[clinic input] | ||
| unicodedata.UCD.is_normalized | ||
| | ||
| self: self | ||
| form: str | ||
maxbelanger marked this conversation as resolved. Outdated Show resolved Hide resolved | ||
| unistr as input: object(subclass_of='&PyUnicode_Type') | ||
| / | ||
| | ||
| Return whether the Unicode string unistr is in the normal form 'form'. | ||
| | ||
| Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. | ||
| [clinic start generated code]*/ | ||
| | ||
| static PyObject * | ||
| unicodedata_UCD_is_normalized_impl(PyObject *self, const char *form, | ||
| PyObject *input) | ||
| /*[clinic end generated code: output=52d03aaa5b7cfe48 input=c80b54140a0af1ec]*/ | ||
| { | ||
| if (PyUnicode_READY(input) == -1) { | ||
| return NULL; | ||
| } | ||
| | ||
| if (PyUnicode_GET_LENGTH(input) == 0) { | ||
| /* Special case empty input strings. */ | ||
| Py_INCREF(Py_True); | ||
| return Py_True; | ||
maxbelanger marked this conversation as resolved. Outdated Show resolved Hide resolved | ||
| } | ||
| | ||
| PyObject *result; | ||
| int nfc = 0; | ||
| int k = 0; | ||
| Contributor There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These could be […] Contributor Author There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is meant to conform to the existing implementation of […] | ||
| NormalMode m; | ||
| | ||
| PyObject *cmp; | ||
| int match = 0; | ||
| | ||
| if (strcmp(form, "NFC") == 0) { | ||
| nfc = 1; | ||
| } else if (strcmp(form, "NFKC") == 0) { | ||
| nfc = 1; | ||
| k = 1; | ||
| } else if (strcmp(form, "NFD") == 0) { | ||
| /* Nothing to do. */ | ||
| } else if (strcmp(form, "NFKD") == 0) { | ||
| k = 1; | ||
| } else { | ||
| PyErr_SetString(PyExc_ValueError, "invalid normalization form"); | ||
| return NULL; | ||
| } | ||
| | ||
| m = is_normalized(self, input, nfc, k); | ||
| | ||
| if (m == MAYBE) { | ||
| cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k); | ||
| if (cmp == NULL) { | ||
| return NULL; | ||
| } | ||
| match = PyUnicode_Compare(input, cmp); | ||
| Py_DECREF(cmp); | ||
| result = (match == 0) ? Py_True : Py_False; | ||
| } else { | ||
maxbelanger marked this conversation as resolved. Outdated Show resolved Hide resolved | ||
| result = (m == YES) ? Py_True : Py_False; | ||
| } | ||
| | ||
| Py_INCREF(result); | ||
| return result; | ||
| } | ||
| | ||
| | ||
| /*[clinic input] | ||
| unicodedata.UCD.normalize | ||
| | ||
| | @@ -829,28 +900,28 @@ unicodedata_UCD_normalize_impl(PyObject *self, const char *form, | |
| } | ||
| | ||
| if (strcmp(form, "NFC") == 0) { | ||
| if (is_normalized(self, input, 1, 0)) { | ||
| if (is_normalized(self, input, 1, 0) == YES) { | ||
| Py_INCREF(input); | ||
| return input; | ||
| } | ||
| return nfc_nfkc(self, input, 0); | ||
| } | ||
| if (strcmp(form, "NFKC") == 0) { | ||
| if (is_normalized(self, input, 1, 1)) { | ||
| if (is_normalized(self, input, 1, 1) == YES) { | ||
| Py_INCREF(input); | ||
| return input; | ||
| } | ||
| return nfc_nfkc(self, input, 1); | ||
| } | ||
| if (strcmp(form, "NFD") == 0) { | ||
| if (is_normalized(self, input, 0, 0)) { | ||
| if (is_normalized(self, input, 0, 0) == YES) { | ||
| Py_INCREF(input); | ||
| return input; | ||
| } | ||
| return nfd_nfkd(self, input, 0); | ||
| } | ||
| if (strcmp(form, "NFKD") == 0) { | ||
| if (is_normalized(self, input, 0, 1)) { | ||
| if (is_normalized(self, input, 0, 1) == YES) { | ||
| Py_INCREF(input); | ||
| return input; | ||
| } | ||
| | @@ -1267,6 +1338,7 @@ static PyMethodDef unicodedata_functions[] = { | |
| UNICODEDATA_UCD_DECOMPOSITION_METHODDEF | ||
| UNICODEDATA_UCD_NAME_METHODDEF | ||
| UNICODEDATA_UCD_LOOKUP_METHODDEF | ||
| UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF | ||
| UNICODEDATA_UCD_NORMALIZE_METHODDEF | ||
| {NULL, NULL} /* sentinel */ | ||
| }; | ||
| | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There should be some negative cases, too. Make sure the `MAYBE` case is being exercised.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Increased coverage + confirmed that this is exercising the `MAYBE` path.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe also add tests for when it returns False. If the function always returns True, the tests still pass ;-)