3333]
3434
3535
36- def _encode_numpy (values , uniques = None , encode = False ):
36+ def _encode_numpy (values , uniques = None , encode = False , check_unknown = True ):
3737 # only used in _encode below, see docstring there for details
3838 if uniques is None :
3939 if encode :
@@ -43,10 +43,11 @@ def _encode_numpy(values, uniques=None, encode=False):
4343 # unique sorts
4444 return np .unique (values )
4545 if encode :
46- diff = _encode_check_unknown (values , uniques )
47- if diff :
48- raise ValueError ("y contains previously unseen labels: %s"
49- % str (diff ))
46+ if check_unknown :
47+ diff = _encode_check_unknown (values , uniques )
48+ if diff :
49+ raise ValueError ("y contains previously unseen labels: %s"
50+ % str (diff ))
5051 encoded = np .searchsorted (uniques , values )
5152 return uniques , encoded
5253 else :
@@ -70,7 +71,7 @@ def _encode_python(values, uniques=None, encode=False):
7071 return uniques
7172
7273
73- def _encode (values , uniques = None , encode = False ):
74+ def _encode (values , uniques = None , encode = False , check_unknown = True ):
7475 """Helper function to factorize (find uniques) and encode values.
7576
7677 Uses pure python method for object dtype, and numpy method for
@@ -90,6 +91,12 @@ def _encode(values, uniques=None, encode=False):
9091 already have been determined in fit).
9192 encode : bool, default False
9293 If True, also encode the values into integer codes based on `uniques`.
94+ check_unknown : bool, default True
95+ If True, check for values in ``values`` that are not in ``uniques``
96+ and raise an error. This is ignored for object dtype, and treated as
97+ True in this case. This parameter is useful for
98+ _BaseEncoder._transform() to avoid calling _encode_check_unknown()
99+ twice.
93100
94101 Returns
95102 -------
@@ -107,7 +114,8 @@ def _encode(values, uniques=None, encode=False):
107114 raise TypeError ("argument must be a string or number" )
108115 return res
109116 else :
110- return _encode_numpy (values , uniques , encode )
117+ return _encode_numpy (values , uniques , encode ,
118+ check_unknown = check_unknown )
111119
112120
113121def _encode_check_unknown (values , uniques , return_mask = False ):
0 commit comments