1-
21"""
32Logistic Regression
43"""
2827from ..utils .extmath import row_norms
2928from ..utils .optimize import newton_cg
3029from ..utils .validation import check_X_y
31- from ..exceptions import DataConversionWarning
3230from ..exceptions import NotFittedError
3331from ..utils .fixes import expit
3432from ..utils .multiclass import check_classification_targets
@@ -925,9 +923,6 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10,
925923 y_test = np .ones (y_test .shape , dtype = np .float64 )
926924 y_test [~ mask ] = - 1.
927925
928- # To deal with object dtypes, we need to convert into an array of floats.
929- y_test = check_array (y_test , dtype = np .float64 , ensure_2d = False )
930-
931926 scores = list ()
932927
933928 if isinstance (scoring , six .string_types ):
@@ -1561,64 +1556,64 @@ def fit(self, X, y, sample_weight=None):
15611556
15621557 X , y = check_X_y (X , y , accept_sparse = 'csr' , dtype = np .float64 ,
15631558 order = "C" )
1559+ check_classification_targets (y )
1560+
1561+ class_weight = self .class_weight
1562+ if class_weight and not (isinstance (class_weight , dict ) or
1563+ class_weight in ['balanced' , 'auto' ]):
1564+ # 'auto' is deprecated and will be removed in 0.19
1565+ raise ValueError ("class_weight provided should be a "
1566+ "dict or 'balanced'" )
1567+
1568+ # Encode for string labels
1569+ label_encoder = LabelEncoder ().fit (y )
1570+ y = label_encoder .transform (y )
1571+ if isinstance (class_weight , dict ):
1572+ class_weight = dict ((label_encoder .transform ([cls ])[0 ], v )
1573+ for cls , v in class_weight .items ())
1574+
1575+ # The original class labels
1576+ classes = self .classes_ = label_encoder .classes_
1577+ encoded_labels = label_encoder .transform (label_encoder .classes_ )
15641578
15651579 if self .solver == 'sag' :
15661580 max_squared_sum = row_norms (X , squared = True ).max ()
15671581 else :
15681582 max_squared_sum = None
15691583
1570- check_classification_targets (y )
1571-
1572- if y .ndim == 2 and y .shape [1 ] == 1 :
1573- warnings .warn (
1574- "A column-vector y was passed when a 1d array was"
1575- " expected. Please change the shape of y to "
1576- "(n_samples, ), for example using ravel()." ,
1577- DataConversionWarning )
1578- y = np .ravel (y )
1579-
1580- check_consistent_length (X , y )
1581-
15821584 # init cross-validation generator
15831585 cv = check_cv (self .cv , y , classifier = True )
15841586 folds = list (cv .split (X , y ))
15851587
1586- self ._enc = LabelEncoder ()
1587- self ._enc .fit (y )
1588-
1589- labels = self .classes_ = np .unique (y )
1590- n_classes = len (labels )
1588+ # Use the label encoded classes
1589+ n_classes = len (encoded_labels )
15911590
15921591 if n_classes < 2 :
15931592 raise ValueError ("This solver needs samples of at least 2 classes"
15941593 " in the data, but the data contains only one"
1595- " class: %r" % self .classes_ [0 ])
1594+ " class: %r" % classes [0 ])
1595+
15961596 if n_classes == 2 :
15971597 # OvR in case of binary problems is as good as fitting
15981598 # the higher label
15991599 n_classes = 1
1600- labels = labels [1 :]
1600+ encoded_labels = encoded_labels [1 :]
1601+ classes = classes [1 :]
16011602
16021603 # We need this hack to iterate only once over labels, in the case of
16031604 # multi_class = multinomial, without changing the value of the labels.
1604- iter_labels = labels
16051605 if self .multi_class == 'multinomial' :
1606- iter_labels = [None ]
1607-
1608- if self .class_weight and not (isinstance (self .class_weight , dict ) or
1609- self .class_weight in
1610- ['balanced' , 'auto' ]):
1611- # 'auto' is deprecated and will be removed in 0.19
1612- raise ValueError ("class_weight provided should be a "
1613- "dict or 'balanced'" )
1606+ iter_encoded_labels = iter_classes = [None ]
1607+ else :
1608+ iter_encoded_labels = encoded_labels
1609+ iter_classes = classes
16141610
16151611 # compute the class weights for the entire dataset y
1616- if self .class_weight in ("auto" , "balanced" ):
1617- classes = np .unique (y )
1618- class_weight = compute_class_weight (self .class_weight , classes , y )
1619- class_weight = dict (zip (classes , class_weight ))
1620- else :
1621- class_weight = self .class_weight
1612+ if class_weight in ("auto" , "balanced" ):
1613+ class_weight = compute_class_weight (class_weight ,
1614+ np .arange (len (self .classes_ )),
1615+ y )
1616+ class_weight = dict (enumerate (class_weight ))
16221617
16231618 path_func = delayed (_log_reg_scoring_path )
16241619
@@ -1638,7 +1633,7 @@ def fit(self, X, y, sample_weight=None):
16381633 max_squared_sum = max_squared_sum ,
16391634 sample_weight = sample_weight
16401635 )
1641- for label in iter_labels
1636+ for label in iter_encoded_labels
16421637 for train , test in folds )
16431638
16441639 if self .multi_class == 'multinomial' :
@@ -1669,9 +1664,9 @@ def fit(self, X, y, sample_weight=None):
16691664 self .n_iter_ = np .reshape (n_iter_ , (n_classes , len (folds ),
16701665 len (self .Cs_ )))
16711666
1672- self .coefs_paths_ = dict (zip (labels , coefs_paths ))
1667+ self .coefs_paths_ = dict (zip (classes , coefs_paths ))
16731668 scores = np .reshape (scores , (n_classes , len (folds ), - 1 ))
1674- self .scores_ = dict (zip (labels , scores ))
1669+ self .scores_ = dict (zip (classes , scores ))
16751670
16761671 self .C_ = list ()
16771672 self .coef_ = np .empty ((n_classes , X .shape [1 ]))
@@ -1682,10 +1677,14 @@ def fit(self, X, y, sample_weight=None):
16821677 scores = multi_scores
16831678 coefs_paths = multi_coefs_paths
16841679
1685- for index , label in enumerate (iter_labels ):
1680+ for index , (cls , encoded_label ) in enumerate (
1681+ zip (iter_classes , iter_encoded_labels )):
1682+
16861683 if self .multi_class == 'ovr' :
1687- scores = self .scores_ [label ]
1688- coefs_paths = self .coefs_paths_ [label ]
1684+ # The scores_ / coefs_paths_ dict have unencoded class
1685+ # labels as their keys
1686+ scores = self .scores_ [cls ]
1687+ coefs_paths = self .coefs_paths_ [cls ]
16891688
16901689 if self .refit :
16911690 best_index = scores .sum (axis = 0 ).argmax ()
@@ -1698,8 +1697,10 @@ def fit(self, X, y, sample_weight=None):
16981697 else :
16991698 coef_init = np .mean (coefs_paths [:, best_index , :], axis = 0 )
17001699
1700+ # Note that y is label encoded and hence pos_class must be
1701+ # the encoded label / None (for 'multinomial')
17011702 w , _ , _ = logistic_regression_path (
1702- X , y , pos_class = label , Cs = [C_ ], solver = self .solver ,
1703+ X , y , pos_class = encoded_label , Cs = [C_ ], solver = self .solver ,
17031704 fit_intercept = self .fit_intercept , coef = coef_init ,
17041705 max_iter = self .max_iter , tol = self .tol ,
17051706 penalty = self .penalty , copy = False ,
0 commit comments