@@ -204,7 +204,6 @@ def test_featurevalidator_supported_types(input_data_featuretest):
204204 assert sparse .issparse (transformed_X )
205205 else :
206206 assert isinstance (transformed_X , np .ndarray )
207- assert np .shape (input_data_featuretest ) == np .shape (transformed_X )
208207 assert np .issubdtype (transformed_X .dtype , np .number )
209208 assert validator ._is_fitted
210209
@@ -237,9 +236,10 @@ def test_featurevalidator_categorical_nan(input_data_featuretest):
237236 validator .fit (input_data_featuretest )
238237 transformed_X = validator .transform (input_data_featuretest )
239238 assert any (pd .isna (input_data_featuretest ))
240- assert any ((- 1 in categories ) or ('-1' in categories ) or ('Missing!' in categories ) for categories in
241- validator .encoder .named_transformers_ ['encoder' ].categories_ )
242- assert np .shape (input_data_featuretest ) == np .shape (transformed_X )
239+ categories_ = validator .column_transformer .\
240+ named_transformers_ ['categorical_pipeline' ].named_steps ['onehotencoder' ].categories_
241+ assert any (('0' in categories ) or (0 in categories ) or ('missing_value' in categories ) for categories in
242+ categories_ )
243243 assert np .issubdtype (transformed_X .dtype , np .number )
244244 assert validator ._is_fitted
245245 assert isinstance (transformed_X , np .ndarray )
@@ -292,7 +292,6 @@ def test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest):
292292 else :
293293 raise ValueError (type (input_data_featuretest ))
294294 transformed_X = validator .transform (complementary_type )
295- assert np .shape (input_data_featuretest ) == np .shape (transformed_X )
296295 assert np .issubdtype (transformed_X .dtype , np .number )
297296 assert validator ._is_fitted
298297
@@ -436,36 +435,29 @@ def test_features_unsupported_calls_are_raised():
436435 expected
437436 """
438437 validator = TabularFeatureValidator ()
439- with pytest .raises (ValueError , match = r"AutoPyTorch does not support time " ):
438+ with pytest .raises (TypeError , match = r".*?Convert the time information to a numerical value " ):
440439 validator .fit (
441440 pd .DataFrame ({'datetime' : [pd .Timestamp ('20180310' )]})
442441 )
442+ validator = TabularFeatureValidator ()
443443 with pytest .raises (ValueError , match = r"AutoPyTorch only supports.*yet, the provided input" ):
444444 validator .fit ({'input1' : 1 , 'input2' : 2 })
445- with pytest .raises (ValueError , match = r"has unsupported dtype string" ):
445+ validator = TabularFeatureValidator ()
446+ with pytest .raises (TypeError , match = r".*?but input column A has an invalid type `string`.*" ):
446447 validator .fit (pd .DataFrame ([{'A' : 1 , 'B' : 2 }], dtype = 'string' ))
448+ validator = TabularFeatureValidator ()
447449 with pytest .raises (ValueError , match = r"The feature dimensionality of the train and test" ):
448450 validator .fit (X_train = np .array ([[1 , 2 , 3 ], [4 , 5 , 6 ]]),
449451 X_test = np .array ([[1 , 2 , 3 , 4 ], [4 , 5 , 6 , 7 ]]),
450452 )
453+ validator = TabularFeatureValidator ()
451454 with pytest .raises (ValueError , match = r"Cannot call transform on a validator that is not fit" ):
452455 validator .transform (np .array ([[1 , 2 , 3 ], [4 , 5 , 6 ]]))
453456
454457
455458@pytest .mark .parametrize (
456459 'input_data_featuretest' ,
457460 (
458- 'numpy_numericalonly_nonan' ,
459- 'numpy_numericalonly_nan' ,
460- 'pandas_numericalonly_nonan' ,
461- 'pandas_numericalonly_nan' ,
462- 'list_numericalonly_nonan' ,
463- 'list_numericalonly_nan' ,
464- # Category in numpy is handled via feat_type
465- 'numpy_categoricalonly_nonan' ,
466- 'numpy_mixed_nonan' ,
467- 'numpy_categoricalonly_nan' ,
468- 'numpy_mixed_nan' ,
469461 'sparse_bsr_nonan' ,
470462 'sparse_bsr_nan' ,
471463 'sparse_coo_nonan' ,
@@ -483,14 +475,14 @@ def test_features_unsupported_calls_are_raised():
483475 ),
484476 indirect = True
485477)
486- def test_no_encoder_created (input_data_featuretest ):
478+ def test_no_column_transformer_created (input_data_featuretest ):
487479 """
488480 Makes sure that for numerical only features, no encoder is created
489481 """
490482 validator = TabularFeatureValidator ()
491483 validator .fit (input_data_featuretest )
492484 validator .transform (input_data_featuretest )
493- assert validator .encoder is None
485+ assert validator .column_transformer is None
494486
495487
496488@pytest .mark .parametrize (
@@ -501,18 +493,18 @@ def test_no_encoder_created(input_data_featuretest):
501493 ),
502494 indirect = True
503495)
504- def test_encoder_created (input_data_featuretest ):
496+ def test_column_transformer_created (input_data_featuretest ):
505497 """
506- This test ensures an encoder is created if categorical data is provided
498+ This test ensures a column transformer is created if categorical data is provided
507499 """
508500 validator = TabularFeatureValidator ()
509501 validator .fit (input_data_featuretest )
510502 transformed_X = validator .transform (input_data_featuretest )
511- assert validator .encoder is not None
503+ assert validator .column_transformer is not None
512504
513505 # Make sure that the encoded features are actually encoded. Categorical columns are at
514506 # the start after transformation. In our fixtures, this is also honored prior encode
515- enc_columns , feature_types = validator ._get_columns_to_encode (input_data_featuretest )
507+ cat_columns , _ , feature_types = validator ._get_columns_info (input_data_featuretest )
516508
517509 # At least one categorical
518510 assert 'categorical' in validator .feat_type
@@ -521,20 +513,13 @@ def test_encoder_created(input_data_featuretest):
521513 if np .any ([pd .api .types .is_numeric_dtype (input_data_featuretest [col ]
522514 ) for col in input_data_featuretest .columns ]):
523515 assert 'numerical' in validator .feat_type
524- for i , feat_type in enumerate (feature_types ):
525- if 'numerical' in feat_type :
526- np .testing .assert_array_equal (
527- transformed_X [:, i ],
528- input_data_featuretest [input_data_featuretest .columns [i ]].to_numpy ()
529- )
530- elif 'categorical' in feat_type :
531- np .testing .assert_array_equal (
532- transformed_X [:, i ],
533- # Expect always 0, 1... because we use a ordinal encoder
534- np .array ([0 , 1 ])
535- )
536- else :
537- raise ValueError (feat_type )
516+ # we expect this input to be the fixture 'pandas_mixed_nan'
517+ np .testing .assert_array_equal (transformed_X , np .array ([[1. , 0. , - 1. ], [0. , 1. , 1. ]]))
518+ else :
519+ np .testing .assert_array_equal (transformed_X , np .array ([[1. , 0. , 1. , 0. ], [0. , 1. , 0. , 1. ]]))
520+
521+ if not all ([feat_type in ['numerical' , 'categorical' ] for feat_type in feature_types ]):
522+ raise ValueError ("Expected only numerical and categorical feature types" )
538523
539524
540525def test_no_new_category_after_fit ():
@@ -566,13 +551,12 @@ def test_unknown_encode_value():
566551 x ['c' ].cat .add_categories (['NA' ], inplace = True )
567552 x .loc [0 , 'c' ] = 'NA' # unknown value
568553 x_t = validator .transform (x )
569- # The first row should have a -1 as we added a new categorical there
570- expected_row = [- 1 , - 41 , - 3 , - 987.2 ]
554+ # The first row should start with 0, 0 because we added a
555+ # new category there, and the one-hot encoder marks an
556+ # unknown category as all zeros in the transformed columns
557+ expected_row = [0.0 , 0.0 , - 0.5584294383572701 , 0.5000000000000004 , - 1.5136598016833485 ]
571558 assert expected_row == x_t [0 ].tolist ()
572559
573- # Notice how there is only one column 'c' to encode
574- assert validator .categories == [list (range (2 )) for i in range (1 )]
575-
576560
577561# Actual checks for the features
578562@pytest .mark .parametrize (
@@ -624,19 +608,20 @@ def test_feature_validator_new_data_after_fit(
624608 assert sparse .issparse (transformed_X )
625609 else :
626610 assert isinstance (transformed_X , np .ndarray )
627- assert np .shape (X_test ) == np .shape (transformed_X )
628611
629612 # And then check proper error messages
630613 if train_data_type == 'pandas' :
631614 old_dtypes = copy .deepcopy (validator .dtypes )
632615 validator .dtypes = ['dummy' for dtype in X_train .dtypes ]
633- with pytest .raises (ValueError , match = r"Changing the dtype of the features after fit" ):
616+ with pytest .raises (ValueError ,
617+ match = r"The dtype of the features must not be changed after fit" ):
634618 transformed_X = validator .transform (X_test )
635619 validator .dtypes = old_dtypes
636620 if test_data_type == 'pandas' :
637621 columns = X_test .columns .tolist ()
638622 X_test = X_test [reversed (columns )]
639- with pytest .raises (ValueError , match = r"Changing the column order of the features" ):
623+ with pytest .raises (ValueError ,
624+ match = r"The column order of the features must not be changed after fit" ):
640625 transformed_X = validator .transform (X_test )
641626
642627
0 commit comments