EducationalTestingService · tamarl08 · Nov 7, 2023 · Nov 6, 2023 · Nov 6, 2023
diff --git a/pyproject.toml b/pyproject.toml
@@ -8,3 +8,4 @@ select = ["D", "E", "F", "I"]
 ignore = ["D212"]
 line-length = 100
 target-version = "py38"
+fix = true
diff --git a/skll/config/__init__.py b/skll/config/__init__.py
@@ -18,7 +18,7 @@
 from typing import Any, Dict, List, Optional, Set, Tuple, Union
 
 import numpy as np
-import ruamel.yaml as yaml
+from ruamel.yaml import YAML
 
 from skll.data.readers import safe_float
 from skll.types import ClassMap, FoldMapping, LabelType, PathOrStr
@@ -610,7 +610,9 @@ def parse_config_file(
  raise ValueError(
  "Configuration file does not contain list of learners " "in [Input] section."
  )
- learners = yaml.safe_load(fix_json(learners_string))
+
+ yaml = YAML(typ="safe", pure=True)
+ learners = yaml.load(fix_json(learners_string))
 
  if len(learners) == 0:
  raise ValueError(
@@ -630,7 +632,7 @@ def parse_config_file(
  custom_metric_path = locate_file(config.get("Input", "custom_metric_path"), config_dir)
 
  # get the featuresets
- featuresets = yaml.safe_load(config.get("Input", "featuresets"))
+ featuresets = yaml.load(config.get("Input", "featuresets"))
 
  # ensure that featuresets is either a list of features or a list of lists
  # of features
@@ -641,7 +643,7 @@ def parse_config_file(
  f"specified: {featuresets}"
  )
 
- featureset_names = yaml.safe_load(fix_json(config.get("Input", "featureset_names")))
+ featureset_names = yaml.load(fix_json(config.get("Input", "featureset_names")))
 
  # ensure that featureset_names is a list of strings, if specified
  if featureset_names:
@@ -658,7 +660,7 @@ def parse_config_file(
  # learners. If it's not specified, then we just assume
  # that we are using 10 folds for each learner.
  learning_curve_cv_folds_list_string = config.get("Input", "learning_curve_cv_folds_list")
- learning_curve_cv_folds_list = yaml.safe_load(fix_json(learning_curve_cv_folds_list_string))
+ learning_curve_cv_folds_list = yaml.load(fix_json(learning_curve_cv_folds_list_string))
  if len(learning_curve_cv_folds_list) == 0:
  learning_curve_cv_folds_list = [10] * len(learners)
  else:
@@ -679,7 +681,7 @@ def parse_config_file(
  # floats (proportions). If it's not specified, then we just
  # assume that we are using np.linspace(0.1, 1.0, 5).
  learning_curve_train_sizes_string = config.get("Input", "learning_curve_train_sizes")
- learning_curve_train_sizes = yaml.safe_load(fix_json(learning_curve_train_sizes_string))
+ learning_curve_train_sizes = yaml.load(fix_json(learning_curve_train_sizes_string))
  if len(learning_curve_train_sizes) == 0:
  learning_curve_train_sizes = np.linspace(0.1, 1.0, 5).tolist()
  else:
@@ -698,9 +700,9 @@ def parse_config_file(
  # do we need to shuffle the training data
  do_shuffle = config.getboolean("Input", "shuffle")
 
- fixed_parameter_list = yaml.safe_load(fix_json(config.get("Input", "fixed_parameters")))
- fixed_sampler_parameters = yaml.safe_load(fix_json(config.get("Input", "sampler_parameters")))
- param_grid_list = yaml.safe_load(fix_json(config.get("Tuning", "param_grids")))
+ fixed_parameter_list = yaml.load(fix_json(config.get("Input", "fixed_parameters")))
+ fixed_sampler_parameters = yaml.load(fix_json(config.get("Input", "sampler_parameters")))
+ param_grid_list = yaml.load(fix_json(config.get("Tuning", "param_grids")))
 
  # read and normalize the value of `pos_label`
  pos_label_string = safe_float(config.get("Tuning", "pos_label"))
@@ -804,7 +806,8 @@ def parse_config_file(
 
  # Get class mapping dictionary if specified
  class_map_string = config.get("Input", "class_map")
- original_class_map = yaml.safe_load(fix_json(class_map_string))
+ yaml = YAML(typ="safe", pure=True)
+ original_class_map = yaml.load(fix_json(class_map_string))
  if original_class_map:
  # Change class_map to map from originals to replacements instead of
  # from replacement to list of originals

diff --git a/skll/config/utils.py b/skll/config/utils.py
@@ -13,7 +13,7 @@
 from pathlib import Path
 from typing import Iterable, List, Union
 
-import ruamel.yaml as yaml
+from ruamel.yaml import YAML
 
 from skll.types import FoldMapping, PathOrStr
 
@@ -186,7 +186,8 @@ def _parse_and_validate_metrics(metrics: str, option_name: str, logger=None) ->
 
  # make sure the given metrics data type is a list
  # and parse it correctly
- metrics = yaml.safe_load(fix_json(metrics))
+ yaml = YAML(typ="safe", pure=True)
+ metrics = yaml.load(fix_json(metrics))
  if not isinstance(metrics, list):
  raise TypeError(f"{option_name} should be a list, not a " f"{type(metrics)}.")
 

diff --git a/skll/experiments/output.py b/skll/experiments/output.py
@@ -22,8 +22,8 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-import ruamel.yaml as yaml
 import seaborn as sns
+from ruamel.yaml import YAML
 
 from skll.types import FoldMapping, PathOrStr
 from skll.utils.logging import get_skll_logger
@@ -638,6 +638,8 @@ def _write_summary_file(result_json_paths: List[str], output_file: IO[str], abla
  # Map from feature set names to all features in them
  all_features = defaultdict(set)
  logger = get_skll_logger("experiment")
+ yaml = YAML(typ="safe", pure=True)
+
  for json_path_str in result_json_paths:
  json_path = Path(json_path_str)
  if not json_path.exists():
@@ -654,7 +656,7 @@ def _write_summary_file(result_json_paths: List[str], output_file: IO[str], abla
  featureset_name = obj[0]["featureset_name"]
  if ablation != 0 and "_minus_" in featureset_name:
  parent_set = featureset_name.split("_minus_", 1)[0]
- all_features[parent_set].update(yaml.safe_load(obj[0]["featureset"]))
+ all_features[parent_set].update(yaml.load(obj[0]["featureset"]))
  learner_result_dicts.extend(obj)
 
  # Build and write header
@@ -670,9 +672,7 @@ def _write_summary_file(result_json_paths: List[str], output_file: IO[str], abla
  featureset_name = lrd["featureset_name"]
  if ablation != 0:
  parent_set = featureset_name.split("_minus_", 1)[0]
- ablated_features = all_features[parent_set].difference(
- yaml.safe_load(lrd["featureset"])
- )
+ ablated_features = all_features[parent_set].difference(yaml.load(lrd["featureset"]))
  lrd["ablated_features"] = ""
  if ablated_features:
  lrd["ablated_features"] = json.dumps(sorted(ablated_features))