Source code for assembledopenml.metaflow

from openml.runs import get_run as openml_get_run
from openml.flows import get_flow as openml_get_flow
from assembledopenml.util.data_fetching import fetch_arff_file_to_dataframe
from assembledopenml.util.common import make_equal
from typing import Union, List


[docs]class MetaFlow: def __init__(self, flow_id: int, description: Union[str, None], run_performance: Union[float, None], run_id: int, conf_tol: float = 0.01): """Internal Object of a Flow filled with (for us relevant) metadata about the flow and the run. Parameters ---------- flow_id : int The ID of the flow for which we want to build a metaflow. description: Union[str, None] The description (or name) of the flow. run_performance: Union[float, None] The performance of the run for the flow. run_id: int The id of run of the flow. conf_tol: float The tolerance for incorrect confidence-predictions. Determines in how far we think numerical precision is the reason for a wrong confidence value. """ self.flow_id = flow_id self.description = description self.run_performance = run_performance self.run_id = run_id self.conf_tol = conf_tol # -- init later self.predictions_url = None self.conf_prefix = None self.predictions = None self.confidences = None self.file_ground_truth = None self.name = None # -- flags self.file_ground_truth_corrupted = False self.confidences_corrupted = False self.confidences_fixed = False # -- Post Checks # Automatically fill description if it is None if self.description is None: tmp_f = openml_get_flow(self.flow_id) self.description = "{}({})".format(tmp_f.name, tmp_f.version) @property def is_bad_flow(self): return self.confidences_corrupted and (not self.confidences_fixed) @property def corruption_details(self): return { "file_ground_truth_corrupted": self.file_ground_truth_corrupted, "confidences_corrupted": self.confidences_corrupted, "confidences_fixed": self.confidences_fixed } def _parse_label_col(self, predictions_data): """Read Label Column from Predictions file""" known_label_names = ["correct", "truth"] # Find label column in data for label_name in known_label_names: if label_name in list(predictions_data): return label_name # The case if we did not return (= the case were no known label is in columns) raise RuntimeError("Unknown predictions-like file format. ", "Unable to parse label column for: {} ".format(self.predictions_url)) def _parse_conf_cols(self, predictions_data): # Get columns with confidence values known_confidence_prefixes = ["confidence."] confidence_cols = [] hit = False # Collect conf columns and prefix is in data for conf_prefix in known_confidence_prefixes: for col_name in list(predictions_data): if conf_prefix in col_name: confidence_cols.append(col_name) hit = True if hit: return conf_prefix, confidence_cols raise RuntimeError("Unknown predictions-like file format. ", "Unable to parse confidence columns for: {} ".format(self.predictions_url)) def _confidence_to_predictions(self): # Assumption: highest confidence equals prediction return self.confidences.idxmax(axis=1).apply(lambda x: x[len(self.conf_prefix):]) def _gather_wrong_confidences(self, conf_preds): wrong_confs_mask = conf_preds != self.predictions # Get relevant subsets pred_wrong = self.predictions[wrong_confs_mask] conf_pred_wrong = conf_preds[wrong_confs_mask] confs_wrong = self.confidences[wrong_confs_mask] # Values to check tol_equal = True not_equal_at_all_idx = [] equal_with_tol_idx = [] # Check relevant confidences for idx, pred, conf_pred in zip(pred_wrong.index, pred_wrong, conf_pred_wrong): conf_of_pred = confs_wrong.loc[idx, "{}{}".format(self.conf_prefix, pred)] conf_of_conf_pred = confs_wrong.loc[idx, "{}{}".format(self.conf_prefix, conf_pred)] # Check if equal if conf_of_conf_pred != conf_of_pred: # If not, check if equal within tolerance if not ((conf_of_pred - self.conf_tol) < conf_of_conf_pred < (conf_of_pred + self.conf_tol)): tol_equal = False # even with tolerance, the confidence are not equal not_equal_at_all_idx.append((idx, conf_of_pred)) else: equal_with_tol_idx.append((idx, conf_of_pred)) return tol_equal, not_equal_at_all_idx, equal_with_tol_idx def _validate_confidences(self): """ Check if the confidences accurately reflect the predictions Return True if valid, False if not """ # FIXME, might want to add a check here that all values sum up to 1 conf_preds = self._confidence_to_predictions() # Check if confidences are correct if conf_preds.equals(self.predictions): return # Class-predictions and confidence-predictions are not identical. self.confidences_corrupted = True # Gather data to check if we can fix the corrupted confidences tol_equal, not_equal_at_all_idx, equal_with_tol_idx = self._gather_wrong_confidences(conf_preds) # Check if it is fixable if tol_equal: # Predictions are not identical because the confidence is almost equal. # We assume this is an artifact from randomness in classifiers or precision and ignore it. # We, however, update the values to make them equal for later usage. # Make Data Equal for idx, conf_of_pred in equal_with_tol_idx: self.confidences.loc[idx] = make_equal(conf_of_pred, self.confidences.loc[idx]) self.confidences_fixed = True else: # ----- The Problematic Cases # -- Check if it is a model were probabilities/confidence score is not necessarily representative # of the prediction due to too small datasets or cross validation based proba-calculation. not_rep_conf = [ # small dataset or cross validation make confidence (proba) bad, # see https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html "classifier=sklearn.svm.classes.SVC)", "svc=sklearn.svm.classes.SVC)", "sklearn.svm._classes.SVC)", # Special Case for Adaboost, which can have inaccurate probability results for small datasets # see ,e.g., Adaboost https://stats.stackexchange.com/questions/329080/adaboost-probabilities "classifier=sklearn.ensemble.weight_boosting.AdaBoostClassifier(" ] if any(x in self.description for x in not_rep_conf): # The confidence for the predictions is not representative because of the model's rng. # To solve this, we equalize all too high confidences and the prediction's confidence to_make_equal = equal_with_tol_idx + not_equal_at_all_idx for idx, conf_of_pred in to_make_equal: self.confidences.loc[idx] = make_equal(conf_of_pred, self.confidences.loc[idx]) self.confidences_fixed = True else: # FIXME UNKNOWN REASON FOR BAD FLOW; MIGHT REQUIRE POSTPROCESSING; STORE FOR LATER BUT IGNORE FOR NOW # Potential Reasons # Data uploaded to OpenML is Wrong # Malicious content # Bugs in OpenML Backend or ML Tool Pipeline pass def _validate_predictions(self, class_labels): # -- Check Format of Pred col (make sure it has the same string style as the ground truth col) if set(self.predictions.unique().tolist()) - set(class_labels): # TODO handle this as a special case? raise ValueError("The Prediction Column for Flow {} has the wrong label-name format: {}".format( self.flow_id, self.predictions.unique().tolist()))
[docs] def get_predictions_data(self, class_labels: List[str]): """Fill the metaflow object with predictions data (and other relevant variables) Parameters ---------- class_labels: List[str] The names of each class label """ # -- Load Predictions file self.predictions_url = openml_get_run(self.run_id).predictions_url predictions_data, _ = fetch_arff_file_to_dataframe(self.predictions_url) # -- Parse Predictions data # FIXME we are assuming a default format here. If this is different, we can not work with it and it will crash. # - Parse y_true_col_name from data file y_true_col_name = self._parse_label_col(predictions_data) # - Get predictions in correct format (str decode) and in order of instances (sort_values + reset index) predictions_data = predictions_data.sort_values(by="row_id").reset_index() self.predictions = predictions_data["prediction"].str.decode("utf-8") self.file_ground_truth = predictions_data[y_true_col_name].str.decode("utf-8") self._validate_predictions(class_labels) # - Parse confidence columns self.conf_prefix, conf_cols = self._parse_conf_cols(predictions_data) if (len(conf_cols) % len(class_labels)) != 0: raise ValueError("Too few confidence columns found in predictions file: {} ".format(self.predictions_url), "Expected {} cols for each predictor. ".format(len(class_labels)), "Found at least 1 predictior with too few columns.") # - Get confidence values self.confidences = predictions_data[[self.conf_prefix + n for n in class_labels]].copy() self._validate_confidences()