Hello @eddiebergman, I want to use an SVM classifier with AutoSklearn. After a run time of 60 minutes, I get a DummyClassifier. Here is my implementation.
import os
import resource
import sys
# Get the directory path containing autosklearn
package_dir = os.path.abspath(os.path.join(os.path.dirname("Fair-AutoML"), "../.."))
# Add the directory to sys.path
sys.path.append(package_dir)
import autosklearn.pipeline.components.classification
import datetime
import pickle
import autosklearn.classification
import autosklearn.metrics
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import sklearn.metrics
from xgboost import XGBClassifier  # needed by the custom metric below
from autosklearn.upgrade.metric import (
disparate_impact,
statistical_parity_difference,
equal_opportunity_difference,
average_odds_difference,
)
from autosklearn.Fairea.fairea import create_baseline
from ConfigSpace import EqualsCondition, InCondition
from autosklearn.pipeline.implementations.util import softmax
from autosklearn.util.common import check_for_bool, check_none
train_list = "data_orig_train_adult.pkl"
test_list = "data_orig_test_adult.pkl"
def custom_preprocessing(df):
def group_race(x):
if x == "White":
return 1.0
else:
return 0.0
# Recode sex and race
df["sex"] = df["sex"].replace({"Female": 0.0, "Male": 1.0})
df["race"] = df["race"].apply(lambda x: group_race(x))
return df
############################################################################
# File Remover
# ============
now = str(datetime.datetime.now())[:19].replace(":", "_")
temp_path = "adult_svm_spd" + now
for stale in ("test_split.txt", "num_keys.txt", "beta.txt"):
    try:
        os.remove(stale)
    except FileNotFoundError:
        pass
# Recreate an empty beta cache.
open("beta.txt", "w").close()
############################################################################
# Data Loading
# ============
import pandas as pd
from aif360.datasets import StandardDataset
train = pd.read_pickle(train_list)
test = pd.read_pickle(test_list)
na_values = ["?"]
default_mappings = {
"label_maps": [{1.0: ">50K", 0.0: "<=50K"}],
"protected_attribute_maps": [
{1.0: "White", 0.0: "Non-white"},
{1.0: "Male", 0.0: "Female"},
],
}
data_orig_train = StandardDataset(
df=train,
label_name="income-per-year",
favorable_classes=[">50K", ">50K."],
protected_attribute_names=["sex"],
privileged_classes=[[1]],
instance_weights_name=None,
categorical_features=[
"workclass",
"education",
"marital-status",
"occupation",
"relationship",
"native-country",
],
features_to_keep=[],
features_to_drop=["income", "native-country", "hours-per-week"],
na_values=na_values,
custom_preprocessing=custom_preprocessing,
metadata=default_mappings,
)
data_orig_test = StandardDataset(
df=test,
label_name="income-per-year",
favorable_classes=[">50K", ">50K."],
protected_attribute_names=["sex"],
privileged_classes=[[1]],
instance_weights_name=None,
categorical_features=[
"workclass",
"education",
"marital-status",
"occupation",
"relationship",
"native-country",
],
features_to_keep=[],
features_to_drop=["income", "native-country", "hours-per-week"],
na_values=na_values,
custom_preprocessing=custom_preprocessing,
metadata=default_mappings,
)
privileged_groups = [{"sex": 1}]
unprivileged_groups = [{"sex": 0}]
X_train = data_orig_train.features
y_train = data_orig_train.labels.ravel()
X_test = data_orig_test.features
y_test = data_orig_test.labels.ravel()
from autosklearn.pipeline.components.base import AutoSklearnClassificationAlgorithm
from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
UniformFloatHyperparameter,
CategoricalHyperparameter,
UniformIntegerHyperparameter,
UnParametrizedHyperparameter,
)
from autosklearn.pipeline.constants import (
DENSE,
SPARSE,
SIGNED_DATA,
UNSIGNED_DATA,
PREDICTIONS,
)
# class CustomSVM(AutoSklearnClassificationAlgorithm):
# def __init__(
# self,
# C,
# kernel,
# gamma,
# shrinking,
# probability=True,
# random_state=None,
# ):
# self.C = C
# self.kernel = kernel
# self.gamma = gamma
# self.shrinking = shrinking
# self.probability = probability
# self.random_state = random_state
#
# def fit(self, X, y):
# from sklearn.svm import SVC
#
# self.estimator = SVC(
# C=self.C,
# kernel=self.kernel,
# gamma=self.gamma,
# shrinking=self.shrinking,
# probability=self.probability,
# random_state=self.random_state,
# )
# self.estimator.fit(X, y)
# return self
#
# def predict(self, X):
# if self.estimator is None:
# raise NotImplementedError()
# return self.estimator.predict(X)
#
# def predict_proba(self, X):
# if self.estimator is None:
# raise NotImplementedError()
# return self.estimator.predict_proba(X)
#
# @staticmethod
# def get_properties(dataset_properties=None):
# return {
# "shortname": "SVM",
# "name": "Support Vector Classifier",
# "handles_regression": False,
# "handles_classification": True,
# "handles_multiclass": True,
# "handles_multilabel": False,
# "handles_multioutput": False,
# "is_deterministic": True,
# "input": [DENSE, SPARSE, SIGNED_DATA, UNSIGNED_DATA],
# "output": [PREDICTIONS],
# }
#
# @staticmethod
# def get_hyperparameter_search_space(dataset_properties=None):
# cs = ConfigurationSpace()
#
# C = UniformFloatHyperparameter("C", lower=0.03125, upper=32768.0, log=True, default_value=1.0)
# kernel = CategoricalHyperparameter("kernel", ["linear", "rbf", "poly", "sigmoid"], default_value="rbf")
# gamma = UniformFloatHyperparameter("gamma", lower=3.0517578125e-05, upper=8, log=True, default_value=0.1)
# shrinking = CategoricalHyperparameter("shrinking", [True, False], default_value=True)
#
# cs.add_hyperparameters([C, kernel, gamma, shrinking])
# return cs
class CustomSVC(AutoSklearnClassificationAlgorithm):
def __init__(
self,
C,
kernel,
gamma,
shrinking,
tol,
max_iter,
class_weight=None,
degree=3,
coef0=0,
random_state=42,
):
self.C = C
self.kernel = kernel
self.degree = degree
self.gamma = gamma
self.coef0 = coef0
self.shrinking = shrinking
self.tol = tol
self.class_weight = class_weight
self.max_iter = max_iter
self.random_state = random_state
self.estimator = None
def fit(self, X, Y):
import sklearn.svm
        # Compute the kernel cache size (in MB) for sklearn's LibSVM: 2/3 of
        # the available memory, where available memory is the process memory
        # limit minus the memory already in use.
try:
# Retrieve memory limits imposed on the process
soft, hard = resource.getrlimit(resource.RLIMIT_AS)
if soft > 0:
# Convert limit to units of megabytes
soft /= 1024 * 1024
# Retrieve memory used by this process
maxrss = resource.getrusage(resource.RUSAGE_SELF)[2] / 1024
            # On macOS, resource.getrusage reports MaxRSS in bytes; on other
            # platforms it is in kilobytes.
            if sys.platform == "darwin":
                maxrss = maxrss / 1024
cache_size = (soft - maxrss) / 1.5
if cache_size < 0:
cache_size = 200
else:
cache_size = 200
except Exception:
cache_size = 200
self.C = float(self.C)
if self.degree is None:
self.degree = 3
else:
self.degree = int(self.degree)
if self.gamma is None:
self.gamma = 0.0
else:
self.gamma = float(self.gamma)
if self.coef0 is None:
self.coef0 = 0.0
else:
self.coef0 = float(self.coef0)
self.tol = float(self.tol)
self.max_iter = float(self.max_iter)
self.shrinking = check_for_bool(self.shrinking)
if check_none(self.class_weight):
self.class_weight = None
self.estimator = sklearn.svm.SVC(
C=self.C,
kernel=self.kernel,
degree=self.degree,
gamma=self.gamma,
coef0=self.coef0,
shrinking=self.shrinking,
tol=self.tol,
class_weight=self.class_weight,
max_iter=self.max_iter,
random_state=self.random_state,
cache_size=cache_size,
decision_function_shape="ovr",
)
self.estimator.fit(X, Y)
return self
    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)
def predict_proba(self, X):
if self.estimator is None:
raise NotImplementedError()
decision = self.estimator.decision_function(X)
return softmax(decision)
@staticmethod
def get_properties(dataset_properties=None):
return {
"shortname": "LibSVM-SVC",
"name": "LibSVM Support Vector Classification",
"handles_regression": False,
"handles_classification": True,
"handles_multiclass": True,
"handles_multilabel": False,
"handles_multioutput": False,
"is_deterministic": True,
"input": (DENSE, SPARSE, UNSIGNED_DATA),
"output": (PREDICTIONS,),
}
# SVC(C=0.85, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
# decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
# max_iter=-1, random_state=42, shrinking=True, tol=0.001, probability=True,
# verbose=False)
@staticmethod
def get_hyperparameter_search_space(dataset_properties=None):
C = UniformFloatHyperparameter("C", 0.03175, 8186, log=True, default_value=0.85)
# No linear kernel here, because we have liblinear
kernel = CategoricalHyperparameter(
name="kernel", choices=["rbf", "poly", "sigmoid"], default_value="rbf"
)
degree = UniformIntegerHyperparameter("degree", 3, 4, default_value=3)
gamma = UniformFloatHyperparameter(
"gamma", 3.3853109172452256e-05, 1.56664, log=True, default_value=0.1
)
# TODO this is totally ad-hoc
coef0 = UniformFloatHyperparameter(
"coef0", -0.12737, 0.67005, default_value=0.0
)
        # probability is not a hyperparameter, but an argument to the SVM algorithm
shrinking = CategoricalHyperparameter(
"shrinking", ["True", "False"], default_value="True"
)
tol = UniformFloatHyperparameter(
"tol", 1.0380433896731804e-05, 0.02999, default_value=1e-3, log=True
)
# cache size is not a hyperparameter, but an argument to the program!
max_iter = UnParametrizedHyperparameter("max_iter", -1)
cs = ConfigurationSpace()
cs.add_hyperparameters(
[C, kernel, degree, gamma, coef0, shrinking, tol, max_iter]
)
degree_depends_on_poly = EqualsCondition(degree, kernel, "poly")
coef0_condition = InCondition(coef0, kernel, ["poly", "sigmoid"])
cs.add_condition(degree_depends_on_poly)
cs.add_condition(coef0_condition)
return cs
autosklearn.pipeline.components.classification.add_classifier(CustomSVC)
cs = CustomSVC.get_hyperparameter_search_space()
print(cs)
############################################################################
# Custom metrics definition
# =========================
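# The scorer below blends a fairness metric with the error rate:
#     score = beta * fairness_metrics[metric_id] + (1 - beta) * error
# where metric_id = 2 selects statistical parity difference, and beta is
# derived once from a Fairea baseline and then cached in beta.txt.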
def accuracy(solution, prediction):
metric_id = 2
protected_attr = "sex"
    with open("test_split.txt") as f:
        lines = f.read().splitlines()
    # Recover the indices of the current evaluation split from the last line.
    split = [int(idx) for idx in lines[-1].split(",")]
subset_data_orig_train = data_orig_train.subset(split)
if os.stat("beta.txt").st_size == 0:
        default = XGBClassifier(
            learning_rate=0.35,
            n_estimators=200,
            max_depth=6,
            subsample=1,
            min_child_weight=1,
        )
degrees = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
mutation_strategies = {"0": [1, 0], "1": [0, 1]}
dataset_orig = subset_data_orig_train
res = create_baseline(
default,
dataset_orig,
privileged_groups,
unprivileged_groups,
data_splits=10,
repetitions=10,
odds=mutation_strategies,
options=[0, 1],
degrees=degrees,
)
acc0 = np.array(
[np.mean([row[0] for row in res["0"][degree]]) for degree in degrees]
)
acc1 = np.array(
[np.mean([row[0] for row in res["1"][degree]]) for degree in degrees]
)
fair0 = np.array(
[
np.mean([row[metric_id] for row in res["0"][degree]])
for degree in degrees
]
)
fair1 = np.array(
[
np.mean([row[metric_id] for row in res["1"][degree]])
for degree in degrees
]
)
if min(acc0) > min(acc1):
beta = (max(acc0) - min(acc0)) / (max(acc0) - min(acc0) + max(fair0))
else:
beta = (max(acc1) - min(acc1)) / (max(acc1) - min(acc1) + max(fair1))
        with open("beta.txt", "w") as f:
            f.write(str(beta))
    else:
        with open("beta.txt") as f:
            beta = float(f.read())
    # Nudge the weight toward the fairness term, capped at 1.0.
    beta = min(beta + 0.2, 1.0)
    try:
        with open("num_keys.txt") as f:
            num_keys = sum(1 for _ in f)
        print(num_keys)
        # Every 10 evaluated configurations, shift 0.05 of the weight back
        # toward accuracy and clear the cached ensemble losses.
        beta -= 0.050 * (num_keys // 10)
        if beta < 0.0:
            beta = 0.0
        if num_keys % 10 == 0:
            os.remove(temp_path + "/.auto-sklearn/ensemble_read_losses.pkl")
    except FileNotFoundError:
        pass
fairness_metrics = [
1 - np.mean(solution == prediction),
disparate_impact(subset_data_orig_train, prediction, protected_attr),
statistical_parity_difference(
subset_data_orig_train, prediction, protected_attr
),
equal_opportunity_difference(
subset_data_orig_train, prediction, solution, protected_attr
),
average_odds_difference(
subset_data_orig_train, prediction, solution, protected_attr
),
]
print(
fairness_metrics[metric_id],
1 - np.mean(solution == prediction),
fairness_metrics[metric_id] * beta
+ (1 - np.mean(solution == prediction)) * (1 - beta),
beta,
)
return fairness_metrics[metric_id] * beta + (
1 - np.mean(solution == prediction)
) * (1 - beta)
############################################################################
# Wrap the custom metric as a scorer
# ==================================
print("#" * 80)
print("Use self defined accuracy metric")
accuracy_scorer = autosklearn.metrics.make_scorer(
name="fair+acc",
score_func=accuracy,
optimum=1,
greater_is_better=False,
needs_proba=False,
needs_threshold=False,
)
############################################################################
# Build and fit a classifier
# ==========================
automl = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=60 * 60,
memory_limit=10000000,
include_estimators=["CustomSVC"],
ensemble_size=1,
# include_preprocessors=[
# "select_percentile_classification",
# "extra_trees_preproc_for_classification",
# "select_rates_classification",
# ],
tmp_folder=temp_path,
delete_tmp_folder_after_terminate=False,
metric=accuracy_scorer,
)
automl.fit(X_train, y_train)
print(automl.show_models())
cs = automl.get_configuration_space(X_train, y_train)
with open("adult_spd_spd_60sp" + now + ".pkl", "wb") as a_file:
    pickle.dump(automl.cv_results_, a_file)
with open("automl_adult_spd_spd_60sp" + now + ".pkl", "wb") as a_file1:
    pickle.dump(automl, a_file1)
predictions = automl.predict(X_test)
print(predictions)
print(y_test, len(predictions))
print("SPD-Accuracy score:", sklearn.metrics.accuracy_score(y_test, predictions))
print(disparate_impact(data_orig_test, predictions, "sex"))
print(statistical_parity_difference(data_orig_test, predictions, "sex"))
print(equal_opportunity_difference(data_orig_test, predictions, y_test, "sex"))
print(average_odds_difference(data_orig_test, predictions, y_test, "sex"))
from sklearn.metrics import precision_score, recall_score, f1_score
print("Precision:", precision_score(y_test, predictions))
print("Recall:", recall_score(y_test, predictions))
print("F1 score:", f1_score(y_test, predictions))
import json
from utils.file_ops import write_file
from utils.run_history import _get_run_history
write_file(
"./run_history/adult_svm_spd_sex_run_history.json",
json.dumps(_get_run_history(automl_model=automl), indent=4),
)
There are two implementations of SVC above: the commented-out CustomSVM and the registered CustomSVC. With either one, auto-sklearn is unable to find a model for the SVC classifier, so it returns a DummyClassifier. I am, however, able to view the run_history.
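One way to check why only the dummy model survives is auto-sklearn's built-in diagnostics (a minimal sketch, assuming the automl object from the script above; sprint_statistics() and leaderboard() are part of the public AutoSklearnClassifier API):

# Summarize how many target algorithm runs succeeded, crashed,
# or hit the time/memory limit during the search.
print(automl.sprint_statistics())
# List the models that made it into the final ensemble, if any.
print(automl.leaderboard())

If every run is reported as crashed or as exceeding a resource limit, falling back to DummyClassifier is the expected behavior.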