Commit d81da020 authored by Antoine Guillaume's avatar Antoine Guillaume

Applying scikit-learn standards to custom wrappers

parent 0569d929
......@@ -31,8 +31,8 @@ from sklearn.preprocessing import MinMaxScaler
# In[3]:
#Base path, all necessary folders are supposed to be contained in this one.
base_path = r"!! REPLACE BY YOUR PATH !!"
#base_path = r"!! REPLACE BY YOUR PATH !!"
base_path = r"C:/Utilisateurs/A694772/Documents/ECMLPKDD_datacopy/"
#Path to the life cycles CSV files.
dataset_path = base_path+r"datasets/"
......@@ -40,7 +40,8 @@ dataset_path = base_path+r"datasets/"
result_path = base_path+r"results/"
#If not None, CSV files containing the data used by the TS-CHIEF Java program will be output
TSCHIEF_path = dataset_path+r"TSCHIEF/"
#TSCHIEF_path = dataset_path+r"TSCHIEF/"
TSCHIEF_path = None
#If True, perform cross validation of all defined pipelines
do_cross_validation = True
......@@ -58,14 +59,14 @@ predictive_padding_hours = 48
extended_infected_interval_hours = 24
#Size of the PAA transform output
#Number of cross validation splits
# Number of processes to launch in parallel for cross validation of each pipeline.
# Set to None if you don't have the setup to allow such speedups.
if dataset_path is not None and not exists(dataset_path):
......@@ -34,8 +34,6 @@ Configuration parameters are located at the beginning of CV_script, you MUST cha
To change or check the algorithms' parameters, look at the custom wrapper classes: all parameters are redefined there to avoid errors. If a parameter is not specified in the constructor, it is left at its default value.
The representation methods are defined inside utils.representations and the classification methods inside utils.classifications.
ResNet is left commented in the code, so you can run the other algorithms without a Tensorflow installation or a GPU without any impact.
## Usage
Extract the files of the dataset archive located in ~/datasets in the dataset folder
......@@ -58,6 +56,25 @@ The runtime of this script is extremely long, one iteration take about 4 hours,
## Note on using sktime-dl for InceptionTime and ResNet
Both InceptionTime and ResNet are left commented out in the code, so you can run the other algorithms without a Tensorflow installation or a GPU without any impact.
Depending on your installation, you might run into errors while feeding tensorflow models into a scikit-learn cross validation pipeline. Some of those issues can be fixed by making the wrappers for those models, defined in utils.classifications, inherit from the KerasClassifier wrapper provided by tensorflow.
To make those two algorithms part of the experiments, you have to uncomment both their declarations in utils.classifications and the associated pipelines in CV_script.
About InceptionTime: sktime-dl is the package dedicated to deep learning built by the sktime authors. As it was still in active development at the time of writing, we had to make some modifications to its source code to be able to run InceptionTime.
Starting from the latest version available on GitHub, we applied the following modifications:
* Fix import error from sktime utils : In sktime_dl/utils/, replace :
from sktime.utils.data_container import tabularize, from_3d_numpy_to_nested ( line 6)
from sktime.utils.data_container import tabularize, from_3d_numpy_to_nested ( line 6)
* We also modified InceptionTime to use binary_crossentropy (changing the loss name and using a sigmoid layer with 1 neuron as the output) and weighted accuracy for early stopping. This is not mandatory but is better suited to our problem.
## Contributing
......@@ -10,18 +10,16 @@ from sklearn.linear_model import RidgeClassifier
from sktime.classification.interval_based import TimeSeriesForest
from sktime.utils.data_container import concat_nested_arrays as cna
from sktime.classification.frequency_based import RandomIntervalSpectralForest
from sklearn.base import BaseEstimator, ClassifierMixin
# # Define classes for classification methods
# # Define classes for representation methods
# Here we define custom classes when necessary for the representation methods we will use inside pipelines during cross validation.
# Here we define custom classes when necessary for the classification methods we will use inside pipelines during cross validation.
# See the corresponding modules' documentation for details.
# Pyts :
# MatrixProfile :
# sktime :
# sklearn :
......@@ -31,10 +29,10 @@ from sktime.classification.frequency_based import RandomIntervalSpectralForest
# In[10]:
#This section is left commented so you have no trouble running the script without Tensorflow/GPU
#If you have error during cross validation, you can try to make the class ResNetV2
# This section is left commented so you have no trouble running the script without Tensorflow/GPU
# While using ResNet, if you encounter errors during cross validation, you can try to make the class ResNetV2
# inherit from the tensorflow.keras KerasClassifier wrapper; this can fix some issues.
# Don't forget to uncomment pipelines in CV_scripts aswell.
# Don't forget to uncomment the pipelines using ResNet in CV_script as well.
from tensorflow.keras.applications import ResNet50V2
from tensorflow.keras.optimizers import Adam
......@@ -42,7 +40,8 @@ from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils.class_weight import compute_class_weight
class ResNetV2:
class ResNetV2(BaseEstimator, ClassifierMixin):
def __init__(self, loss='binary_crossentropy', pooling='avg', optimizer=Adam(lr=1e-4)):
self.loss = loss
self.pooling = pooling
......@@ -65,9 +64,9 @@ class ResNetV2:
self.init_model((X.shape[1], X.shape[2], X.shape[3]))
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_size)
el = EarlyStopping(monitor='val_accuracy', patience=el_patience, restore_best_weights=True, mode='max')
cw = compute_class_weight('balanced',np.unique(y_train), y_train)
cw = compute_class_weight('balanced', np.unique(y_train), y_train)
history =
X_train, y_train,
......@@ -77,35 +76,73 @@ class ResNetV2:
if return_hist:
return history
return self
def predict(self, X):
return np.array([x>0.5 for x in self.model.predict(X)]).astype(int)
def predict_proba(self,X):
return self.model.predict(X)
from sktime_dl.deeplearning.inceptiontime._classifier import InceptionTimeClassifier
class RISE:
def __init__(self, min_length=5, n_estimators=300):
self.estimator = RandomIntervalSpectralForest(n_estimators=n_estimators, min_interval=min_length)
class InceptionTime(BaseEstimator, ClassifierMixin):
def __init__(self, depth=18, nb_filters=32, bottleneck_size=32):
self.model = None
self.depth = depth
self.nb_filters = nb_filters
self.bottleneck_size = bottleneck_size
def fit(self, X, y, epochs=1500, batch_size=32,
el_patience=100, verbose=False, val_size=0.1):
self.model = InceptionTimeClassifier(verbose=verbose, depth=self.depth,
nb_filters=self.nb_filters, bottleneck_size=self.bottleneck_size,
callbacks=[EarlyStopping(monitor='val_accuracy', patience=el_patience,
restore_best_weights=True, mode='max')])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_size), y_train, validation_X=X_val,validation_y=y_val)
return self
def predict(self, X):
return np.array([x>0.5 for x in self.model.predict(X)]).astype(int)
def predict_proba(self,X):
return self.model.predict(X)
class SktimeEstimator:
def _sktime_format(self,X,y):
# X : (n_instance, n_timestamp, n_features)
X, y = cna(X.reshape(X.shape[2],X.shape[0],X.shape[1])), np.asarray(y)
X, y = self._sktime_format_X(X), np.asarray(y)
return X, y
def set_params(self, **parameters):
return self
def _sktime_format_X(self,X):
# X : (n_instance, n_timestamp, n_features)
return cna(X.reshape(X.shape[2],X.shape[0],X.shape[1]))
class PytsEstimator:
    """Input-formatting helpers shared by the pyts-based classifier wrappers.

    Intended to be used as a mixin next to ``BaseEstimator`` and
    ``ClassifierMixin``; it holds no state of its own.
    """

    def _format(self, X, y):
        """Return ``X`` reshaped to 2D together with ``y`` as a numpy array.

        Parameters
        ----------
        X : numpy array
            Input samples; see ``_format_X`` for the expected layout.
        y : array-like
            Target labels.

        Returns
        -------
        tuple
            ``(X_2d, y_array)`` where ``X_2d`` is the result of
            ``_format_X(X)`` and ``y_array`` is ``np.asarray(y)``.
        """
        return self._format_X(X), np.asarray(y)

    def _format_X(self, X):
        """Reshape ``X`` to ``(X.shape[0], X.shape[1])``.

        The reshape only succeeds when any dimension beyond the first two
        has size 1.
        NOTE(review): presumably X is (n_instance, n_timestamp, 1) -- confirm
        against the calling pipelines.
        """
        return X.reshape(X.shape[0], X.shape[1])
class RISE(BaseEstimator, ClassifierMixin, SktimeEstimator):
def __init__(self, min_length=5, n_estimators=300):
self.min_length = min_length
self.n_estimators = n_estimators
self.estimator = None
def fit(self,X,y):
X, y = self._sktime_format(X,y)
self.estimator = RandomIntervalSpectralForest(n_estimators=self.n_estimators, min_interval=self.min_length),y)
return self
def predict(self,X):
X = self._sktime_format_X(X)
......@@ -115,19 +152,26 @@ class RISE:
X = self._sktime_format(X)
return self.estimator.predict_proba(X)
class Random_Forest:
class Random_Forest(BaseEstimator, ClassifierMixin):
def __init__(self, n_estimators=300, max_depth=None, max_features=0.75, max_samples=0.75,
ccp_alpha=0.0225, class_weight="balanced_subsample"):
self.estimator = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
max_features=max_features, max_samples=max_samples,
self.estimator = None
def set_params(self, **params):
return self.estimator.set_params(**params)
def fit(self,X,y):
def fit(self, X, y):
X = np.asarray([x.astype(np.float32) for x in X])
self.estimator = RandomForestClassifier(n_estimators=self.n_estimators, max_depth=self.max_depth,
max_features=self.max_features, max_samples=self.max_samples,
return self
def predict(self,X):
X = np.asarray([x.astype(np.float32) for x in X])
......@@ -137,15 +181,17 @@ class Random_Forest:
X = np.asarray([x.astype(np.float32) for x in X])
return self.estimator.predict_proba(X)
class KNN_classif:
class KNN_classif(BaseEstimator, ClassifierMixin):
def __init__(self, n_neighbors=9, weights='distance',p=2):
self.estimator = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, p=p)
def set_params(self, **params):
return self.estimator.set_params(**params)
self.n_neighbors = n_neighbors
self.weights = weights
self.p = p
self.estimator = None
def fit(self,X,y):
def fit(self,X,y):
self.estimator = KNeighborsClassifier(n_neighbors=self.n_neighbors, weights=self.weights, p=self.p),y)
return self
def predict(self,X):
return self.estimator.predict(X)
......@@ -153,27 +199,19 @@ class KNN_classif:
def predict_proba(self,X):
return self.estimator.predict_proba(X)
class TimeSeries_Forest:
class TimeSeries_Forest(BaseEstimator, ClassifierMixin, SktimeEstimator):
def __init__(self, n_estimators=300, min_interval=3):
self.estimator = TimeSeriesForest(n_estimators=n_estimators,
def set_params(self, **params):
return self.estimator.set_params(**params)
def _sktime_format(self,X,y):
# X : (n_instance, n_timestamp, n_features)
X, y = cna(X.reshape(X.shape[2],X.shape[0],X.shape[1])), np.asarray(y)
return X, y
def _sktime_format_X(self,X):
# X : (n_instance, n_timestamp, n_features)
return cna(X.reshape(X.shape[2],X.shape[0],X.shape[1]))
self.n_estimators = n_estimators
self.min_interval = min_interval
self.estimator = None
def fit(self,X,y):
X, y = self._sktime_format(X,y)
self.estimator = TimeSeriesForest(n_estimators=self.n_estimators,
return self
def predict(self,X):
X = self._sktime_format_X(X)
......@@ -184,16 +222,23 @@ class TimeSeries_Forest:
return self.estimator.predict_proba(X)
class SVM_classif:
def __init__(self, C=10, kernel='rbf', degree=2, gamma='scale'):
self.estimator = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma,
cache_size=500, class_weight='balanced')
def set_params(self, **params):
return self.estimator.set_params(**params)
class SVM_classif(BaseEstimator, ClassifierMixin):
def __init__(self, C=10, kernel='rbf', degree=2, gamma='scale',
cache_size=500, class_weight='balanced'):
self.C = C
self.kernel = kernel = degree
self.gamma = gamma
self.cache_size = cache_size
self.class_weight = class_weight
self.estimator = None
def fit(self,X,y):
def fit(self,X,y):
self.estimator = SVC(C=self.C, kernel=self.kernel,,
gamma=self.gamma, cache_size=self.cache_size,
return self
def predict(self,X):
return self.estimator.predict(X)
......@@ -201,17 +246,26 @@ class SVM_classif:
def predict_proba(self,X):
return self.estimator.predict_proba(X)
class Ridge_classif:
class Ridge_classif(BaseEstimator, ClassifierMixin):
def __init__(self, alpha=10.0, normalize=False, copy_X=True, max_iter=None, tol=0.001,
self.estimator = RidgeClassifier(alpha=alpha, normalize=normalize, copy_X=copy_X,
max_iter=max_iter, tol=tol, class_weight=class_weight)
self.alpha = alpha
self.normalize = normalize
self.copy_X = copy_X
self.max_iter = max_iter
self.tol = tol
self.class_weight = class_weight
self.estimator = None
def set_params(self, **params):
return self.estimator.set_params(**params)
def fit(self,X,y):
def fit(self,X,y):
self.estimator = RidgeClassifier(alpha=self.alpha, normalize=self.normalize,
copy_X=self.copy_X, max_iter=self.max_iter,
tol=self.tol, class_weight=self.class_weight),y)
return self
def predict(self,X):
return self.estimator.predict(X)
......@@ -219,22 +273,19 @@ class Ridge_classif:
def predict_proba(self,X):
return self.estimator.predict_proba(X)
class KNN_TS_classif:
class KNN_TS_classif(BaseEstimator, ClassifierMixin, PytsEstimator):
def __init__(self, n_neighbors=9, weights='distance', p=2):
self.estimator = KNeighborsClassifierTS(n_neighbors=n_neighbors, weights=weights, p=p)
def _format(self,X,y):
return X.reshape(X.shape[0],X.shape[1]), np.asarray(y)
def set_params(self, **params):
return self.estimator.set_params(**params)
def _format_X(self,X):
return X.reshape(X.shape[0],X.shape[1])
self.n_neighbors = n_neighbors
self.weights = weights
self.p = p
self.estimator = None
def fit(self,X,y):
X, y = self._format(X,y)
self.estimator = KNeighborsClassifierTS(n_neighbors=self.n_neighbors,
weights=self.weights, p=self.p),y)
return self
def predict(self,X):
X = self._format_X(X)
......@@ -245,29 +296,32 @@ class KNN_TS_classif:
return self.estimator.predict_proba(X)
class BOSSVS_classif:
class BOSSVS_classif(BaseEstimator, ClassifierMixin, PytsEstimator):
def __init__(self, word_size=9, n_bins=7, window_size=0.2, window_step=1,
anova=True, drop_sum=False, norm_mean=False, norm_std=False,
strategy='uniform', alphabet=None):
self.estimator = BOSSVS(word_size=word_size, n_bins=n_bins,
window_size=window_size, window_step=window_step,
anova=anova, drop_sum=drop_sum,
norm_mean=norm_mean, norm_std=norm_std,
strategy=strategy, alphabet=alphabet)
def set_params(self, **params):
return self.estimator.set_params(**params)
self.word_size = word_size
self.n_bins = n_bins
self.window_size = window_size
self.window_step = window_step
self.anova = anova
self.drop_sum = drop_sum
self.norm_mean = norm_mean
self.norm_std = norm_std
self.strategy = strategy
self.alphabet = alphabet
self.estimator = None
def _format(self,X,y):
# X : (n_instance, n_timestamp, n_features)
return X.reshape(X.shape[0],X.shape[1]), np.asarray(y)
def _format_X(self,X):
# X : (n_instance, n_timestamp, n_features)
return X.reshape(X.shape[0],X.shape[1])
def fit(self,X,y):
def fit(self,X,y):
X, y = self._format(X,y)
self.estimator = BOSSVS(word_size=self.word_size, n_bins=self.n_bins,
window_size=self.window_size, window_step=self.window_step,
anova=self.anova, drop_sum=self.drop_sum,
norm_mean=self.norm_mean, norm_std=self.norm_std,
strategy=self.strategy, alphabet=self.alphabet),y)
return self
def predict(self,X):
X = self._format_X(X)
......@@ -9,6 +9,7 @@ from pyts.approximation import PiecewiseAggregateApproximation, SymbolicAggregat
from pyts.transformation import ROCKET
from matplotlib import pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
# # Define classes for representation methods
# Here we define custom classes when necessary for the representation methods we will use inside pipelines during cross validation.
......@@ -18,25 +19,21 @@ from matplotlib import pyplot as plt
# Pyts :
# MatrixProfile :
# sktime :
# sklearn :
# In[2]:
#Gramian natively uses PAA, Recurrence doesn't,
#that's why you'll see calls to PAA inside the Recurrence class but not in the Gramian one
class Gramian_transform:
class Gramian_transform(BaseEstimator, TransformerMixin):
def __init__(self, img_size=128, flatten=False, method='s'):
self.img_size = img_size
self.flatten = flatten
self.method = method
self.cmap = plt.get_cmap('jet')
self.transformer = GramianAngularField(image_size=img_size,
def transform(self,X):
self.transformer = None
def transform(self, X, y=None):
if type(X[0]) == pd.core.series.Series:
X = np.asarray([x.values for x in X])
......@@ -48,24 +45,22 @@ class Gramian_transform:
X = self.cmap(X)[:,:,:,:,0:3].reshape(X.shape[0],self.img_size, self.img_size,3)
return X
def set_params(self, **params):
return self.transformer.set_params(**params)
def fit(self, X, y=None):
self.transformer = GramianAngularField(image_size=self.img_size,
return self
def fit_transform(self,X,y):
return self.transform(X)
class Recurrence_transform:
class Recurrence_transform(BaseEstimator, TransformerMixin):
def __init__(self, output_size=128, dimension=1, time_delay=6, flatten=False):
self.output_size = output_size
self.dimension = dimension
self.time_delay = time_delay
self.cmap = plt.get_cmap('jet')
self.approximator = PiecewiseAggregateApproximation(output_size=output_size,
self.transformer = RecurrencePlot(dimension=dimension,
def transform(self,X):
self.transformer = None
def transform(self, X, y=None):
if type(X[0]) == pd.core.series.Series:
X = np.asarray([x.values for x in X])
......@@ -78,27 +73,23 @@ class Recurrence_transform:
X = self.cmap(X)[:,:,:,:,0:3].reshape(X.shape[0],self.output_size, self.output_size,3)
return X
def set_params(self, **parameters):
for parameter, value in parameters.items():
if parameter == 'output_size':
self.approximator.set_params(**{parameter: value})
setattr(self, parameter, value)
elif parameter in ['dimension','time_delay']:
self.transformer.set_params(**{parameter: value})
setattr(self, parameter, value)
def fit(self, X, y=None):
self.approximator = PiecewiseAggregateApproximation(output_size=self.output_size,
self.transformer = RecurrencePlot(dimension=self.dimension,
return self
def fit_transform(self,X,y):
return self.transform(X)
class PiecewiseApproximation_transform:
class PiecewiseApproximation_transform(BaseEstimator, TransformerMixin):
def __init__(self, output_size=1000, overlapping=False, window_size=None):
self.output_size = output_size
self.transformer = PiecewiseAggregateApproximation(output_size=output_size,
def transform(self,X):
self.overlapping = overlapping
self.window_size = window_size
self.transformer = None
def transform(self, X, y=None):
if type(X[0]) == pd.core.series.Series:
X = np.asarray([x.values for x in X])
......@@ -106,77 +97,79 @@ class PiecewiseApproximation_transform:
X = X.reshape(X.shape[0], X.shape[2], X.shape[1])
return X
def set_params(self, **params):
return self.transformer.set_params(**params)
def fit_transform(self,X,y):
return self.transform(X)
def fit(self, X, y=None):
self.transformer = PiecewiseAggregateApproximation(output_size=self.output_size,
return self
class SymbolicAggregate_transform:
class SymbolicAggregate_transform(BaseEstimator, TransformerMixin):
def __init__(self, n_bins=7, strategy='uniform', alphabet='ordinal'):
self.transformer = SymbolicAggregateApproximation(n_bins=n_bins, strategy=strategy,
self.n_bins = n_bins
self.strategy = strategy
self.alphabet = alphabet
self.transformer = None
def set_params(self, **params):
return self.transformer.set_params(**params)
def transform(self, X):
def transform(self, X, y=None):
X = np.asarray([self.transformer.transform(x.reshape(1,-1)).astype(float) if np.max(x) - np.min(x) != 0 else np.zeros((1,x.shape[0])) for x in X])
X = X.reshape(X.shape[0], X.shape[2], X.shape[1])
return X
def fit_transform(self,X,y):
return self.transform(X)
class SymbolicFourrier_transform:
def fit(self, X, y=None):
self.transformer = SymbolicAggregateApproximation(n_bins=self.n_bins,
return self
class SymbolicFourrier_transform(BaseEstimator, TransformerMixin):
def __init__(self, n_coefs=20, n_bins=7, strategy='uniform', drop_sum=False,
anova=True, norm_mean=True, norm_std=False, alphabet='ordinal'):
self.transformer = SymbolicFourierApproximation(n_coefs=n_coefs, n_bins=n_bins,
strategy=strategy, alphabet=alphabet,
drop_sum=drop_sum, anova=anova,
norm_mean=norm_mean, norm_std=norm_std)
def transform(self,X):
self.n_coefs = n_coefs
self.n_bins = n_bins
self.strategy = strategy
self.alphabet = alphabet
self.drop_sum = drop_sum
self.anova = anova
self.norm_mean = norm_mean
self.norm_std = norm_std
self.transformer = None
def transform(self, X, y=None):
X = np.asarray([self.transformer.transform(x.reshape(1,-1)).astype(float) if np.max(x) - np.min(x) != 0 else np.zeros((1,x.shape[0])) for x in X])
X = X.reshape(X.shape[0], X.shape[2], X.shape[1])
return X
def set_params(self, **params):
return self.transformer.set_params(**params)
def fit_transform(self,X,y):
X = X.reshape(X.shape[0],X.shape[1]),y)
return self.transform(X)
def fit(self, X, y=None):
self.transformer = SymbolicFourierApproximation(n_coefs=self.n_coefs, n_bins=self.n_bins,
strategy=self.strategy, alphabet=self.alphabet,
drop_sum=self.drop_sum, anova=self.anova,
norm_mean=self.norm_mean, norm_std=self.norm_std)
return self
class MatrixProfile_transform:
class MatrixProfile_transform(BaseEstimator, TransformerMixin):
def __init__(self, window_size=0.075):
def transform(self, X):
def transform(self, X, y=None):
if type(X[0]) == pd.core.series.Series:
X = np.asarray([x.values for x in X])
X = np.asarray([mp.compute(x.reshape(-1),windows=x.shape[0]*self._window_size)['mp'].reshape(1,-1) for x in X])
X = X.reshape(X.shape[0], X.shape[2], X.shape[1])
return X
def fit_transform(self,X,y):
return self.transform(X)
def set_params(self, **parameters):
for parameter, value in parameters.items():
setattr(self, parameter, value)
def fit(self, X, y=None):
return self
class ROCKET_transform:
class ROCKET_transform(BaseEstimator, TransformerMixin):
def __init__(self, n_kernels=15000, kernel_sizes=(5,7,9), flatten=False):
self.flatten = flatten
self.transformer = ROCKET(n_kernels=n_kernels, kernel_sizes=kernel_sizes)
def set_params(self, **params):
return self.transformer.set_params(**params)
def transform(self,X):
self.n_kernels = n_kernels
self.kernel_sizes = kernel_sizes
self.transformer = None
def transform(self, X, y=None):
X = X.reshape(X.shape[0],X.shape[1])
X = self.transformer.transform(X)
if self.flatten:
......@@ -185,8 +178,8 @@ class ROCKET_transform:
X = X.reshape(X.shape[0], X.shape[1], 1)
return X
def fit_transform(self,X,y):
def fit(self, X, y=None):
self.transformer = ROCKET(n_kernels=self.n_kernels, kernel_sizes=self.kernel_sizes)
X = X.reshape(X.shape[0],X.shape[1])
return self.transform(X)
return self
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment