Commit 304d6f3f authored by Antoine Guillaume

Adding subwindow approach support

parent e3496cd6
@@ -12,7 +12,7 @@ from datetime import timedelta
#You should be able to ignore import * warnings safely, otherwise import
# each class name of both files.
from utils.representations import *
from utils.classifications import *
@@ -21,51 +21,55 @@ from sklearn.pipeline import FeatureUnion
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import f1_score, balanced_accuracy_score, make_scorer
from sklearn.preprocessing import MinMaxScaler

# # Params
#
# In[3]:

#Base path, all necessary folders are supposed to be contained in this one.
base_path = r"/home/prof/guillaume/"
#Path to the life cycles CSV files.
dataset_path = base_path+r"datasets/"
#Path where to export the results of cross validation
result_path = base_path+r"results/"
#If not None, CSV files containing data used by the TS-CHIEF java program will be output
#dataset_path+r"TSCHIEF/"
TSCHIEF_path = None
#If True, perform cross validation of all defined pipelines
do_cross_validation = True
#Number of days to consider at the end of the life cycles. Life cycles shorter than n_days+1 will be dropped (+1 for temporal alignment).
n_days = 21
#Resample frequency. T = minutes
resample_freq = '20T'
#Size of the predictive padding
predictive_padding_hours = 48
#Extend the infected interval to cover possible restart process.
extended_infected_interval_hours = 24
#If not None, will output results of cross validation as a latex file (csv results are still exported)
produce_latex = base_path+'results'+str(n_days)+'J_'+resample_freq+'.tex'
#Separator used when producing csv outputs
csv_separator = ';'
#Number of cross validation splits
n_splits=10
# Number of processes to launch in parallel for cross validation of each pipeline.
# Set to None if you don't have the setup to allow such speedups.
n_cv_jobs=-1
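# Note (added for clarity, not in the original script): with scikit-learn, n_jobs=-1 runs the
# cross validation folds on all available CPU cores, while None generally means a single process.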
if dataset_path is not None and not exists(dataset_path):
    mkdir(dataset_path)
@@ -73,11 +77,11 @@ if result_path is not None and not exists(result_path):
    mkdir(result_path)
if TSCHIEF_path is not None and not exists(TSCHIEF_path):
    mkdir(TSCHIEF_path)
# # Import data
# In this experiment, we consider life cycle data coming from ATMs. Only life cycles of at least seven days are considered.
#
# CSV files are formatted as follows: `Cycle_{}_{}_{}.csv`, with, in place of the brackets and in that order: ATM id, life cycle id, state.

# In[4]:
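# Illustrative example (hypothetical values, not from the original dataset): a file named
# "Cycle_0042_3_1.csv" would correspond to ATM id 0042, life cycle id 3 and state 1; the state
# can be recovered with int(file_name.split('_')[-1].split('.')[0]).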
@@ -91,13 +95,13 @@ def process_cycle(file_name, path, predictive_interval, infected_interval):
    ----------
    file_name : str
        The name of the life cycle csv file to process
    path : str
        The full path to the dataset repository
    predictive_interval : int
        Predictive interval to apply in hours
    infected_interval : int
        Infected interval to apply in hours
@@ -112,15 +116,15 @@ def process_cycle(file_name, path, predictive_interval, infected_interval):
    #Read File
    data = pd.read_csv(path+file_name)
    data['date'] = pd.to_datetime(data['date'])
    #Apply Predictive interval
    date = pd.Timestamp((data.iloc[-1]['date'] - timedelta(hours=predictive_interval)))
    data.drop(data[data['date'] >= date].index,axis=0,inplace=True)
    data.reset_index(drop=True,inplace=True)
    #Apply infected interval
    if data.shape[0] > 0:
        date = pd.Timestamp((data.iloc[0]['date'] + timedelta(hours=infected_interval)))
        data.drop(data[data['date'] <= date].index,axis=0,inplace=True)
        #Reset index
        data.reset_index(drop=True,inplace=True)
    if data.shape[0] > 0:
        return (data, y)
@@ -134,6 +138,26 @@ life_cycles = np.asarray([process_cycle(file_name, dataset_path,
                                        predictive_padding_hours,
                                        extended_infected_interval_hours) for file_name in file_list])
def last_X_days(data, y, X, min_data=0.33):
    if (data.iloc[-1]['date'] - data.iloc[0]['date']) >= timedelta(days=X+1):
        lim_date = pd.Timestamp(data.iloc[-1]['date'].date())
        #Remove the last, not complete, day
        data = data[data['date'] < lim_date]
        #Remove data prior to the last X days
        date = pd.Timestamp((lim_date - timedelta(days=X)))
        data = data.drop(data[data['date'] < date].index,axis=0)
        if data.shape[0] > ((X*24*60)/int(resample_freq[0:2]))*min_data:
            return data,y
        else:
            return None
    else:
        return None

life_cycles = np.asarray([last_X_days(x[0],x[1],n_days) for x in life_cycles if x is not None])
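# The subwindow approach added by this commit: last_X_days keeps, for each life cycle, only the
# last n_days complete days before the failure (the trailing partial day is discarded) and drops
# cycles whose remaining number of events is below min_data (33% by default) of the number of
# resample_freq bins contained in n_days, i.e. (n_days*24*60)/20 bins at a 20 minute frequency.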
print('\nData Loaded')

# # Define data encoding functions
@@ -142,9 +166,9 @@ print('\nData Loaded')
codes = []
for x in [x[0]['cod_evt'].unique() for x in life_cycles if x is not None]:
    codes.extend(x)
codes = np.unique(codes) #Unique event codes present in the data, in increasing order

# In[6]:
@@ -162,7 +186,7 @@ def get_R3_dict(codes, spacing=200):
                "GA12500","GA13000","GA13500","GA14000","GA14500","GA15000","GA15100","GA15200",
                "GA17000","GA17002","GA20000","GA21000"]
    OK_codes = [x for x in codes if x in OK_codes]
    WAR_codes = ["GA02002","GA02003","GA02005","GA02006","GA02007","GA02008","GA03002","GA03003","GA03004",
                 "GA03005","GA03006","GA04002","GA04003","GA04004","GA04005","GA04006","GA04006","GA04007",
                 "GA05002","GA05003","GA06002","GA07002","GA07003","GA08001","GA08002","GA08003","GA10013",
@@ -172,19 +196,19 @@ def get_R3_dict(codes, spacing=200):
                 "GA19003","GA19004","GA19005","GA20002","GA20003","GA20004","GA20005","GA21001","GA21002",
                 "GA21003"]
    WAR_codes = [x for x in codes if x in WAR_codes]
    KO_codes = ["GA01001","GA02001","GA03001","GA04001","GA05001","GA06001","GA07001","GA10001","GA10012",
                "GA10016","GA10021","GA10025","GA10032","GA10501","GA11001","GA12001",
                "GA13001","GA14001","GA15001","GA15102","GA15202",
                "GA17001","GA17003","GA20001","GA21004"]
    KO_codes = [x for x in codes if x in KO_codes]
    R_codes = ["GA40000","GA41000","GA42000"]
    R_codes = [x for x in codes if x in R_codes]
    I_codes = ["GA30000"]
    I_codes = [x for x in codes if x in I_codes]
    if set(OK_codes + WAR_codes + KO_codes + R_codes + I_codes) != set(codes):
        warnings.warn("get_R3_dict : Following codes did not fit in the OK/KO/WAR paradigm or were not related to hardware events and were discarded:\n{}".format(set(codes)-set(OK_codes + WAR_codes + KO_codes + R_codes + I_codes)))
@@ -195,8 +219,9 @@ def get_R3_dict(codes, spacing=200):
            dict_codes.update({code:k+i})
        k+=spacing
    return dict_codes
def get_R4_dict(codes):
    codes = np.append(codes, ['-1'])
    vals = np.arange(codes.shape[0])
    np.random.shuffle(vals)
    return {code : vals[i] for i,code in enumerate(codes)}
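# Illustrative example (hypothetical codes, not from the dataset): with codes = ['GA01001','GA30000'],
# get_R4_dict could return {'GA01001': 2, 'GA30000': 0, '-1': 1}; the values are a random permutation,
# and the extra '-1' entry is later used as the fill value for empty resampling bins in the R4 case.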
@@ -210,278 +235,319 @@ def apply_code_dict(df, code_dic, code_column='cod_evt'):
    return df
# In[11]:

# # Define pipelines
# We now define the pipelines that we will use for cross validation

max_features=100

# In[11]:

pipeline_dict = {}

#FLATTENED IMAGE
pipeline_dict.update({"PAA Gramian Flat RF":make_pipeline(Gramian_transform(flatten=True), pipeline_dict.update({"Gramian Flat RF":make_pipeline(Gramian_transform(flatten=True),
Random_Forest())}) Random_Forest())})
pipeline_dict.update({"PAA Recurrence Flat RF":make_pipeline(Recurrence_transform(flatten=True), pipeline_dict.update({"Recurrence Flat RF":make_pipeline(Recurrence_transform(flatten=True),
Random_Forest())}) Random_Forest())})
pipeline_dict.update({"PAA Gramian Flat SVM":make_pipeline(Gramian_transform(flatten=True), pipeline_dict.update({"Gramian Flat SVM":make_pipeline(Gramian_transform(flatten=True),
MinMaxScaler(), MinMaxScaler(),
SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"), threshold=0.000001), SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
max_features=max_features, threshold=0.000001),
SVM_classif())}) SVM_classif())})
pipeline_dict.update({"PAA Recurrence Flat SVM":make_pipeline(Recurrence_transform(flatten=True), pipeline_dict.update({"Recurrence Flat SVM":make_pipeline(Recurrence_transform(flatten=True),
MinMaxScaler(), MinMaxScaler(),
SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"), threshold=0.000001), SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
max_features=max_features, threshold=0.000001),
SVM_classif())}) SVM_classif())})
pipeline_dict.update({"PAA Gramian Flat KNN":make_pipeline(Gramian_transform(flatten=True), pipeline_dict.update({"Gramian Flat KNN":make_pipeline(Gramian_transform(flatten=True),
MinMaxScaler(), MinMaxScaler(),
SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"), threshold=0.000001), SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
max_features=max_features, threshold=0.000001),
KNN_classif())}) KNN_classif())})
pipeline_dict.update({"PAA Recurrence Flat KNN":make_pipeline(Recurrence_transform(flatten=True), pipeline_dict.update({"Recurrence Flat KNN":make_pipeline(Recurrence_transform(flatten=True),
MinMaxScaler(), MinMaxScaler(),
SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"), threshold=0.000001), SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
max_features=max_features, threshold=0.000001),
KNN_classif())}) KNN_classif())})
pipeline_dict.update({"PAA Gramian Flat Ridge":make_pipeline(Gramian_transform(flatten=True), pipeline_dict.update({"Gramian Flat Ridge":make_pipeline(Gramian_transform(flatten=True),
MinMaxScaler(), MinMaxScaler(),
SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"), threshold=0.000001), SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
max_features=max_features, threshold=0.000001),
Ridge_classif())}) Ridge_classif())})
pipeline_dict.update({"PAA Recurrence Flat Ridge":make_pipeline(Recurrence_transform(flatten=True), pipeline_dict.update({"Recurrence Flat Ridge":make_pipeline(Recurrence_transform(flatten=True),
MinMaxScaler(), MinMaxScaler(),
SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"), threshold=0.000001), SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
max_features=max_features, threshold=0.000001),
Ridge_classif())}) Ridge_classif())})
#TIME SERIES CLASSIFIERS + PAA
pipeline_dict.update({"TSRF":make_pipeline(TimeSeries_Forest())})

pipeline_dict.update({"BOSSVS":make_pipeline(BOSSVS_classif())})

pipeline_dict.update({"KNN":make_pipeline(KNN_TS_classif())})

pipeline_dict.update({"RISE":make_pipeline(RISE())})

#TIME SERIES CLASSIFIERS + SAX
pipeline_dict.update({"SAX TSRF":make_pipeline(SymbolicAggregate_transform(),
                                               TimeSeries_Forest())})

pipeline_dict.update({"SAX BOSSVS":make_pipeline(SymbolicAggregate_transform(),
                                                 BOSSVS_classif())})

pipeline_dict.update({"SAX KNN":make_pipeline(SymbolicAggregate_transform(),
                                              KNN_TS_classif())})

pipeline_dict.update({"SAX RISE":make_pipeline(SymbolicAggregate_transform(),
                                               RISE())})

#TIME SERIES CLASSIFIERS + SFA
pipeline_dict.update({"SFA TSRF":make_pipeline(SymbolicFourrier_transform(),
                                               TimeSeries_Forest())})

#BOSSVS natively performs SFA on input so no point in testing it here
pipeline_dict.update({"SFA KNN":make_pipeline(SymbolicFourrier_transform(),
                                              KNN_TS_classif())})

#RISE applies techniques such as power spectrum and autocorrelation that are supposed to be applied in the time domain.
#SFA uses the Fourier transform (DFT) and then binning with MCB; the result of this operation is not in the time domain anymore.

#TIME SERIES CLASSIFIERS + MATRIX PROFILE
pipeline_dict.update({"MP TSRF":make_pipeline(MatrixProfile_transform(),
                                              TimeSeries_Forest())})

pipeline_dict.update({"MP BOSSVS":make_pipeline(MatrixProfile_transform(),
                                                BOSSVS_classif())})

pipeline_dict.update({"MP KNN":make_pipeline(MatrixProfile_transform(),
                                             KNN_TS_classif())})

pipeline_dict.update({"MP RISE":make_pipeline(MatrixProfile_transform(),
                                              RISE())})
# ROCKET
pipeline_dict.update({"ROCKET RF":make_pipeline(ROCKET_transform(flatten=True),
                                                MinMaxScaler(),
                                                SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
                                                                max_features=max_features, threshold=0.000001),
                                                Random_Forest())})

pipeline_dict.update({"ROCKET SVM":make_pipeline(ROCKET_transform(flatten=True),
                                                 MinMaxScaler(),
                                                 SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
                                                                 max_features=max_features, threshold=0.000001),
                                                 SVM_classif())})

pipeline_dict.update({"ROCKET KNN":make_pipeline(ROCKET_transform(flatten=True),
                                                 MinMaxScaler(),
                                                 SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
                                                                 max_features=max_features, threshold=0.000001),
                                                 KNN_classif())})

pipeline_dict.update({"ROCKET Ridge":make_pipeline(ROCKET_transform(flatten=True),
                                                   MinMaxScaler(),
                                                   SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
                                                                   max_features=max_features, threshold=0.000001),
                                                   Ridge_classif())})
# MATRIX PROFILE + ROCKET
pipeline_dict.update({"MP ROCKET RF":make_pipeline(MatrixProfile_transform(),
                                                   ROCKET_transform(flatten=True),
                                                   MinMaxScaler(),
                                                   SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
                                                                   max_features=max_features, threshold=0.000001),
                                                   Random_Forest())})

pipeline_dict.update({"MP ROCKET SVM":make_pipeline(MatrixProfile_transform(),
                                                    ROCKET_transform(flatten=True),
                                                    MinMaxScaler(),
                                                    SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
                                                                    max_features=max_features, threshold=0.000001),
                                                    SVM_classif())})

pipeline_dict.update({"MP ROCKET KNN":make_pipeline(MatrixProfile_transform(),
                                                    ROCKET_transform(flatten=True),
                                                    MinMaxScaler(),
                                                    SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
                                                                    max_features=max_features, threshold=0.000001),
                                                    KNN_classif())})

pipeline_dict.update({"MP ROCKET Ridge":make_pipeline(MatrixProfile_transform(),
                                                      ROCKET_transform(flatten=True),
                                                      MinMaxScaler(),
                                                      SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
                                                                      max_features=max_features, threshold=0.000001),
                                                      Ridge_classif())})
#SAX + ROCKET
pipeline_dict.update({"SAX ROCKET RF":make_pipeline(SymbolicAggregate_transform(),
                                                    ROCKET_transform(flatten=True),
                                                    Random_Forest())})

pipeline_dict.update({"SAX ROCKET Ridge":make_pipeline(SymbolicAggregate_transform(),
                                                       ROCKET_transform(flatten=True),
                                                       MinMaxScaler(),
                                                       SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
                                                                       max_features=max_features, threshold=0.000001),
                                                       Ridge_classif())})

pipeline_dict.update({"SAX ROCKET SVM":make_pipeline(SymbolicAggregate_transform(),
                                                     ROCKET_transform(flatten=True),
                                                     MinMaxScaler(),
                                                     SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
                                                                     max_features=max_features, threshold=0.000001),
                                                     SVM_classif())})

pipeline_dict.update({"SAX ROCKET KNN":make_pipeline(SymbolicAggregate_transform(),
                                                     ROCKET_transform(flatten=True),
                                                     MinMaxScaler(),
                                                     SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
                                                                     max_features=max_features, threshold=0.000001),
                                                     KNN_classif())})

#ROCKET on SFA is not efficient, rocket can already extract frequency based features due to the nature of convolutional kernels.
#MP + STACKED FLAT IMAGES
pipeline_dict.update({"MP Gramian + Recurrence RF":make_pipeline(
                      MatrixProfile_transform(),
                      FeatureUnion([
                           ("gramian",Gramian_transform(flatten=True)),
                           ("recurrence",Recurrence_transform(flatten=True))
                      ]),
                      MinMaxScaler(),
                      SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
                                      max_features=max_features, threshold=0.000001),
                      Random_Forest())})

pipeline_dict.update({"MP Gramian + Recurrence SVM":make_pipeline(
                      MatrixProfile_transform(),
                      FeatureUnion([
                           ("gramian",Gramian_transform(flatten=True)),
                           ("recurrence",Recurrence_transform(flatten=True))
                      ]),
                      MinMaxScaler(),
                      SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
                                      max_features=max_features, threshold=0.000001),
                      SVM_classif())})

pipeline_dict.update({"MP Gramian + Recurrence KNN":make_pipeline(
                      MatrixProfile_transform(),
                      FeatureUnion([
                           ("gramian",Gramian_transform(flatten=True)),
                           ("recurrence",Recurrence_transform(flatten=True))
                      ]),
                      MinMaxScaler(),
                      SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
                                      max_features=max_features, threshold=0.000001),
                      KNN_classif())})

pipeline_dict.update({"MP Gramian + Recurrence Ridge":make_pipeline(
                      MatrixProfile_transform(),
                      FeatureUnion([
                           ("gramian",Gramian_transform(flatten=True)),
                           ("recurrence",Recurrence_transform(flatten=True))
                      ]),
                      MinMaxScaler(),
                      SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
                                      max_features=max_features, threshold=0.000001),
                      Ridge_classif())})
pipeline_dict.update({"PAA Gramian + Recurrence RF":make_pipeline(PiecewiseApproximation_transform(output_size=size), pipeline_dict.update({"Gramian + Recurrence RF":make_pipeline(
FeatureUnion([ FeatureUnion([
("gramian",Gramian_transform(flatten=True)), ("gramian",Gramian_transform(flatten=True)),
("recurrence",Recurrence_transform(flatten=True)) ("recurrence",Recurrence_transform(flatten=True))
]), ]),
MinMaxScaler(),
SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
max_features=max_features, threshold=0.000001),
Random_Forest())}) Random_Forest())})
#PAA + STACKED FLAT IMAGES pipeline_dict.update({"Gramian + Recurrence SVM":make_pipeline(
pipeline_dict.update({"PAA Gramian + Recurrence SVM":make_pipeline(PiecewiseApproximation_transform(output_size=size),
FeatureUnion([ FeatureUnion([
("gramian",Gramian_transform(flatten=True)), ("gramian",Gramian_transform(flatten=True)),
("recurrence",Recurrence_transform(flatten=True)) ("recurrence",Recurrence_transform(flatten=True))
]), ]),
MinMaxScaler(), MinMaxScaler(),
SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"), threshold=0.000001), SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
max_features=max_features, threshold=0.000001),
SVM_classif())}) SVM_classif())})
pipeline_dict.update({"PAA Gramian + Recurrence KNN":make_pipeline(PiecewiseApproximation_transform(output_size=size), pipeline_dict.update({"Gramian + Recurrence KNN":make_pipeline(
FeatureUnion([ FeatureUnion([
("gramian",Gramian_transform(flatten=True)), ("gramian",Gramian_transform(flatten=True)),
("recurrence",Recurrence_transform(flatten=True)) ("recurrence",Recurrence_transform(flatten=True))
]), ]),
MinMaxScaler(), MinMaxScaler(),
SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"), threshold=0.000001), SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
max_features=max_features, threshold=0.000001),
KNN_classif())}) KNN_classif())})
"""
pipeline_dict.update({"Gramian + Recurrence Ridge":make_pipeline(
FeatureUnion([
("gramian",Gramian_transform(flatten=True)),
("recurrence",Recurrence_transform(flatten=True))
]),
MinMaxScaler(),
SelectFromModel(ExtraTreesClassifier(n_estimators=300, class_weight="balanced_subsample"),
max_features=max_features, threshold=0.000001),
Ridge_classif())})
#This section is left commented so you have no trouble running the script without Tensorflow/GPU
#If you have errors during cross validation, you can try to make the class ResNetV2
# inherit the tensorflow.keras KerasClassifier wrapper, it can fix some issues.
"""
pipeline_dict.update({"Gramian ResNet50V2":make_pipeline(Gramian_transform(flatten=True),
                                                         ResNetV2())})
pipeline_dict.update({"Recurrence ResNet50V2":make_pipeline(Recurrence_transform(flatten=True),
                                                            ResNetV2())})
pipeline_dict.update({"InceptionTime":make_pipeline(InceptionTime())})
pipeline_dict.update({"MP InceptionTime":make_pipeline(MatrixProfile_transform(),
                                                       InceptionTime())})
pipeline_dict.update({"SAX InceptionTime":make_pipeline(SymbolicAggregate_transform(),
                                                        InceptionTime())})
"""

print('Pipelines initialized')
# In[12]:

# Critical Failure Index (CFI). As True Negatives imply that no maintenance is scheduled (so no business impact),
# this measure indicates how many maintenance operations we "missed" (False Negatives) plus how many we did
# while it was not necessary to do so (False Positives). Those two variables are then summed and
# divided by their sum plus the number of successful predictions (True Positives).
# In short, the closer to 0, the more "business" efficient the system is.
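# Restated as a formula: CFI = (FP + FN) / (FP + FN + TP). Worked example (illustrative numbers
# only): with TP=7, FP=2 and FN=1, CFI = (2+1)/(2+1+7) = 0.3; a perfect predictor scores 0.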
def CFI(y_test, y_pred):
    if type(y_test) == list:
@@ -509,39 +575,49 @@ df_res = pd.DataFrame(columns=['name','representation','balanced accuracy mean',
print('Cross Validation')
order = {0:'R1',1:'R2',2:'R3',3:'R4'}

print("A total of {} runs will be launched".format(len(pipeline_dict)*n_splits*len(order)))

#, get_R2_dict, get_R3_dict, get_R4_dict
for i_r, dic_func in enumerate([get_R1_dict]):
    code_dict = dic_func(codes)
    if order[i_r] == 'R1':
        fill_value = len(list(code_dict.values()))
    elif order[i_r] == 'R2':
        fill_value = np.max(list(code_dict.values())) + 1000
    elif order[i_r] == 'R3':
        fill_value = 1000
    elif order[i_r] == 'R4':
        fill_value = code_dict['-1']

    X = [apply_code_dict(x[0].copy(deep=True), code_dict).resample(resample_freq,on='date',convention='end',origin='start_day').mean().fillna(fill_value) for x in life_cycles if x is not None]
    y = np.asarray([x[1] for x in life_cycles if x is not None]).astype(int)

    X = np.asarray([x.reindex(pd.date_range(start=x.index[-1].date()-timedelta(days=n_days),
                                            end=x.index[-1].date(), freq=resample_freq)).fillna(fill_value).values for x in X])
    print(X.shape)
    print(np.bincount(y))
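    # Note on the block above (added for clarity): each cycle's events are mapped to numbers by the
    # chosen representation dictionary, resampled into resample_freq bins (mean per bin, missing bins
    # filled with the representation-specific fill_value), then reindexed on a fixed date range of
    # n_days so every cycle has the same number of timestamps and X can be stacked into one array.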
    if TSCHIEF_path is not None:
        skf = StratifiedKFold(n_splits=n_splits)
        df = pd.DataFrame(data = {i: x.reshape(-1) for i,x in enumerate(X)}).transpose()
        df[X.shape[1]]=y
        df = df.astype(np.float32)
        i_split=0
        for train_idx, test_idx in skf.split(X,y):
            df.loc[train_idx].to_csv(TSCHIEF_path+'data_Train_{}_{}_{}.csv'.format(X.shape[1], i_split, order[i_r]),index=False,header=False)
            df.loc[test_idx].to_csv(TSCHIEF_path+'data_Test_{}_{}_{}.csv'.format(X.shape[1], i_split, order[i_r]),index=False,header=False)
            i_split+=1
    if do_cross_validation:
        for pipeline in pipeline_dict:
            try:
                cv = cross_validate(pipeline_dict[pipeline],X, y, cv=n_splits, n_jobs=n_cv_jobs,
                                    scoring={'b_a':make_scorer(balanced_accuracy_score),
                                             'cfi':make_scorer(CFI),
                                             'f1':make_scorer(f1_score)})
            except Exception as e:
                print(e)
@@ -574,17 +650,16 @@ for i_r, dic_func in enumerate([get_R1_dict, get_R2_dict, get_R3_dict, get_R4_di
                                          'Score time std':[np.std(cv['score_time'])]})
                               ])
# In[15]:

df_res.to_csv(result_path+'cv_results'+str(n_days)+'J_'+resample_freq+'.csv',sep=csv_separator, index=False)

if produce_latex is not None:
    df_dict = {'name':df_res['name'].unique()}
    for col in ['balanced accuracy','CFI','F1 score','Fit time','Score time']:
        for r in ['R1', 'R2', 'R3','R4']:
            df_dict.update({col+' '+r:(df_res[df_res['representation']==r][col + ' mean'].astype(str).str[0:5] + '(+/- '+df_res[df_res['representation']==r][col+' std'].astype(str).str[0:5]+')').reset_index(drop=True)})
    df_latex = pd.DataFrame(df_dict)
    df_latex.to_csv(result_path+'cv_results'+str(n_days)+'J_'+resample_freq+'_latex.csv',sep=csv_separator, index=False)
    latex_str = df_latex.sort_values(by=['CFI R3'],ascending=True).to_latex(index=False)
    with open(produce_latex, 'w') as f:
        f.write(latex_str)
@@ -34,6 +34,12 @@ Configuration parameters are located at the beginning of CV_script, you MUST cha
To change or check the algorithms' parameters: they are all redefined in custom wrapper classes to avoid errors; if a parameter is not specified in the constructor, it is left at its default value.
The representation methods are defined inside utils.representations and the classification methods inside utils.classifications.
To change the parameters of TS-CHIEF, you can change the values of the following arguments in the ts-chief script:
```bash
-trees="300" -s="ee:4,boss:50,rise:50"
```
If you want to give more predictive power to this algorithm, increasing the number of trees and the number of random splits generated by each method (boss, rise, ...) is the way to go. We used these values to avoid memory errors; the shorter the input time series, the higher those values can be without causing trouble.
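For reference, a heavier configuration along those lines could look like the snippet below (illustrative values only, to be adapted to your memory budget):

```bash
-trees="500" -s="ee:10,boss:150,rise:150"
```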
## Usage

Extract the files of the dataset archive located in ~/datasets in the dataset folder
@@ -74,8 +80,6 @@ by
from sktime.utils.data_container import tabularize, from_3d_numpy_to_nested
```
* We also modified InceptionTime to use binary_crossentropy (change the loss name and use a sigmoid layer with 1 neuron as the output) and weighted accuracy for early stopping. This is not mandatory but is more suited to our problem.
## Contributing

If any bug should occur, please open an issue so we can work on a fix !
#!/bin/bash
n_cv=9
n_r=4
size=1513
for id_r in `seq 1 $n_r`
do
    for id_cv in `seq 0 $n_cv`
    do
        jdk/jdk-15/bin/java -Xms6G -Xmx12G -jar tschief.jar -train="datasets/TSCHIEF/data_Train_"$size"_"$id_cv"_R"$id_r".csv" -test="datasets/TSCHIEF/data_Test_"$size"_"$id_cv"_R"$id_r".csv" -out="results/TSCHIEF/" -repeats="1" -trees="300" -s="ee:4,boss:50,rise:50" -export="1" -verbosity="1" -shuffle="True" -target_column="last"
    done
done
@@ -9,20 +9,20 @@ from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sktime.classification.interval_based import TimeSeriesForest
from sktime.utils.data_container import _concat_nested_arrays as cna
from sktime.classification.frequency_based import RandomIntervalSpectralForest
from sklearn.base import BaseEstimator, ClassifierMixin

# # Define classes for classification methods
# Here we define custom classes when necessary for the classification methods we will use inside pipelines during cross validation.
#
# See the corresponding modules' documentation for details.
#
# Pyts : https://pyts.readthedocs.io/
#
# sktime : https://sktime.org/index.html
#
# sklearn : https://scikit-learn.org/stable/index.html
@@ -58,15 +58,14 @@ class ResNetV2(BaseEstimator, ClassifierMixin):
            classes=1,
            classifier_activation="sigmoid",
        )
        model.compile(optimizer=self.optimizer, loss=self.loss, metrics=['accuracy'])
        self.model = model

    def fit(self, X, y, epochs=1500, batch_size=32, return_hist=False, el_patience=100, verbose=0, val_size=0.1):
        self.init_model((X.shape[1], X.shape[2], X.shape[3]))
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_size)
        el = EarlyStopping(monitor='val_loss', patience=el_patience, restore_best_weights=True, mode='min')
        self.model.fit(
            X_train, y_train,
            validation_data=(X_val,y_val),
@@ -74,18 +73,17 @@ class ResNetV2(BaseEstimator, ClassifierMixin):
            batch_size=batch_size,
            verbose=verbose,
            callbacks=[el],
            shuffle=True
        )
        return self

    def predict(self, X):
        return np.array([x>0.5 for x in self.model.predict(X)]).astype(int)

    def predict_proba(self,X):
        return self.model.predict(X)
#Depending on your sktime_dl version, this might throw import errors, see Readme for a fix.
from sktime_dl.deeplearning.inceptiontime._classifier import InceptionTimeClassifier
@@ -100,150 +98,153 @@ class InceptionTime(BaseEstimator, ClassifierMixin):
                 el_patience=100, verbose=False, val_size=0.1):
        self.model = InceptionTimeClassifier(verbose=verbose, depth=self.depth,
                                             nb_filters=self.nb_filters, bottleneck_size=self.bottleneck_size,
                                             callbacks=[EarlyStopping(monitor='val_loss', patience=el_patience,
                                                                      restore_best_weights=True, mode='min')])
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_size)
        self.model.fit(X_train, y_train, validation_X=X_val,validation_y=y_val)
        return self

    def predict(self, X):
        return np.array([x>0.5 for x in self.model.predict(X)]).astype(int)

    def predict_proba(self,X):
        return self.model.predict(X)
"""
class SktimeEstimator:
    def _sktime_format(self,X,y):
        # X : (n_instance, n_timestamp, n_features)
        X, y = self._sktime_format_X(X), np.asarray(y)
        return X, y

    def _sktime_format_X(self,X):
        # X : (n_instance, n_timestamp, n_features)
        return cna(X.reshape(X.shape[2],X.shape[0],X.shape[1]))

class PytsEstimator:
    def _format(self,X,y):
        return self._format_X(X), np.asarray(y)

    def _format_X(self,X):
        return X.reshape(X.shape[0],X.shape[1])
class RISE(BaseEstimator, ClassifierMixin, SktimeEstimator):
    def __init__(self, min_length=5, n_estimators=300):
        self.min_length = min_length
        self.n_estimators = n_estimators
        self.estimator = None

    def fit(self,X,y):
        X, y = self._sktime_format(X,y)
        self.estimator = RandomIntervalSpectralForest(n_estimators=self.n_estimators,
                                                      min_interval=self.min_length)
        self.estimator.fit(X,y)
        return self

    def predict(self,X):
        X = self._sktime_format_X(X)
        return self.estimator.predict(X)

    def predict_proba(self,X):
        X = self._sktime_format(X)
        return self.estimator.predict_proba(X)
class Random_Forest(BaseEstimator, ClassifierMixin):
    def __init__(self, n_estimators=300, max_depth=None, max_samples=0.75,
                 ccp_alpha=0.0225, class_weight="balanced_subsample"):
        self.n_estimators=n_estimators
        self.max_depth=max_depth
        self.max_samples=max_samples
        self.ccp_alpha=ccp_alpha
        self.class_weight=class_weight
        self.estimator = None

    def fit(self, X, y):
        X = np.asarray([x.astype(np.float32) for x in X])
        self.estimator = RandomForestClassifier(n_estimators=self.n_estimators,
                                                max_depth=self.max_depth,
                                                max_samples=self.max_samples,
                                                ccp_alpha=self.ccp_alpha,
                                                class_weight=self.class_weight)
        self.estimator.fit(X,y)
        return self

    def predict(self,X):
        X = np.asarray([x.astype(np.float32) for x in X])
        return self.estimator.predict(X)

    def predict_proba(self,X):
        X = np.asarray([x.astype(np.float32) for x in X])
        return self.estimator.predict_proba(X)
class KNN_classif(BaseEstimator, ClassifierMixin):
    def __init__(self, n_neighbors=7, weights='distance',p=2):
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.p = p
        self.estimator = None

    def fit(self,X,y):
        self.estimator = KNeighborsClassifier(n_neighbors=self.n_neighbors,
                                              weights=self.weights, p=self.p)
        self.estimator.fit(X,y)
        return self

    def predict(self,X):
        return self.estimator.predict(X)

    def predict_proba(self,X):
        return self.estimator.predict_proba(X)
class TimeSeries_Forest(BaseEstimator, ClassifierMixin, SktimeEstimator):
    def __init__(self, n_estimators=300, min_interval=5):
        self.n_estimators = n_estimators
        self.min_interval = min_interval
        self.estimator = None

    def fit(self,X,y):
        X, y = self._sktime_format(X,y)
        self.estimator = TimeSeriesForest(n_estimators=self.n_estimators,
                                          min_interval=self.min_interval)
        self.estimator.fit(X,y)
        return self

    def predict(self,X):
        X = self._sktime_format_X(X)
        return self.estimator.predict(X)

    def predict_proba(self,X):
        X = self._sktime_format_X(X)
        return self.estimator.predict_proba(X)
class SVM_classif(BaseEstimator, ClassifierMixin):
    def __init__(self, C=10, kernel='rbf', degree=2, gamma='scale',
                 cache_size=500, class_weight='balanced'):
        self.C = C
        self.kernel = kernel
        self.degree = degree #Not used with RBF
        self.gamma = gamma
        self.cache_size = cache_size
        self.class_weight = class_weight
        self.estimator = None

    def fit(self,X,y):
        self.estimator = SVC(C=self.C, kernel=self.kernel, degree=self.degree,
                             gamma=self.gamma, cache_size=self.cache_size,
                             class_weight=self.class_weight)
        self.estimator.fit(X,y)
        return self

    def predict(self,X):
        return self.estimator.predict(X)

    def predict_proba(self,X):
        return self.estimator.predict_proba(X)
@@ -257,7 +258,7 @@ class Ridge_classif(BaseEstimator, ClassifierMixin):
        self.tol = tol
        self.class_weight = class_weight
        self.estimator = None

    def set_params(self, **params):
        return self.estimator.set_params(**params)
@@ -267,40 +268,39 @@ class Ridge_classif(BaseEstimator, ClassifierMixin):
                                        tol=self.tol, class_weight=self.class_weight)
        self.estimator.fit(X,y)
        return self

    def predict(self,X):
        return self.estimator.predict(X)

    def predict_proba(self,X):
        return self.estimator.predict_proba(X)
class KNN_TS_classif(BaseEstimator, ClassifierMixin, PytsEstimator):
    def __init__(self, n_neighbors=7, weights='distance', p=2):
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.p = p
        self.estimator = None

    def fit(self,X,y):
        X, y = self._format(X,y)
        self.estimator = KNeighborsClassifierTS(n_neighbors=self.n_neighbors,
                                                weights=self.weights, p=self.p)
        self.estimator.fit(X,y)
        return self

    def predict(self,X):
        X = self._format_X(X)
        return self.estimator.predict(X)

    def predict_proba(self,X):
        X = self._format_X(X)
        return self.estimator.predict_proba(X)
class BOSSVS_classif(BaseEstimator, ClassifierMixin, PytsEstimator): class BOSSVS_classif(BaseEstimator, ClassifierMixin, PytsEstimator):
def __init__(self, word_size=9, n_bins=7, window_size=0.2, window_step=1, def __init__(self, word_size=5, n_bins=5, window_size=0.15, window_step=0.01,
anova=True, drop_sum=False, norm_mean=False, norm_std=False, anova=True, drop_sum=False, norm_mean=False, norm_std=False,
strategy='uniform', alphabet=None): strategy='quantile', alphabet=None,smooth_idf=True):
self.word_size = word_size self.word_size = word_size
self.n_bins = n_bins self.n_bins = n_bins
self.window_size = window_size self.window_size = window_size
...@@ -311,23 +311,25 @@ class BOSSVS_classif(BaseEstimator, ClassifierMixin, PytsEstimator): ...@@ -311,23 +311,25 @@ class BOSSVS_classif(BaseEstimator, ClassifierMixin, PytsEstimator):
self.norm_std = norm_std self.norm_std = norm_std
self.strategy = strategy self.strategy = strategy
self.alphabet = alphabet self.alphabet = alphabet
self.smooth_idf = smooth_idf
self.estimator = None self.estimator = None
def fit(self,X,y): def fit(self,X,y):
X, y = self._format(X,y) X, y = self._format(X,y)
self.estimator = BOSSVS(word_size=self.word_size, n_bins=self.n_bins, self.estimator = BOSSVS(word_size=self.word_size, n_bins=self.n_bins,
window_size=self.window_size, window_step=self.window_step, window_size=self.window_size, window_step=self.window_step,
anova=self.anova, drop_sum=self.drop_sum, anova=self.anova, drop_sum=self.drop_sum,
norm_mean=self.norm_mean, norm_std=self.norm_std, norm_mean=self.norm_mean, norm_std=self.norm_std,
strategy=self.strategy, alphabet=self.alphabet) strategy=self.strategy, alphabet=self.alphabet,
smooth_idf=self.smooth_idf)
self.estimator.fit(X,y) self.estimator.fit(X,y)
return self return self
def predict(self,X): def predict(self,X):
X = self._format_X(X) X = self._format_X(X)
return self.estimator.predict(X) return self.estimator.predict(X)
def predict_proba(self,X): def predict_proba(self,X):
X = self._format_X(X) X = self._format_X(X)
return self.estimator.predict_proba(X) return self.estimator.predict_proba(X)
\ No newline at end of file
...@@ -13,17 +13,17 @@ from matplotlib import pyplot as plt ...@@ -13,17 +13,17 @@ from matplotlib import pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin from sklearn.base import BaseEstimator, TransformerMixin
# # Define classes for representation methods # # Define classes for representation methods
# Here we define custom classes when necessary for the representation methods we will use inside pipelines during cross validation. # Here we define custom classes when necessary for the representation methods we will use inside pipelines during cross validation.
# #
# See the corresponding module's documentation for details. # See the corresponding module's documentation for details.
# #
# Pyts : https://pyts.readthedocs.io/ # Pyts : https://pyts.readthedocs.io/
# #
# MatrixProfile : https://matrixprofile.docs.matrixprofile.org/ # MatrixProfile : https://matrixprofile.docs.matrixprofile.org/
# In[2]: # In[2]:
#Gramian natively uses PAA, recurrence doesn't, #Gramian natively uses PAA, recurrence doesn't,
#that's why you'll see calls to PAA inside the Recurrence class but not in the Gramian #that's why you'll see calls to PAA inside the Recurrence class but not in the Gramian
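A minimal sketch (not from the commit) of that difference using the pyts API; the series ts is a hypothetical placeholder.
import numpy as np
from pyts.image import GramianAngularField, RecurrencePlot
from pyts.approximation import PiecewiseAggregateApproximation

ts = np.random.rand(1000)  # hypothetical life-cycle signal

# GramianAngularField resizes internally through its image_size argument.
gaf = GramianAngularField(image_size=128, method='summation')
img_gaf = gaf.fit_transform(ts.reshape(1, -1))   # shape (1, 128, 128)

# RecurrencePlot has no such argument, hence the explicit PAA step beforehand.
paa = PiecewiseAggregateApproximation(output_size=128, window_size=None)
ts_paa = paa.fit_transform(ts.reshape(1, -1))    # shape (1, 128)
img_rp = RecurrencePlot(dimension=1, time_delay=6).fit_transform(ts_paa)  # shape (1, 128, 128)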
class Gramian_transform(BaseEstimator, TransformerMixin): class Gramian_transform(BaseEstimator, TransformerMixin):
...@@ -33,11 +33,11 @@ class Gramian_transform(BaseEstimator, TransformerMixin): ...@@ -33,11 +33,11 @@ class Gramian_transform(BaseEstimator, TransformerMixin):
self.method = method self.method = method
self.cmap = plt.get_cmap('jet') self.cmap = plt.get_cmap('jet')
self.transformer = None self.transformer = None
def transform(self, X, y=None): def transform(self, X, y=None):
if type(X[0]) == pd.core.series.Series: if type(X[0]) == pd.core.series.Series:
X = np.asarray([x.values for x in X]) X = np.asarray([x.values for x in X])
X = np.asarray([self.transformer.transform(x.reshape(1,-1)) for x in X if x.shape[0] >= self.img_size]) X = np.asarray([self.transformer.transform(x.reshape(1,-1)) for x in X if x.shape[0] >= self.img_size])
if self.flatten == True: if self.flatten == True:
X = X.reshape(X.shape[0], X.shape[2]) X = X.reshape(X.shape[0], X.shape[2])
...@@ -45,14 +45,14 @@ class Gramian_transform(BaseEstimator, TransformerMixin): ...@@ -45,14 +45,14 @@ class Gramian_transform(BaseEstimator, TransformerMixin):
X = X.reshape(X.shape[0], self.img_size, self.img_size, 1) X = X.reshape(X.shape[0], self.img_size, self.img_size, 1)
X = self.cmap(X)[:,:,:,:,0:3].reshape(X.shape[0],self.img_size, self.img_size,3) X = self.cmap(X)[:,:,:,:,0:3].reshape(X.shape[0],self.img_size, self.img_size,3)
return X return X
def fit(self, X, y=None): def fit(self, X, y=None):
self.transformer = GramianAngularField(image_size=self.img_size, self.transformer = GramianAngularField(image_size=self.img_size,
method=self.method, method=self.method,
flatten=self.flatten) flatten=self.flatten)
self.transformer.fit(X) self.transformer.fit(X)
return self return self
class Recurrence_transform(BaseEstimator, TransformerMixin): class Recurrence_transform(BaseEstimator, TransformerMixin):
def __init__(self, output_size=128, dimension=1, time_delay=6, flatten=False): def __init__(self, output_size=128, dimension=1, time_delay=6, flatten=False):
self.output_size = output_size self.output_size = output_size
...@@ -61,11 +61,11 @@ class Recurrence_transform(BaseEstimator, TransformerMixin): ...@@ -61,11 +61,11 @@ class Recurrence_transform(BaseEstimator, TransformerMixin):
self.time_delay = time_delay self.time_delay = time_delay
self.cmap = plt.get_cmap('jet') self.cmap = plt.get_cmap('jet')
self.transformer = None self.transformer = None
def transform(self, X, y=None): def transform(self, X, y=None):
if type(X[0]) == pd.core.series.Series: if type(X[0]) == pd.core.series.Series:
X = np.asarray([x.values for x in X]) X = np.asarray([x.values for x in X])
X = np.asarray([self.approximator.transform(x.reshape(1,-1))for x in X if x.shape[0] >= self.output_size]) X = np.asarray([self.approximator.transform(x.reshape(1,-1))for x in X if x.shape[0] >= self.output_size])
X = np.asarray([self.transformer.transform(x) for x in X if x.shape[0]]) X = np.asarray([self.transformer.transform(x) for x in X if x.shape[0]])
if self.flatten == True: if self.flatten == True:
...@@ -77,7 +77,7 @@ class Recurrence_transform(BaseEstimator, TransformerMixin): ...@@ -77,7 +77,7 @@ class Recurrence_transform(BaseEstimator, TransformerMixin):
def fit(self, X, y=None): def fit(self, X, y=None):
self.approximator = PiecewiseAggregateApproximation(output_size=self.output_size, self.approximator = PiecewiseAggregateApproximation(output_size=self.output_size,
window_size=None, window_size=None,
overlapping=False) overlapping=False)
self.approximator.fit(X) self.approximator.fit(X)
self.transformer = RecurrencePlot(dimension=self.dimension, self.transformer = RecurrencePlot(dimension=self.dimension,
...@@ -85,51 +85,51 @@ class Recurrence_transform(BaseEstimator, TransformerMixin): ...@@ -85,51 +85,51 @@ class Recurrence_transform(BaseEstimator, TransformerMixin):
flatten=self.flatten) flatten=self.flatten)
self.transformer.fit(X) self.transformer.fit(X)
return self return self
class PiecewiseApproximation_transform(BaseEstimator, TransformerMixin): class PiecewiseApproximation_transform(BaseEstimator, TransformerMixin):
def __init__(self, output_size=1000, overlapping=False, window_size=None): def __init__(self, output_size=1000, overlapping=False, window_size=None):
self.output_size = output_size self.output_size = output_size
self.overlapping = overlapping self.overlapping = overlapping
self.window_size = window_size self.window_size = window_size
self.transformer = None self.transformer = None
def transform(self, X, y=None): def transform(self, X, y=None):
if type(X[0]) == pd.core.series.Series: if type(X[0]) == pd.core.series.Series:
X = np.asarray([x.values for x in X]) X = np.asarray([x.values for x in X])
X = np.asarray([self.transformer.transform(x.reshape(1,-1)) for x in X if x.shape[0] >= self.output_size]) X = np.asarray([self.transformer.transform(x.reshape(1,-1)) for x in X if x.shape[0] >= self.output_size])
X = X.reshape(X.shape[0], X.shape[2], X.shape[1]) X = X.reshape(X.shape[0], X.shape[2], X.shape[1])
return X return X
def fit(self, X, y=None): def fit(self, X, y=None):
self.transformer = PiecewiseAggregateApproximation(output_size=self.output_size, self.transformer = PiecewiseAggregateApproximation(output_size=self.output_size,
window_size=self.window_size, window_size=self.window_size,
overlapping=self.overlapping) overlapping=self.overlapping)
self.transformer.fit(X) self.transformer.fit(X)
return self return self
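A hedged usage sketch of this PAA wrapper; series_list is a hypothetical list of pandas Series, and cycles shorter than output_size are silently dropped by the transform above.
# Hedged sketch: resampling variable-length cycles to a fixed length of 1000.
# series_list is assumed to be a list of pandas Series of length >= 1000.
paa = PiecewiseApproximation_transform(output_size=1000)
X_paa = paa.fit_transform(series_list)   # shape (n_kept, 1000, 1)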
class SymbolicAggregate_transform(BaseEstimator, TransformerMixin): class SymbolicAggregate_transform(BaseEstimator, TransformerMixin):
def __init__(self, n_bins=7, strategy='uniform', alphabet='ordinal'): def __init__(self, n_bins=5, strategy='uniform', alphabet='ordinal'):
self.n_bins = n_bins self.n_bins = n_bins
self.strategy = strategy self.strategy = strategy
self.alphabet = alphabet self.alphabet = alphabet
self.transformer = None self.transformer = None
def transform(self, X, y=None): def transform(self, X, y=None):
X = np.asarray([self.transformer.transform(x.reshape(1,-1)).astype(float) if np.max(x) - np.min(x) != 0 else np.zeros((1,x.shape[0])) for x in X]) X = np.asarray([self.transformer.transform(x.reshape(1,-1)).astype(float) if np.max(x) - np.min(x) != 0 else np.zeros((1,x.shape[0])) for x in X])
X = X.reshape(X.shape[0], X.shape[2], X.shape[1]) X = X.reshape(X.shape[0], X.shape[2], X.shape[1])
return X return X
def fit(self, X, y=None): def fit(self, X, y=None):
self.transformer = SymbolicAggregateApproximation(n_bins=self.n_bins, self.transformer = SymbolicAggregateApproximation(n_bins=self.n_bins,
strategy=self.strategy, strategy=self.strategy,
alphabet=self.alphabet) alphabet=self.alphabet)
self.transformer.fit(X) self.transformer.fit(X)
return self return self
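A hedged sketch of the SAX wrapper applied to the PAA output from the previous sketch (2D input, one row per cycle); constant rows are mapped to zeros rather than raising, as handled in the transform method above.
# Hedged sketch: SAX discretisation of the hypothetical PAA output X_paa.
sax = SymbolicAggregate_transform(n_bins=5, strategy='uniform', alphabet='ordinal')
X_sax = sax.fit_transform(X_paa[:, :, 0])   # shape (n_kept, 1000, 1)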
class SymbolicFourrier_transform(BaseEstimator, TransformerMixin): class SymbolicFourrier_transform(BaseEstimator, TransformerMixin):
def __init__(self, n_coefs=20, n_bins=7, strategy='uniform', drop_sum=False, def __init__(self, n_coefs=10, n_bins=5, strategy='uniform', drop_sum=True,
anova=True, norm_mean=True, norm_std=False, alphabet='ordinal'): anova=True, norm_mean=False, norm_std=False, alphabet='ordinal'):
self.n_coefs = n_coefs self.n_coefs = n_coefs
self.n_bins = n_bins self.n_bins = n_bins
self.strategy = strategy self.strategy = strategy
...@@ -139,12 +139,12 @@ class SymbolicFourrier_transform(BaseEstimator, TransformerMixin): ...@@ -139,12 +139,12 @@ class SymbolicFourrier_transform(BaseEstimator, TransformerMixin):
self.norm_mean = norm_mean self.norm_mean = norm_mean
self.norm_std = norm_std self.norm_std = norm_std
self.transformer = None self.transformer = None
def transform(self, X, y=None): def transform(self, X, y=None):
X = np.asarray([self.transformer.transform(x.reshape(1,-1)).astype(float) if np.max(x) - np.min(x) != 0 else np.zeros((1,x.shape[0])) for x in X]) X = np.asarray([self.transformer.transform(x.reshape(1,-1)).astype(float) if np.max(x) - np.min(x) != 0 else np.zeros((1,x.shape[0])) for x in X])
X = X.reshape(X.shape[0], X.shape[2], X.shape[1]) X = X.reshape(X.shape[0], X.shape[2], X.shape[1])
return X return X
def fit(self, X, y): def fit(self, X, y):
self.transformer = SymbolicFourierApproximation(n_coefs=self.n_coefs, n_bins=self.n_bins, self.transformer = SymbolicFourierApproximation(n_coefs=self.n_coefs, n_bins=self.n_bins,
strategy=self.strategy, alphabet=self.alphabet, strategy=self.strategy, alphabet=self.alphabet,
...@@ -153,24 +153,24 @@ class SymbolicFourrier_transform(BaseEstimator, TransformerMixin): ...@@ -153,24 +153,24 @@ class SymbolicFourrier_transform(BaseEstimator, TransformerMixin):
X = X.reshape(X.shape[0],X.shape[1]) X = X.reshape(X.shape[0],X.shape[1])
self.transformer.fit(X,y) self.transformer.fit(X,y)
return self return self
class MatrixProfile_transform(): class MatrixProfile_transform():
def __init__(self, window_size=0.075): def __init__(self, window_size=0.15):
self.window_size = window_size self.window_size = window_size
def transform(self, X, y=None): def transform(self, X, y=None):
if type(X[0]) == pd.core.series.Series: if type(X[0]) == pd.core.series.Series:
X = np.asarray([x.values for x in X]) X = np.asarray([x.values for x in X])
X = np.asarray([mp.compute(x.reshape(-1),windows=x.shape[0]*self.window_size)['mp'].reshape(1,-1) for x in X]) X = np.asarray([mp.compute(x.reshape(-1),windows=x.shape[0]*self.window_size)['mp'].reshape(1,-1) for x in X])
X = X.reshape(X.shape[0], X.shape[2], X.shape[1]) X = X.reshape(X.shape[0], X.shape[2], X.shape[1])
return X return X
def fit(self, X, y=None): def fit(self, X, y=None):
return self return self
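A hedged sketch of the underlying matrixprofile call made by this wrapper, assuming a 1D numpy array ts; the wrapper passes a fractional window (x.shape[0]*window_size), shown here cast explicitly to an integer.
import matrixprofile as mp

window = int(len(ts) * 0.15)               # fractional window turned into a sample count
profile = mp.compute(ts.reshape(-1), windows=window)
distances = profile['mp']                  # matrix profile, length len(ts) - window + 1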
class ROCKET_transform(BaseEstimator, TransformerMixin): class ROCKET_transform(BaseEstimator, TransformerMixin):
def __init__(self, n_kernels=15000, kernel_sizes=(5,7,9), flatten=False): def __init__(self, n_kernels=20000, kernel_sizes=(5,7,9,11), flatten=False):
self.flatten = flatten self.flatten = flatten
self.n_kernels = n_kernels self.n_kernels = n_kernels
self.kernel_sizes = kernel_sizes self.kernel_sizes = kernel_sizes
...@@ -184,7 +184,7 @@ class ROCKET_transform(BaseEstimator, TransformerMixin): ...@@ -184,7 +184,7 @@ class ROCKET_transform(BaseEstimator, TransformerMixin):
else: else:
X = X.reshape(X.shape[0], X.shape[1], 1) X = X.reshape(X.shape[0], X.shape[1], 1)
return X return X
def fit(self, X, y=None): def fit(self, X, y=None):
self.transformer = ROCKET(n_kernels=self.n_kernels, kernel_sizes=self.kernel_sizes) self.transformer = ROCKET(n_kernels=self.n_kernels, kernel_sizes=self.kernel_sizes)
X = X.reshape(X.shape[0],X.shape[1]) X = X.reshape(X.shape[0],X.shape[1])
......