Commit 78d37186 authored by Elias Elias's avatar Elias Elias

remove all files from git tracking

parent 8ca184ea
cmake_minimum_required(VERSION 3.16)
project(similarity LANGUAGES C CXX)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -g -O3 -march=native -funroll-loops -fno-omit-frame-pointer -fopenmp")
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release CACHE STRING "" FORCE)
endif()
set(MAIN_SOURCE_FILES
src/utils.cpp
src/clust.cpp
src/lifelines.cpp
src/ts.cpp
src/distances.cpp
src/dba.cpp
src/kmeans.cpp
src/cah.cpp
src/main.cpp
)
add_executable(similarity.exe ${MAIN_SOURCE_FILES})
target_include_directories(similarity.exe PRIVATE
${PROJECT_SOURCE_DIR}/include
)
find_path(BAYESOPT_INCLUDE_DIR
"bayesopt/bayesoptbase.hpp"
HINTS /usr/local/include
)
find_path(NLOPT_INCLUDE_DIR
"nlopt.hpp"
HINTS /usr/local/include /usr/include
)
if(NOT BAYESOPT_INCLUDE_DIR)
message(FATAL_ERROR "Headers BayesOpt introuvables dans /usr/local/include")
endif()
if(NOT NLOPT_INCLUDE_DIR)
message(FATAL_ERROR "Headers NLopt introuvables (vérifier l'installation)")
endif()
target_include_directories(similarity.exe PRIVATE
${BAYESOPT_INCLUDE_DIR}
${NLOPT_INCLUDE_DIR}
)
find_library(BAYESOPT_LIB
NAMES bayesopt
HINTS /usr/local/lib
)
find_library(NLOPT_LIB
NAMES nlopt
HINTS /usr/local/lib /usr/lib
)
if(NOT BAYESOPT_LIB)
message(FATAL_ERROR "Bibliothèque BayesOpt introuvable dans /usr/local/lib")
endif()
if(NOT NLOPT_LIB)
message(FATAL_ERROR "Bibliothèque NLopt introuvable dans /usr/local/lib ou /usr/lib")
endif()
find_package(OpenMP REQUIRED)
target_link_libraries(similarity.exe PRIVATE
OpenMP::OpenMP_CXX
${BAYESOPT_LIB}
${NLOPT_LIB}
pthread
dl
m
)
Make sure to put csv data file in data/ folder
Make sure to install both libraries in simseqcomp/external :
- mkdir external
- cd external
- git clone https://github.com/stevengj/nlopt.git
- cd nlopt
- mkdir build
- cd build
- cmake .. -DBUILD_SHARED_LIBS=OFF
- make -j
Also in external :
- git clone https://github.com/rmcantin/bayesopt
- cd bayesopt
- mkdir build
- cd build
- cmake -S .. -B build -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_CXX_STANDARD=11 -DCMAKE_CXX_STANDARD_REQUIRED=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_PREFIX=$HOME/local
- conda install -c conda-forge nlohmann_json
In simseqcomp :
- mkdir build
- cd build
- cmake -S .. -B build -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_CXX_STANDARD=11 -DCMAKE_CXX_STANDARD_REQUIRED=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_PREFIX=$HOME/local
#pragma once
#include <vector>
#include "clust.hpp"
#include "ts.hpp"
using namespace std;
class Hierarchical : public ICluster {
public:
vector<vector<int>> clusters_;
vector<int> labels_;
double sil_ = 0.0;
double coph_corr_ = 0.0;
enum LinkageType { SINGLE,
COMPLETE,
AVERAGE };
Hierarchical(size_t k, double tau, double pt, double pe,
int sigma, DISTANCE mode);
void fit(const std::vector<TimedSequence>& data, vector<double> delta_cond, LinkageType type);
vector<int> fit_predict(const vector<TimedSequence>& data, vector<double> delta_cond, LinkageType type);
void print_clusters() const;
double silhouette(const vector<TimedSequence>& data, vector<double> delta_cond);
double copheneticCorrelation(
const vector<vector<double>>& distances,
const vector<vector<double>>& coph) const;
private:
struct Merge {
size_t idx1;
size_t idx2;
double distance;
};
std::vector<Merge> history_;
size_t k_ = 0;
double tau_ = 0.0;
double pt_ = 0.0;
double pe_ = 0.0;
int sigma_ = 0;
};
#pragma once
#include <vector>
#include "distances.hpp"
using namespace std;
class ICluster {
protected:
DISTANCE mode_;
ICluster(DISTANCE mode);
virtual ~ICluster() = default;
};
class Naive : public ICluster {
public:
size_t k_ = 0;
vector<pair<size_t, size_t>> links_;
vector<vector<int>> clusters_;
vector<int> labels_;
Naive(double tau, double pt, double pe,
int sigma, DISTANCE mode);
void fit(const vector<TimedSequence> data, vector<double> delta_cond);
void print_clusters() const;
vector<double> silhouette(const vector<TimedSequence>& data, vector<double> delta_cond);
private:
double tau_ = 0.0;
double pt_ = 0.0;
double pe_ = 0.0;
double delta_ = 0.0;
int sigma_ = 0;
};
#pragma once
#include "ts.hpp"
TimedSequence dba(const vector<TimedSequence>& ts, size_t max_iter, double tau, double pt, double pe, int sigma, vector<double> delta_cond, mt19937 rng);
#pragma once
#include <vector>
#include "ts.hpp"
using namespace std;
enum DISTANCE { EUCLIDEAN,
MANHATTAN,
DTW,
DROP_DTW };
double euclidean(size_t n, size_t m, const vector<double>& x, const vector<double>& y);
double manhattan(size_t n, size_t m, const vector<double>& x, const vector<double>& y);
struct DROP {
vector<double> dz_;
vector<double> dx_;
vector<double> C_;
vector<double> Dzx_;
vector<double> Dzm_;
vector<double> Dmx_;
vector<double> Dmm_;
vector<double> D_part_;
vector<double> D_;
vector<double> D_align_;
vector<double> D_drop_;
vector<char> M_;
vector<char> pi_;
double delta_ = 0.0;
vector<double> delta_cond_;
vector<double> delta_cost_;
size_t n_ = 0;
size_t m_ = 0;
size_t n_max_ = 0;
size_t m_max_ = 0;
int i_begin_ = -1;
int j_begin_ = -1;
DROP(const size_t n, const size_t m, vector<double> delta_cond);
void reset_mat(size_t n, size_t m);
double disttime(const double t1, const double t2);
double distevent(const size_t nevent,
const double pt, const double pe,
const vector<double>& w,
const vector<double>& da,
const vector<double>& db,
double ta, double tb);
void cost(size_t n, size_t m,
size_t nevent,
const double pt, const double pe,
const TimedSequence& s1,
const TimedSequence& s2);
void drop_cost(size_t n, size_t m,
size_t nevent,
const double pt, const double pe,
const TimedSequence& s1,
const TimedSequence& s2);
double traceback(size_t n, size_t m);
void print_traceback_path(size_t n, size_t m);
double distance(
size_t n, size_t m,
const double tau,
const int sigma,
const vector<double>& tZ,
const vector<double>& tX);
DROP() {} // constructeur par défaut
double compute(const double tau,
const double pt, const double pe,
const int sigma,
const TimedSequence& s1,
const TimedSequence& s2);
};
double dtw(const TimedSequence& s1,
const TimedSequence& s2,
DISTANCE mode);
#pragma once
#include <omp.h>
#include <random>
#include <vector>
#include "clust.hpp"
#include "distances.hpp"
#include "ts.hpp"
#include "utils.hpp"
class KMeans : public ICluster {
public:
vector<TimedSequence> cluster_centers_;
vector<int> labels_;
size_t n_iter_ = 0;
double inertia_ = 0.0;
KMeans(size_t k, size_t max_iter, size_t barycenter_iter,
double tau, double pt, double pe,
int sigma, DISTANCE mode, mt19937& rng);
void set_seed(unsigned seed);
mt19937 get_rng() const;
void set_rng(const std::mt19937& r);
void fit(const vector<TimedSequence>& data, const vector<double>& delta_cond, const double th);
vector<int> predict(const vector<TimedSequence>& data, vector<double> delta_cond);
vector<int> fit_predict(const vector<TimedSequence>& data, const vector<double>& delta_cond, const double th);
void print_cluster() const;
vector<double> distances(TimedSequence& ts, vector<double> delta_cond);
double score(const vector<TimedSequence>& data, vector<double> delta_cond);
double silhouette(const vector<TimedSequence>& data, vector<double> delta_cond);
private:
size_t k_ = 0;
size_t max_iter_ = 0;
size_t barycenter_iter_ = 0;
double tau_ = 0.0;
double pt_ = 0.0;
double pe_ = 0.0;
int sigma_ = 0;
mt19937& rng_;
double max_fdist_ = 0.0;
};
#pragma once
#include "utils.hpp"
double lifelines(
const size_t k,
const CsvTable& table,
const vector<int>& labels,
const string& output_csv,
const string& output_png,
bool plot);
#pragma once
#include <iostream>
#include <map>
#include <string>
#include <vector>
#include "utils.hpp"
using namespace std;
extern int ts_id_max;
class TimedSequence {
public:
int id;
vector<string> features_name;
vector<double> events;
vector<double> times;
size_t len = 0;
size_t nevent = 0;
double t_min = 0.0;
double t_max = 0.0;
TimedSequence();
TimedSequence(int ident,
const vector<string>& fn,
const vector<double>& e,
const vector<double>& t);
bool operator==(const TimedSequence& ts) const;
bool operator!=(const TimedSequence& ts) const;
void print() const;
};
using SequenceMap = map<string, TimedSequence>;
SequenceMap build_timed_sequences(const CsvTable& table,
std::unordered_map<std::string, NormParam>& params_feat,
NormParam& param_time,
bool normalize);
#pragma once
#include <iomanip>
#include <iostream>
#include <random>
#include <type_traits>
#include <unordered_map>
#include <vector>
using namespace std;
using CsvRow = unordered_map<string, string>;
using CsvTable = vector<CsvRow>;
struct Stats {
double sum = 0.0, sum2 = 0.0;
double min = numeric_limits<double>::infinity();
double max = -numeric_limits<double>::infinity();
double count = 0.0;
};
struct NormParam {
double mean = 0.0;
double std = 0.0;
double min = 0.0;
double max = 0.0;
};
template <typename T>
void print_mtx(size_t n, size_t m, const std::vector<T>& a, int width = 12, int precision = 3) {
std::ios oldState(nullptr);
oldState.copyfmt(std::cout);
for (size_t i = 0; i < n; ++i) {
for (size_t j = 0; j < m; ++j) {
const T& val = a[i * m + j];
if constexpr (std::is_same_v<T, char>) {
std::cout << std::setw(width) << static_cast<int>(val);
} else if constexpr (std::is_floating_point_v<T>) {
std::cout
//<< std::fixed << std::scientific
<< std::fixed
<< std::setprecision(precision)
<< std::setw(width)
<< val;
} else {
std::cout << std::setw(width) << val;
}
}
std::cout << '\n';
}
std::cout.copyfmt(oldState);
}
double maxval(const double a, const double b);
double minval(const double a, const double b);
double smoothMin(const std::vector<double>& x, double gamma);
double percentile_mtx(const vector<double>& a, const double p);
double mean(const vector<double>& a);
double stddev(const vector<double>& a);
void z_norm(vector<double>& a);
CsvTable load_csv(const string& filename,
const vector<string>& headers,
char delimiter = ',');
CsvTable filter_features(const CsvTable& table,
const vector<string>& features_to_keep);
import sys
import numpy as np
from sklearn.metrics import silhouette_score
dist_file = sys.argv[1]
labels_file = sys.argv[2]
distances = np.loadtxt(dist_file, delimiter=",")
labels = np.loadtxt(labels_file, dtype=int)
score = silhouette_score(distances, labels, metric="precomputed")
with open("../assets/tmp_score.txt", "w") as f:
f.write(str(score))
import sys
import numpy as np
import matplotlib.pyplot as plt
import umap
# fichiers CSV générés par le C++
dist_file = sys.argv[1]
labels_file = sys.argv[2]
ids_file = sys.argv[3]
output_file = sys.argv[4] # nom du fichier PNG à sauvegarder
# lecture des fichiers
distances = np.loadtxt(dist_file, delimiter=",")
labels = np.loadtxt(labels_file, dtype=int)
point_ids = [line.strip() for line in open(ids_file)]
# UMAP avec distance pré-calculée
reducer = umap.UMAP(n_components=2, metric='precomputed', random_state=42)
embedding = reducer.fit_transform(distances)
plt.figure(figsize=(8, 6))
# scatter avec colormap et valeurs numériques pour clusters
sc = plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap='viridis',
s=50, alpha=0.7)
# barre de couleur verticale pour les clusters
cbar = plt.colorbar(sc)
cbar.set_label("Cluster", rotation=270, labelpad=15)
# ajout des IDs sur le plot
for i, pid in enumerate(point_ids):
plt.text(embedding[i, 0], embedding[i, 1], pid, fontsize=8,
ha='center', va='center')
plt.title("UMAP projection des clusters Drop-DTW")
plt.grid(True)
plt.tight_layout()
# sauvegarde dans le fichier passé en paramètre
plt.savefig(output_file, dpi=300)
print(f"UMAP plot saved to {output_file}")
#include "cah.hpp"
#include <cmath>
#include <limits>
#include <stdexcept>
#include <vector>
#include "clust.hpp"
#include "distances.hpp"
Hierarchical::Hierarchical(size_t k, double tau, double pt, double pe,
int sigma, DISTANCE mode)
: ICluster(mode),
k_(k),
tau_(tau),
pt_(pt),
pe_(pe),
sigma_(sigma) {}
void Hierarchical::fit(const vector<TimedSequence>& data, vector<double> delta_cond, LinkageType type) {
size_t n = data.size();
if (k_ < 1 || k_ > n)
throw invalid_argument("k between 1 and n");
vector<vector<double>> distances(n, vector<double>(n, 0.0));
for (size_t i = 0; i < n; ++i) {
const TimedSequence& s1 = data[i];
for (size_t j = i + 1; j < n; ++j) {
const TimedSequence& s2 = data[j];
DROP buf(s1.len, s2.len, delta_cond);
double d = buf.compute(tau_, pt_, pe_, sigma_, s1, s2);
distances[i][j] = distances[j][i] = d;
}
}
clusters_.assign(n, vector<int>(1));
for (size_t i = 0; i < n; ++i)
clusters_[i][0] = static_cast<int>(i);
vector<vector<double>> coph(n, vector<double>(n, 0.0));
history_.clear();
history_.reserve(n - 1);
while (clusters_.size() > 1) {
size_t bestI = 0, bestJ = 1;
double bestDist = numeric_limits<double>::infinity();
for (size_t i = 0; i < clusters_.size(); ++i) {
for (size_t j = i + 1; j < clusters_.size(); ++j) {
const auto& Ci = clusters_[i];
const auto& Cj = clusters_[j];
double dist_ij;
if (type == SINGLE) {
dist_ij = numeric_limits<double>::infinity();
for (int a : Ci) {
for (int b : Cj) {
double dij = distances[a][b];
if (isfinite(dij))
dist_ij = min(dist_ij, dij);
}
}
if (isinf(dist_ij))
continue;
} else if (type == COMPLETE) {
dist_ij = -numeric_limits<double>::infinity();
bool hasFinite = false;
for (int a : Ci) {
for (int b : Cj) {
double dij = distances[a][b];
if (isfinite(dij)) {
dist_ij = max(dist_ij, dij);
hasFinite = true;
}
}
}
if (!hasFinite)
continue;
} else {
double sum = 0.0;
size_t count = 0;
for (int a : Ci) {
for (int b : Cj) {
double dij = distances[a][b];
if (isfinite(dij)) {
sum += dij;
++count;
}
}
}
if (count == 0)
continue;
dist_ij = sum / count;
}
if (dist_ij < bestDist) {
bestDist = dist_ij;
bestI = i;
bestJ = j;
}
}
}
if (isinf(bestDist)) {
throw runtime_error(
"Unable to merge: all clusters are completely disconnected.");
}
history_.push_back({bestI, bestJ, bestDist});
for (int a : clusters_[bestI])
for (int b : clusters_[bestJ])
coph[a][b] = coph[b][a] = bestDist;
auto& Ci = clusters_[bestI];
auto& Cj = clusters_[bestJ];
Ci.insert(Ci.end(), Cj.begin(), Cj.end());
clusters_.erase(clusters_.begin() + bestJ);
}
vector<vector<int>> clusters_k(n);
for (size_t i = 0; i < n; ++i)
clusters_k[i] = {static_cast<int>(i)};
size_t merges_to_apply = min(n - k_, history_.size());
for (size_t m = 0; m < merges_to_apply; ++m) {
size_t i = history_[m].idx1;
size_t j = history_[m].idx2;
auto& Ai = clusters_k[i];
auto& Aj = clusters_k[j];
Ai.insert(Ai.end(), Aj.begin(), Aj.end());
clusters_k.erase(clusters_k.begin() + j);
}
clusters_.swap(clusters_k);
labels_.assign(n, 0);
for (size_t cid = 0; cid < clusters_.size(); ++cid)
for (int idx : clusters_[cid])
labels_[idx] = cid;
sil_ = silhouette(data, delta_cond);
coph_corr_ = copheneticCorrelation(distances, coph);
}
void Hierarchical::print_clusters() const {
cout << "Clusters (" << clusters_.size() << "):\n";
for (size_t i = 0; i < clusters_.size(); ++i) {
cout << " Cluster " << i << ": " << clusters_[i].size() << " patients\n";
}
}
double Hierarchical::silhouette(const vector<TimedSequence>& data, vector<double> delta_cond) {
const size_t n = data.size();
double sil = 0.0;
size_t count = 0;
for (size_t i = 0; i < n; ++i) {
int ci = labels_[i];
if (ci == -1) {
continue;
}
const TimedSequence& A = data[i];
double sum_a = 0.0;
double count_a = 0.0;
for (size_t j = 0; j < n; ++j) {
if (j == i || labels_[j] != ci || labels_[j] == -1) {
continue;
}
const TimedSequence& B = data[j];
DROP buf(A.len, B.len, delta_cond);
double dist_a = 0.0;
dist_a = buf.compute(tau_, pt_, pe_, sigma_, A, B);
if (!isfinite(dist_a)) {
continue;
}
sum_a += dist_a;
count_a += 1.0;
}
if (count_a == 0) {
count++;
continue;
}
double a_i = sum_a / count_a;
double b_i = numeric_limits<double>::infinity();
for (size_t oc = 0; oc < k_; ++oc) {
if (static_cast<int>(oc) == ci) {
continue;
}
double sum_b = 0.0;
double count_b = 0.0;
double dist_b = 0.0;
for (size_t j = 0; j < n; ++j) {
if (labels_[j] != static_cast<int>(oc) || labels_[j] == -1) {
continue;
}
const TimedSequence& B = data[j];
DROP buf(A.len, B.len, delta_cond);
dist_b = buf.compute(tau_, pt_, pe_, sigma_, A, B);
if (!isfinite(dist_b)) {
continue;
}
sum_b += dist_b;
count_b += 1.0;
}
if (count_b > 0.0) {
b_i = min(b_i, sum_b / count_b);
}
}
if (!isfinite(b_i)) {
b_i = 0.0;
}
double max_ab = max(a_i, b_i);
if (max_ab > 0) {
double sil_a = (b_i - a_i) / max_ab;
sil += sil_a;
}
count++;
};
return sil / static_cast<double>(count);
}
double Hierarchical::copheneticCorrelation(
const vector<vector<double>>& distances,
const vector<vector<double>>& coph) const {
size_t n = distances.size();
double sum_d = 0.0, sum_c = 0.0;
double sum_dd = 0.0, sum_cc = 0.0, sum_dc = 0.0;
size_t m = 0;
for (size_t i = 0; i < n; ++i) {
for (size_t j = i + 1; j < n; ++j) {
double di = distances[i][j];
if (isinf(di)) {
continue;
}
double ci = coph[i][j];
sum_d += di;
sum_c += ci;
sum_dd += di * di;
sum_cc += ci * ci;
sum_dc += di * ci;
++m;
}
}
double mean_d = sum_d / m;
double mean_c = sum_c / m;
double cov_dc = sum_dc / m - mean_d * mean_c;
double var_d = sum_dd / m - mean_d * mean_d;
double var_c = sum_cc / m - mean_c * mean_c;
return cov_dc / (sqrt(var_d) * sqrt(var_c));
}
#include "clust.hpp"
#include <numeric>
#include <queue>
ICluster::ICluster(DISTANCE mode)
:mode_(mode) {}
Naive::Naive(double tau, double pt, double pe,
int sigma, DISTANCE mode)
: ICluster(mode),
tau_(tau),
pt_(pt),
pe_(pe),
sigma_(sigma) {}
void Naive::fit(const vector<TimedSequence> data, vector<double> delta_cond) {
const size_t n = data.size();
links_.clear();
clusters_.clear();
labels_.clear();
links_.reserve(n);
for (size_t i = 0; i < n; ++i) {
double best_d = numeric_limits<double>::infinity();
size_t best_j = i;
const TimedSequence& s1 = data[i];
for (size_t j = 0; j < n; ++j) {
if (j == i) continue;
const TimedSequence& s2 = data[j];
DROP buf(s1.len, s2.len, delta_cond);
double d = buf.compute(tau_, pt_, pe_, sigma_, s1, s2);
if (d < best_d) {
best_d = d;
best_j = j;
}
}
links_.emplace_back(i, best_j);
}
vector<vector<int>> adj(n);
for (auto [i, j] : links_) {
adj[i].push_back(j);
adj[j].push_back(i);
}
vector<bool> seen(n, false);
for (size_t i = 0; i < n; ++i) {
if (seen[i]) continue;
vector<int> comp;
queue<size_t> q;
q.push(i);
seen[i] = true;
while (!q.empty()) {
size_t u = q.front();
q.pop();
comp.push_back(u);
for (size_t v : adj[u]) {
if (!seen[v]) {
seen[v] = true;
q.push(v);
}
}
}
clusters_.push_back(move(comp));
}
labels_.resize(n);
for (size_t c = 0; c < clusters_.size(); ++c) {
for (size_t idx : clusters_[c]) {
labels_[idx] = c;
}
}
k_ = clusters_.size();
}
void Naive::print_clusters() const {
for (size_t c = 0; c < clusters_.size(); ++c) {
auto& comp = clusters_[c];
cout << " Cluster " << c
<< " (n=" << comp.size() << "): ";
for (size_t idx : comp) {
cout << idx << " ";
}
cout << "\n";
}
}
vector<double> Naive::silhouette(const vector<TimedSequence>& data, vector<double> delta_cond) {
const size_t n = data.size();
const size_t k = clusters_.size();
vector<vector<double>> cluster_scores(k);
for (size_t i = 0; i < n; ++i) {
if (labels_[i] == static_cast<int>(k)) {
continue;
}
int ci = labels_[i];
const TimedSequence& A = data[i];
double sum_a = 0.0;
double count_a = 0.0;
for (size_t j = 0; j < n; ++j) {
if (j == i || labels_[j] != ci || labels_[j] == static_cast<int>(k)) {
continue;
}
const TimedSequence& B = data[j];
DROP buf(A.len, B.len, delta_cond);
sum_a += buf.compute(tau_, pt_, pe_, sigma_, A, B);
count_a += 1.0;
}
double a_i = (count_a > 0.0 ? sum_a / count_a : 0.0);
double b_i = numeric_limits<double>::infinity();
for (size_t oc = 0; oc < k; ++oc) {
if (static_cast<int>(oc) == ci) {
continue;
}
double sum_b = 0.0, count_b = 0.0;
for (size_t j = 0; j < n; ++j) {
if (labels_[j] != static_cast<int>(oc) || labels_[j] == static_cast<int>(k)) {
continue;
}
const TimedSequence& B = data[j];
DROP buf(A.len, B.len, delta_cond);
sum_b += buf.compute(tau_, pt_, pe_, sigma_, A, B);
count_b += 1.0;
}
if (count_b > 0.0) {
b_i = min(b_i, sum_b / count_b);
}
}
if (!isfinite(b_i)) {
b_i = 0.0;
}
double max_ab = max(a_i, b_i);
double s_i = (max_ab > 0.0 ? (b_i - a_i) / max_ab : 0.0);
cluster_scores[ci].push_back(s_i);
}
vector<double> sil(k, -1.0);
for (size_t c = 0; c < k; ++c) {
auto& scores = cluster_scores[c];
if (!scores.empty()) {
double sum = accumulate(scores.begin(), scores.end(), 0.0);
sil[c] = sum / static_cast<double>(scores.size());
}
}
return sil;
}
\ No newline at end of file
#include "dba.hpp"
#include "distances.hpp"
TimedSequence dba(const vector<TimedSequence>& ts, size_t max_iter, double tau, double pt, double pe, int sigma, vector<double> delta_cond, mt19937 rng) {
size_t ts_size = ts.size();
if (ts_size == 0) {
throw invalid_argument("Sequences list is empty");
} else if (ts_size == 1) {
return ts[0];
}
size_t nevent0 = ts[0].nevent;
float temp_len = 0;
size_t keep = 0;
size_t max_len = 0;
size_t min_len = -1;
for (size_t i = 1; i < ts_size; ++i) {
if (ts[i].nevent != nevent0) {
cout << ts[i].nevent << " != " << nevent0 << endl;
cout << ts[i].id << endl;
throw invalid_argument("All sequences must have the same number of event attributes");
}
if (ts[i].len > max_len) {
max_len = ts[i].len;
}
if (ts[i].len < min_len) {
min_len = ts[i].len;
}
temp_len += ts[i].len;
}
temp_len /= (ts_size * 1.0);
float temp_min = numeric_limits<float>::infinity();
for (size_t i = 0; i < ts_size; ++i) {
float abs_len = abs(ts[i].len - temp_len);
if (abs_len < temp_min) {
temp_min = abs_len;
keep = i;
}
}
TimedSequence avg = ts[keep];
const size_t nevent = avg.nevent;
size_t n = avg.len;
vector<double> sum_events(n * nevent, 0.0);
vector<double> sum_times(n, 0.0);
vector<int> counts(n, 0);
uniform_int_distribution<size_t> dist(0, ts_size - 1);
bool changed = true;
size_t iter = 0;
for (iter = 0; iter < max_iter && changed; ++iter) {
TimedSequence prev_avg = avg;
n = avg.len;
bool added = false;
sum_events.assign(n * nevent, 0.0);
sum_times.assign(n, 0.0);
counts.assign(n, 0);
size_t count = 0;
for (auto const& seq : ts) {
size_t m = seq.len;
DROP buf(avg.len, seq.len, delta_cond);
count++;
auto dist = buf.compute(tau, pt, pe, sigma, avg, seq);
// cout << dist << endl;
if (!isfinite(dist)) {
continue;
}
int i = static_cast<int>(buf.i_begin_) - 1;
int j = static_cast<int>(buf.j_begin_) - 1;
while (i > -1 && j > -1) {
int state = buf.M_[i * m + j];
switch (state) {
case 1: {
// match : moyenne avg[i-1] & seq[j-1]
for (size_t o = 0; o < nevent; ++o)
sum_events[i * nevent + o] += avg.events[i * nevent + o] + seq.events[j * nevent + o];
sum_times[i] += avg.times[i] + seq.times[j];
counts[i] += 2;
added = true;
break;
}
case 2: {
// drop X: ne prend que avg[i-1]
for (size_t o = 0; o < nevent; ++o)
sum_events[i * nevent + o] += avg.events[i * nevent + o];
sum_times[i] += avg.times[i];
counts[i] += 1;
added = true;
break;
}
case 3: {
// drop Z: ne prend que seq[j-1]
for (size_t o = 0; o < nevent; ++o)
sum_events[i * nevent + o] += seq.events[j * nevent + o];
sum_times[i] += seq.times[j];
counts[i] += 1;
added = true;
break;
}
case 4: {
break;
}
case 5: {
for (size_t o = 0; o < nevent; ++o) {
double w = exp((abs(avg.events[i * nevent + o] - seq.events[j * nevent + o]) - buf.delta_cond_[o]));
sum_events[i * nevent + o] += (avg.events[i * nevent + o] + seq.events[j * nevent + o]) * w;
}
sum_times[i] += avg.times[i] + seq.times[j];
counts[i] += 2;
added = true;
break;
}
default:
i = -1;
j = -1; // terminaison
}
double diag = buf.D_[i * (m + 1) + j];
double left = buf.D_[(i + 1) * (m + 1) + j];
double top = buf.D_[i * (m + 1) + (j + 1)];
if (left < diag) {
j--;
} else if (top < diag) {
i--;
} else {
i--;
j--;
}
}
}
if (!added) {
avg = ts[dist(rng)];
} else {
vector<double> merged_events;
vector<double> merged_times;
merged_events.reserve(n * nevent);
merged_times.reserve(n);
size_t idx = 0;
while (idx < n) {
if (counts[idx] == 0) {
idx++;
continue;
}
double t0 = sum_times[idx] / counts[idx];
vector<double> raw_sum_ev(
sum_events.begin() + idx * nevent,
sum_events.begin() + idx * nevent + nevent);
double raw_sum_t = sum_times[idx];
int total_count = counts[idx];
size_t j = idx + 1;
while (j < n && abs((sum_times[j] / counts[j]) - t0) <= 1e-9) {
for (size_t o = 0; o < nevent; ++o) {
raw_sum_ev[o] += sum_events[j * nevent + o];
}
raw_sum_t += sum_times[j];
total_count += counts[j];
++j;
}
for (size_t o = 0; o < nevent; ++o) {
double avg_e = raw_sum_ev[o] / total_count;
merged_events.push_back(avg_e);
}
double avg_t = raw_sum_t / total_count;
merged_times.push_back(avg_t);
idx = j;
}
avg = TimedSequence(ts_id_max, avg.features_name, merged_events, merged_times);
ts_id_max++;
}
n = avg.len;
changed = (avg != prev_avg);
}
return avg;
}
\ No newline at end of file
#include "distances.hpp"
#include <unistd.h>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <iomanip>
#include <iostream>
#include "utils.hpp"
double euclidean(size_t n, size_t m, const vector<double>& x, const vector<double>& y) {
assert(n == m);
double sum = 0;
for (size_t i = 0; i < n; ++i)
sum += (x[i] - y[i]) * (x[i] - y[i]);
return std::sqrt(sum);
}
double manhattan(size_t n, size_t m, const vector<double>& x, const vector<double>& y) {
if (n != m) {
throw runtime_error("manhattan : dimension doesn't match");
}
double dist = 0.0;
for (size_t i = 0; i < n; ++i) {
dist += (x[i] - y[i]);
}
return abs(dist);
}
double dtw(const TimedSequence& s1,
const TimedSequence& s2,
DISTANCE mode) {
size_t n = s1.len;
size_t m = s2.len;
size_t nevent = s1.nevent;
vector<double> D((n + 1) * (m + 1), numeric_limits<double>::infinity());
D[0] = 0.0;
for (size_t i = 1; i <= n; ++i) {
vector<double> s1_e(
s1.events.begin() + (i - 1) * nevent,
s1.events.begin() + (i - 1) * nevent + nevent);
for (size_t j = 1; j <= m; ++j) {
vector<double> s2_e(
s1.events.begin() + (j - 1) * nevent,
s1.events.begin() + (j - 1) * nevent + nevent);
double cost = 0.0;
switch (mode) {
case EUCLIDEAN:
cost = euclidean(n, m, s1_e, s2_e);
break;
default:
cost = manhattan(n, m, s1_e, s2_e);
break;
}
cost += abs(s1.times[i] - s2.times[j]);
double bestPrev = min({D[(i - 1) * m + j],
D[i * m + (j - 1)],
D[(i - 1) * m + (j - 1)]});
D[i * m + j] = cost + bestPrev;
}
}
return D[n * n + m];
}
DROP::DROP(const size_t n, const size_t m, vector<double> delta_cond)
: dz_(n + 1, 0.0),
dx_(m + 1, 0.0),
C_(n * m, 0.0),
Dzx_((n + 1) * (m + 1), numeric_limits<double>::infinity()),
Dzm_((n + 1) * (m + 1), numeric_limits<double>::infinity()),
Dmx_((n + 1) * (m + 1), numeric_limits<double>::infinity()),
Dmm_((n + 1) * (m + 1), numeric_limits<double>::infinity()),
D_part_((n + 1) * (m + 1), numeric_limits<double>::infinity()),
D_((n + 1) * (m + 1), numeric_limits<double>::infinity()),
D_align_((n + 1) * (m + 1), numeric_limits<double>::infinity()),
D_drop_((n + 1) * (m + 1), numeric_limits<double>::infinity()),
M_(n * m, 0),
pi_((n + 1) * (m + 1), -1),
delta_cond_(delta_cond),
delta_cost_(n * m, 0.0),
n_(n),
m_(m) {}
void DROP::reset_mat(size_t n, size_t m) {
n_ = n;
m_ = m;
dz_.assign(n + 1, delta_);
dx_.assign(m + 1, delta_);
C_.assign(n * m, 0.0);
delta_cost_.assign(n * m, 0.0);
M_.assign(n * m, 0);
pi_.assign((n + 1) * (m + 1), -1);
Dzx_.assign((n + 1) * (m + 1), numeric_limits<double>::infinity());
Dzm_.assign((n + 1) * (m + 1), numeric_limits<double>::infinity());
Dmx_.assign((n + 1) * (m + 1), numeric_limits<double>::infinity());
Dmm_.assign((n + 1) * (m + 1), numeric_limits<double>::infinity());
D_part_.assign((n + 1) * (m + 1), numeric_limits<double>::infinity());
D_.assign((n + 1) * (m + 1), numeric_limits<double>::infinity());
}
inline double DROP::disttime(const double t1, const double t2) {
return abs(t1 - t2);
}
double DROP::distevent(const size_t nevent,
const double pt, const double pe,
const vector<double>& w,
const vector<double>& da,
const vector<double>& db,
double ta, double tb) {
double d = 0.0;
double t_sq = 0.0;
for (size_t i = 0; i < nevent; ++i) {
d += (((da[i] - db[i]) * w[i]) * ((da[i] - db[i]) * w[i]));
}
t_sq = (ta - tb) * (ta - tb);
return sqrt((pe * d) + (pt * t_sq));
}
void DROP::cost(size_t n, size_t m,
size_t nevent,
const double pt, const double pe,
const TimedSequence& s1,
const TimedSequence& s2) {
vector<double> buf(nevent, 1.0);
for (size_t i = 0; i < n; ++i) {
vector<double> s1_e(
s1.events.begin() + i * nevent,
s1.events.begin() + i * nevent + nevent);
double s1_t = s1.times[i];
for (size_t j = 0; j < m; ++j) {
vector<double> s2_e(
s2.events.begin() + j * nevent,
s2.events.begin() + j * nevent + nevent);
double s2_t = s2.times[j];
C_[i * m + j] = distevent(nevent, pt, pe, buf, s1_e, s2_e, s1_t, s2_t);
}
}
delta_ = percentile_mtx(C_, 20.0);
}
void DROP::drop_cost(size_t n, size_t m,
size_t nevent,
const double pt, const double pe,
const TimedSequence& s1,
const TimedSequence& s2) {
for (size_t i = 0; i < n; ++i) {
vector<double> s1_e(
s1.events.begin() + i * nevent,
s1.events.begin() + i * nevent + nevent);
double s1_t = s1.times[i];
for (size_t j = 0; j < m; ++j) {
vector<double> buf(nevent, 1.0);
size_t count = 0;
vector<double> s2_e(
s2.events.begin() + j * nevent,
s2.events.begin() + j * nevent + nevent);
double s2_t = s2.times[j];
for (size_t k = 0; k < nevent; ++k) {
if (delta_cond_[k] == 0) {
buf[k] = 1.0;
continue;
}
buf[k] = exp((abs(s1_e[k] - s2_e[k]) - delta_cond_[k]));
count++;
}
delta_cost_[i * m + j] = distevent(nevent, pt, pe, buf, s1_e, s2_e, s1_t, s2_t);
}
}
}
double DROP::traceback(size_t n, size_t m) {
size_t i = 0;
size_t j = 0;
if (isfinite(D_[n * (m + 1) + m])) {
i = n;
j = m;
} else {
int ni = 0;
int mj = 0;
float cn = 0;
float cm = 0;
for (size_t ii = n - 1; ii > 0; --ii) {
if (isfinite(D_[ii * (m + 1) + m])) {
ni = ii;
break;
}
cn++;
}
for (size_t jj = m - 1; jj > 0; --jj) {
if (isfinite(D_[n * (m + 1) + jj])) {
mj = jj;
break;
}
cm++;
}
if (ni == 0 && mj == 0) {
return numeric_limits<double>::infinity();
}
cn /= n;
cm /= m;
if (cn < cm) {
i = ni;
j = m;
} else if (cm < cn) {
i = n;
j = mj;
} else {
if (D_[n * (m + 1) + mj] < D_[ni * (m + 1) + m]) {
i = n;
j = mj;
} else {
i = ni;
j = m;
}
}
}
if (i == 0 && j == 0) {
return numeric_limits<double>::infinity();
}
i_begin_ = i;
j_begin_ = j;
while (i > 0 || j > 0) {
int state = pi_[i * (m + 1) + j];
if (isinf(D_[i * (m + 1) + j]) || state < 0)
break;
M_[(i - 1) * m + (j - 1)] = state + 1;
double diag = D_[(i - 1) * (m + 1) + (j - 1)];
double left = D_[i * (m + 1) + (j - 1)];
double top = D_[(i - 1) * (m + 1) + j];
if (left < diag) {
j--;
} else if (top < diag) {
i--;
} else {
i--;
j--;
}
}
return D_[i_begin_ * (m + 1) + j_begin_];
}
double DROP::distance(size_t n, size_t m,
const double tau,
const int sigma,
const vector<double>& tZ,
const vector<double>& tX) {
const double tau_abs = abs(tau);
Dzx_[0] = 0.0;
Dzm_[0] = 0.0;
Dmx_[0] = 0.0;
Dmm_[0] = 0.0;
D_[0] = 0.0;
D_part_[0] = 0.0;
for (size_t i = 1; i < n + 1; ++i) {
Dmx_[i * (m + 1)] = delta_ + Dmx_[(i - 1) * (m + 1)];
Dmm_[i * (m + 1)] = Dmx_[i * (m + 1)];
D_part_[i * (m + 1)] = Dmx_[i * (m + 1)];
D_[i * (m + 1)] = Dmx_[i * (m + 1)];
}
for (size_t j = 1; j < m + 1; ++j) {
Dzm_[j] = delta_ + Dzm_[j - 1];
Dmm_[j] = Dzm_[j];
D_part_[j] = Dzm_[j];
D_[j] = Dzm_[j];
}
for (size_t i = 1; i < n + 1; ++i) {
size_t offset_i = i * (m + 1);
size_t offset_i_1 = (i - 1) * (m + 1);
size_t offset_C = (i - 1) * m;
for (size_t j = 1; j < m + 1; ++j) {
if (disttime(tZ[i - 1], tX[j - 1]) > tau_abs) {
continue;
}
size_t idx = offset_i + j;
size_t idx_diag = offset_i_1 + (j - 1);
size_t idx_left = offset_i + (j - 1);
size_t idx_top = offset_i_1 + j;
size_t idx_C = offset_C + (j - 1);
double C_ij = C_[idx_C];
double delta_cost_ij = delta_cost_[idx_C];
double m_diag = min({Dzx_[idx_diag], Dzm_[idx_diag], Dmx_[idx_diag], Dmm_[idx_diag], D_part_[idx_diag]});
double m_left_with_z = min({Dzx_[idx_left], Dzm_[idx_left], D_part_[idx_left]});
double m_top_with_x = min({Dzx_[idx_top], Dmx_[idx_top], D_part_[idx_top]});
double m_left_no_z = min(Dmx_[idx_left], Dmm_[idx_left]);
double m_top_no_x = min(Dzm_[idx_top], Dmm_[idx_top]);
double v_zx = C_ij + min({m_diag, m_left_with_z, m_top_with_x});
double v_part = delta_cost_ij + min({m_diag, m_left_with_z, m_top_with_x});
double v_zm = delta_ + m_left_with_z;
double v_mx = delta_ + m_top_with_x;
double v_mm = min(m_top_no_x, m_left_no_z) + delta_;
char pi_min = 0;
double v_min = v_zx;
pi_min = (v_part < v_min) ? 4 : pi_min;
v_min = min(v_min, v_part);
pi_min = (v_zm < v_min) ? 1 : pi_min;
v_min = min(v_min, v_zm);
pi_min = (v_mx < v_min) ? 2 : pi_min;
v_min = min(v_min, v_mx);
pi_min = (v_mm < v_min) ? 3 : pi_min;
v_min = min(v_min, v_mm);
Dzx_[idx] = v_zx;
Dzm_[idx] = v_zm;
Dmx_[idx] = v_mx;
Dmm_[idx] = v_mm;
D_part_[idx] = v_part;
D_[idx] = v_min;
pi_[idx] = pi_min;
}
}
double dist = traceback(n, m);
// cout << "MTX C \n";
// print_mtx(n, m, C_, 12, 4);
// cout << "MTX S \n";
// print_mtx(n, m, delta_cost_, 12, 4);
// cout << "MTX Dzx\n";
// print_mtx(n + 1, m + 1, Dzx_, 12, 4);
// cout << "MTX Dzm\n";
// print_mtx(n + 1, m + 1, Dzm_, 12, 4);
// cout << "MTX Dmx\n";
// print_mtx(n + 1, m + 1, Dmx_, 12, 4);
// cout << "MTX Dmm\n";
// print_mtx(n + 1, m + 1, Dmm_, 12, 4);
// cout << "MTX Dp\n";
// print_mtx(n + 1, m + 1, D_part_, 12, 4);
// cout << "MTX D\n";
// print_mtx(n + 1, m + 1, D_, 12, 4);
// cout << "MTX PI\n";
// print_mtx(n + 1, m + 1, pi_, 4, 2);
// cout << "MTX M\n";
// print_mtx(n, m, M_, 4, 2);
// exit(0);
return dist;
}
double DROP::compute(const double tau,
const double pt, const double pe,
const int sigma,
const TimedSequence& s1,
const TimedSequence& s2) {
if (s1.nevent != s2.nevent) {
cout << s1.nevent << " " << s2.nevent << endl;
throw runtime_error("DROP::compute : dimension doesn't match");
}
const TimedSequence* A = &s1;
const TimedSequence* B = &s2;
size_t n = A->len;
size_t m = B->len;
size_t nevent = A->nevent;
int new_sigma = sigma;
cost(n, m, nevent, pt, pe, *A, *B);
drop_cost(n, m, nevent, pt, pe, *A, *B);
double result = distance(n, m, tau, new_sigma, A->times, B->times);
// print_mtx(n + 1, m + 1, D_, 12, 3);
// cout << endl;
// print_mtx(n + 1, m + 1, pi_, 4, 2);
// cout << endl;
// print_mtx(n, m, M_, 4, 2);
// cout << result << endl;
// exit(0);
return result;
}
\ No newline at end of file
#include "kmeans.hpp"
#include <algorithm>
#include <cassert>
#include <limits>
#include <numeric>
#include <optional>
#include <random>
#include <unordered_set>
#include <vector>
#include "dba.hpp"
#include "distances.hpp"
#include "utils.hpp"
KMeans::KMeans(size_t n_clusters, size_t max_iter, size_t barycenter_iter,
double tau, double pt, double pe,
int sigma, DISTANCE mode, mt19937& rng)
: ICluster(mode),
k_(n_clusters),
max_iter_(max_iter),
barycenter_iter_(barycenter_iter),
tau_(tau),
pt_(pt),
pe_(pe),
sigma_(sigma),
rng_(rng) {
if (k_ == 0) {
throw invalid_argument("KMeans: n_clusters must be > 0");
}
}
void KMeans::set_seed(unsigned seed) {
rng_.seed(seed);
}
mt19937 KMeans::get_rng() const {
return rng_;
}
void KMeans::set_rng(const mt19937& r) {
rng_ = r;
}
void KMeans::fit(const vector<TimedSequence>& data,
const vector<double>& delta_cond,
const double th) {
const size_t n = data.size();
if (n == 0) {
throw runtime_error("KMeans::fit : empty data");
}
if (k_ == 0 || k_ > n) {
throw runtime_error("KMeans::fit : k invalid");
}
int num_threads = omp_get_max_threads();
omp_set_num_threads(num_threads);
cluster_centers_.clear();
cluster_centers_.reserve(k_ + 1);
labels_.assign(n, -1);
uniform_int_distribution<size_t> uni(0, n - 1);
cluster_centers_.push_back(data[uni(rng_)]);
for (size_t c = 1; c < k_ + 1; ++c) {
vector<double> distances(n, 0.0);
double sum = 0.0;
#pragma omp parallel for reduction(+:sum) schedule(static)
for (size_t i = 0; i < n; ++i) {
const auto& seq = data[i];
double dmin = numeric_limits<double>::infinity();
size_t count_inf = 0;
size_t centers = cluster_centers_.size();
for (size_t j = 0; j < centers; ++j) {
const auto& cent = cluster_centers_[j];
DROP buf(cent.len, seq.len, delta_cond);
double dist = buf.compute(tau_, pt_, pe_, sigma_, cent, seq);
if (isinf(dist)) {
++count_inf;
} else {
dmin = min(dmin, dist);
}
}
if (count_inf < centers) {
distances[i] = dmin;
sum += dmin;
}
}
vector<double> prob(n, 0.0);
#pragma omp parallel for schedule(static)
for (size_t i = 0; i < n; ++i) {
prob[i] = distances[i] / sum;
}
discrete_distribution<> rd(prob.begin(), prob.end());
cluster_centers_.push_back(data[rd(rng_)]);
}
bool centers_moved = true;
n_iter_ = 0;
for (size_t iter = 0; iter < max_iter_ && centers_moved; ++iter) {
centers_moved = false;
#pragma omp parallel for schedule(static)
for (size_t i = 0; i < n; ++i) {
const auto& seq = data[i];
double best_dist = numeric_limits<double>::infinity();
int best_cluster = k_;
for (size_t c = 0; c < k_; ++c) {
const auto& cent = cluster_centers_[c];
DROP buf(cent.len, seq.len, delta_cond);
double dist = buf.compute(tau_, pt_, pe_, sigma_, cent, seq);
if (dist < best_dist) {
best_dist = dist;
best_cluster = c;
}
}
labels_[i] = best_cluster;
}
vector<vector<TimedSequence>> clusters(k_ + 1);
vector<TimedSequence> old_centers = cluster_centers_;
for (size_t i = 0; i < n; ++i) {
int c = labels_[i];
if (c >= 0 && c < static_cast<int>(k_)) {
clusters[c].push_back(data[i]);
}
}
#pragma omp parallel for schedule(static)
for (size_t c = 0; c < k_ + 1; ++c) {
if (clusters[c].empty()) {
if (c == k_) {
continue;
} else {
#pragma omp critical
{
uniform_int_distribution<size_t> uni_loc(0, n - 1);
cluster_centers_[c] = data[uni_loc(rng_)];
}
}
} else {
cluster_centers_[c] = dba(clusters[c], barycenter_iter_, tau_, pt_, pe_, sigma_, delta_cond, rng_);
}
}
double max_center_move = 0.0;
#pragma omp parallel for reduction(max : max_center_move) schedule(static)
for (size_t c = 0; c < k_ + 1; ++c) {
DROP buf(cluster_centers_[c].len, old_centers[c].len, delta_cond);
double dist = buf.compute(tau_, pt_, pe_, sigma_, cluster_centers_[c], old_centers[c]);
if (!isinf(dist) && dist > max_center_move) {
max_center_move = dist;
}
}
if (max_center_move > th) {
centers_moved = true;
}
n_iter_ = iter + 1;
}
// cout << "KMeans ran for " << n_iter_ << " iterations (max: "
// << max_iter_ << ")\n";
}
vector<int> KMeans::predict(const vector<TimedSequence>& data, vector<double> delta_cond) {
if (cluster_centers_.empty()) {
throw runtime_error("KMeans::predict called before fit()");
}
const size_t n = data.size();
vector<int> out_labels(n, k_ + 1);
vector<size_t> c_size(k_ + 1, 0);
size_t count = 0;
size_t total = 0;
size_t align = 0;
size_t drop_x = 0;
size_t drop_z = 0;
size_t drop_all = 0;
size_t drop_part = 0;
for (size_t i = 0; i < n; ++i) {
const TimedSequence& seq = data[i];
double best_dist = numeric_limits<double>::infinity();
int best_cluster = k_;
for (size_t c = 0; c < k_; ++c) {
const TimedSequence& cent = cluster_centers_[c];
DROP buf(cent.len, seq.len, delta_cond);
double dist = buf.compute(tau_, pt_, pe_, sigma_, cent, seq);
if (dist < best_dist) {
best_dist = dist;
best_cluster = c;
}
for (size_t p = 0; p < cent.len; ++p) {
for (size_t q = 0; q < seq.len; ++q) {
if (buf.M_[p * seq.len + q] == 1) {
align++;
}
else if (buf.M_[p * seq.len + q] == 2) {
drop_x++;
}
else if (buf.M_[p * seq.len + q] == 3) {
drop_z++;
}
else if (buf.M_[p * seq.len + q] == 4) {
drop_all++;
}
else if (buf.M_[p * seq.len + q] == 5) {
drop_part++;
}
else if (buf.M_[p * seq.len + q] == 0) {
continue;
}
total++;
}
}
}
out_labels[i] = best_cluster;
c_size[out_labels[i]]++;
if (best_cluster == static_cast<int>(k_)) {
count++;
}
}
// Affichage des stats DROP globales à la fin
cout << "\n========== DROP global stats ==========" << endl;
cout << "total : " << total << endl;
cout << "align : " << align << endl;
cout << "drop_x : " << drop_x << endl;
cout << "drop_z : " << drop_z << endl;
cout << "drop_all : " << drop_all << endl;
cout << "drop_part : " << drop_part << endl;
cout << "=====================================\n" << endl;
double sil = silhouette(data, delta_cond);
// for (size_t i = 0; i < k_; ++i) {
// cout << "Cluster " << i << ": " << c_size[i] << endl;
// }
// cout << "Cluster P " << ": " << c_size[k_] << endl;
// cout << "sil = " << sil << endl;
// cout << count << " patients ignored" << endl;
return out_labels;
}
vector<int> KMeans::fit_predict(const vector<TimedSequence>& data, const vector<double>& delta_cond, const double th) {
fit(data, delta_cond, th);
return predict(data, delta_cond);
}
void KMeans::print_cluster() const {
for (size_t c = 0; c < cluster_centers_.size(); ++c) {
const TimedSequence& centro = cluster_centers_[c];
cout << "Centroid " << c << "\n";
centro.print();
cout << "\n";
}
}
vector<double> KMeans::distances(TimedSequence& ts, vector<double> delta_cond) {
if (cluster_centers_.empty()) {
throw runtime_error("KMeans::distances called before fit()");
}
vector<double> dist(k_, 0.0);
for (size_t c = 0; c < k_; ++c) {
const TimedSequence& cent = cluster_centers_[c];
DROP buf(cent.len, ts.len, delta_cond);
dist[c] = buf.compute(tau_, pt_, pe_, sigma_, cent, ts);
}
return dist;
}
double KMeans::score(const vector<TimedSequence>& data, vector<double> delta_cond) {
if (cluster_centers_.empty()) {
throw runtime_error("KMeans::predict called before fit()");
}
const size_t n = data.size();
double inertia = 0.0;
for (size_t i = 0; i < n; ++i) {
const TimedSequence& seq = data[i];
double best_dist = numeric_limits<double>::infinity();
for (size_t c = 0; c < k_; ++c) {
const TimedSequence& cent = cluster_centers_[c];
DROP buf(cent.len, seq.len, delta_cond);
double dist = buf.compute(tau_, pt_, pe_, sigma_, cent, seq);
if (dist < best_dist) {
best_dist = dist;
}
}
if (!isfinite(best_dist)) {
continue;
}
inertia += best_dist * best_dist;
}
return inertia;
}
double KMeans::silhouette(const vector<TimedSequence>& data, vector<double> delta_cond) {
const size_t n = data.size();
double sil = 0.0;
size_t count = 0;
for (size_t i = 0; i < n; ++i) {
int ci = labels_[i];
if (ci == -1) {
continue;
}
const TimedSequence& A = data[i];
double sum_a = 0.0;
double count_a = 0.0;
for (size_t j = 0; j < n; ++j) {
if (j == i || labels_[j] != ci || labels_[j] == -1) {
continue;
}
const TimedSequence& B = data[j];
DROP buf(A.len, B.len, delta_cond);
double dist_a = 0.0;
dist_a = buf.compute(tau_, pt_, pe_, sigma_, A, B);
if (!isfinite(dist_a)) {
continue;
}
sum_a += dist_a;
count_a += 1.0;
}
if (count_a == 0) {
sil += 0.0;
count++;
continue;
}
double a_i = sum_a / count_a;
double b_i = numeric_limits<double>::infinity();
for (size_t oc = 0; oc < k_; ++oc) {
if (static_cast<int>(oc) == ci) {
continue;
}
double sum_b = 0.0;
double count_b = 0.0;
double dist_b = 0.0;
for (size_t j = 0; j < n; ++j) {
if (labels_[j] != static_cast<int>(oc) || labels_[j] == -1) {
continue;
}
const TimedSequence& B = data[j];
DROP buf(A.len, B.len, delta_cond);
dist_b = buf.compute(tau_, pt_, pe_, sigma_, A, B);
if (!isfinite(dist_b)) {
continue;
}
sum_b += dist_b;
count_b += 1.0;
}
if (count_b > 0.0) {
b_i = min(b_i, sum_b / count_b);
}
}
if (!isfinite(b_i)) {
b_i = 0.0;
}
double max_ab = max(a_i, b_i);
if (max_ab > 0) {
double sil_a = (b_i - a_i) / max_ab;
sil += sil_a;
}
count++;
};
return sil / static_cast<double>(count);
}
#include "lifelines.hpp"
#include <boost/math/distributions/chi_squared.hpp>
#include <fstream>
#include <iostream>
#include <map>
#include <set>
#include <stdexcept>
#include <unordered_set>
using namespace std;
inline string trim(string s) {
auto ns = [](unsigned char c) { return !isspace(c); };
s.erase(s.begin(), find_if(s.begin(), s.end(), ns));
s.erase(find_if(s.rbegin(), s.rend(), ns).base(), s.end());
return s;
}
inline string to_lower(string s) {
transform(s.begin(), s.end(), s.begin(), [](unsigned char c) { return tolower(c); });
return s;
}
inline bool parse_bool(const string& s_in) {
string s = to_lower(trim(s_in));
if (s == "1" || s == "true" || s == "t" || s == "yes" || s == "y") return true;
if (s == "0" || s == "false" || s == "f" || s == "no" || s == "n") return false;
return (s == "1" || s == "true");
}
inline vector<string> split_simple(const string& line, char sep = ',') {
vector<string> out;
out.reserve(8);
string cur;
for (char c : line) {
if (c == sep) {
out.push_back(cur);
cur.clear();
} else {
cur.push_back(c);
}
}
out.push_back(cur);
return out;
}
inline double safe_stod(const string& s) {
string t = to_lower(trim(s));
if (t.empty() || t == "nan" || t == "na") return numeric_limits<double>::quiet_NaN();
try {
return stod(t);
} catch (...) {
return numeric_limits<double>::quiet_NaN();
}
}
struct Patient {
double time = 0.0;
bool event = false;
int group = 0;
bool seen = false;
};
double lifelines(
const size_t k,
const CsvTable& table,
const vector<int>& labels,
const string& output_csv,
const string& output_png,
bool plot) {
if (table.size() != labels.size())
throw runtime_error("lifelines: taille table et labels différente.");
unordered_map<string, Patient> patients;
patients.reserve(table.size());
for (size_t i = 0; i < table.size(); ++i) {
int cluster = labels[i];
if (cluster < 0 || cluster == static_cast<int>(k)) continue;
auto const& row = table[i];
const string pid = row.at("id_patient");
const double t = stod(row.at("evolution_duration"));
const bool ev = parse_bool(row.at("is_uncensored"));
auto& p = patients[pid];
if (!p.seen) {
p.group = cluster;
p.time = t;
p.event = ev;
p.seen = true;
} else {
if (p.group != cluster) {
throw runtime_error("lifelines: labels incohérents pour le patient " + pid);
}
if (t > p.time) p.time = t;
p.event = p.event || ev;
}
}
if (patients.empty()) {
return numeric_limits<double>::quiet_NaN();
}
unordered_set<int> groups;
groups.reserve(8);
for (const auto& kv : patients) groups.insert(kv.second.group);
const string results_csv = output_csv + ".logrank.csv";
if (groups.size() < 2) {
ofstream r(results_csv);
r << "mode,chi2,p_value,dof\n";
r << "overall,,nan,0\n";
return numeric_limits<double>::quiet_NaN();
}
{
ofstream out(output_csv);
if (!out) throw runtime_error("lifelines: impossible d'ouvrir le CSV en écriture: " + output_csv);
out << "id,time,event,group\n";
out << setprecision(17);
for (const auto& kv : patients) {
const string& pid = kv.first;
const Patient& p = kv.second;
out << pid << "," << p.time << "," << (p.event ? 1 : 0) << "," << p.group << "\n";
}
}
const string logrank_py = "../src/python/evaluate_c.py";
{
ostringstream cmd;
cmd << "python3 " << quoted(logrank_py)
<< " --csv " << quoted(output_csv)
<< " --mode overall"
<< " --out_csv " << quoted(results_csv);
int rc = system(cmd.str().c_str());
if (rc != 0) {
throw runtime_error("lifelines: échec d'exécution du script evaluate_c.py (rc=" + to_string(rc) + ")");
}
}
double p_value = numeric_limits<double>::quiet_NaN();
{
ifstream in(results_csv);
if (!in) throw runtime_error("lifelines: impossible d'ouvrir le résultat: " + results_csv);
string header, line;
if (!getline(in, header) || !getline(in, line))
throw runtime_error("lifelines: résultat logrank vide ou invalide.");
auto cols = split_simple(header, ',');
auto vals = split_simple(line, ',');
auto it = find(cols.begin(), cols.end(), "p_value");
if (it == cols.end()) throw runtime_error("lifelines: colonne p_value absente dans le résultat.");
size_t idx = distance(cols.begin(), it);
string pv = (idx < vals.size() ? vals[idx] : "");
p_value = safe_stod(pv);
}
if (plot) {
const string plot_py = "../src/python/plot_lifelines.py";
ostringstream cmd2;
cmd2 << "python3 " << quoted(plot_py) << " "
<< quoted(output_csv) << " " << quoted(output_png);
(void)system(cmd2.str().c_str());
}
return p_value;
}
#include <omp.h>
#include <cstdio>
#include <bayesopt/bayesopt.hpp>
#include <bayesopt/parameters.hpp>
#include <boost/numeric/ublas/vector.hpp>
#include <chrono>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <queue>
#include <sstream>
#include <string>
#include <vector>
#include <cmath>
#include "cah.hpp"
#include "dba.hpp"
#include "distances.hpp"
#include "kmeans.hpp"
#include "lifelines.hpp"
#include "ts.hpp"
#include "utils.hpp"
#include <set>
#include <limits>
#include <algorithm>
#include <cmath>
#include <vector>
#include <string>
#include <fstream>
#include <cstdlib> // pour system()
#include "ts.hpp"
#include <unordered_set>
#include <filesystem>
#include <vector>
#include <algorithm>
namespace fs = std::filesystem;
using namespace std;
using namespace bayesopt;
using vectord = boost::numeric::ublas::vector<double>;
#define NB_META 1
#define REPW 0
#define REPM 1
// Structure pour fusion hiérarchique
struct ClusterNode {
vector<int> indices; // indices des patients
int left = -1; // enfants fusionnés
int right = -1;
double distance = 0.0;
};
// -----------------------------------------------------------------------------
// Phase 1 :
// -----------------------------------------------------------------------------
void phase1(
const std::string& output_dir,
size_t n_epochs,
size_t bo_iterations,
size_t max_iter_kmeans,
size_t max_iter_bary,
const std::string& umap_image_name
)
{
using clock = std::chrono::steady_clock;
auto bo_start = clock::now();
random_device rd;
mt19937 rng(rd());
// Création du dossier de sortie si nécessaire
namespace fs = std::filesystem;
fs::create_directories(output_dir);
// -------------------------------------------------------------------------
// Chargement des données
// -------------------------------------------------------------------------
const vector<string> headers = {"ID_PATIENT", "FEATURE", "TIMESTAMP", "VALUE"};
//CsvTable csv = load_csv("../data/WID_Big_et_small.csv", headers, ',');
CsvTable csv = load_csv("../data/CHU_prepared_global.csv", headers, ',');
//CsvTable csv = load_csv("../data/experimentationsSISAP26/folds/iter5/train.csv", headers, ',');
//CsvTable csv = load_csv("../data/4_patients_2Features.csv", headers, ',');
unordered_map<string, NormParam> params_feat;
NormParam param_time;
auto table = filter_features(csv, {"ALSFRS-R", "WEIGHT", "FVC", "BMI"});
//auto table = filter_features(csv, {"gptinc", "shweal_p99p100", "sptinc_p0p50", "sptinc_p99p100"});
//auto table = filter_features(csv, {"ALSFRS-R", "WEIGHT"});
auto dataset = build_timed_sequences(table, params_feat, param_time, true);
vector<TimedSequence> data;
vector<string> pids;
for (auto const& [pid, ts] : dataset) {
pids.push_back(pid);
data.push_back(ts);
}
if (data.empty()) {
cerr << "Aucune séquence trouvée !" << endl;
return;
}
const size_t nevent = data[0].nevent;
const vector<string> headers_ll = {"ID_PATIENT", "EVOLUTION_DURATION", "IS_UNCENSORED"};
CsvTable table_ll = load_csv("../data/CHU_prepared_survival_data.csv", headers_ll, ',');
string tmp_csv = output_dir + "/tmp_lifelines.csv";
string tmp_png = output_dir + "/tmp_lifelines.png";
// -------------------------------------------------------------------------
// Classe MetricBO
// -------------------------------------------------------------------------
class MetricBO : public ContinuousModel {
public:
struct BORecord {
int iter;
vector<double> x;
double score;
vector<int> labels;
};
vector<BORecord> history_;
MetricBO(const Parameters& p,
const vector<TimedSequence>& data,
const vector<string>& pids,
const CsvTable& table_ll,
const string& tmp_csv,
const string& tmp_png,
size_t dim,
size_t k,
size_t it_km,
size_t it_bary,
mt19937& rng)
: ContinuousModel(dim, p),
data_(data), pids_(pids),
table_ll_(table_ll),
tmp_csv_(tmp_csv), tmp_png_(tmp_png),
k_(k), maxit_km_(it_km),
maxit_bary_(it_bary),
rng_(rng)
{}
double evaluateSample(const vectord& x) override
{
double tau = x(0);
double pt = x(1);
double pe = x(2);
size_t nevent = data_.front().features_name.size();
vector<double> delta_cond(nevent);
for (size_t i = 0; i < nevent; ++i)
delta_cond[i] = x(3 + i);
int sigma = 1;
KMeans m(k_, maxit_km_, maxit_bary_,
tau, pt, pe,
sigma, DROP_DTW, rng_);
auto labels = m.fit_predict(data_, delta_cond, 1e-3);
double score = compute_silhouette(data_, labels,
delta_cond,
tau, pt, pe);
history_.push_back({iter_,
vector<double>(x.begin(), x.end()),
score,
labels});
// Logging
cout << iter_ << "," << tau << "," << pt << "," << pe;
for (double d : delta_cond) cout << "," << d;
cout << "," << score << "\n";
iter_++;
// Sécurité NaN / Inf pour NLOpt
if (std::isnan(score) || std::isinf(score))
return 1e6;
return -score;
}
private:
const vector<TimedSequence>& data_;
const vector<string>& pids_;
const CsvTable& table_ll_;
const string tmp_csv_;
const string tmp_png_;
size_t k_, maxit_km_, maxit_bary_;
mt19937& rng_;
int iter_ = 0;
double compute_silhouette(const vector<TimedSequence>& data,
const vector<int>& labels,
const vector<double>& delta_cond,
double tau, double pt, double pe)
{
size_t n = data.size();
int sigma = 1;
vector<vector<double>> dist_matrix(n, vector<double>(n, 0.0));
#pragma omp parallel for schedule(dynamic)
for (size_t i = 0; i < n; ++i) {
for (size_t j = i; j < n; ++j) {
DROP buf(data[i].len, data[j].len, delta_cond);
double d = buf.compute(tau, pt, pe, sigma, data[i], data[j]);
dist_matrix[i][j] = d;
dist_matrix[j][i] = d;
}
}
vector<double> silhouette(n, 0.0);
std::unordered_set<int> unique_labels(labels.begin(), labels.end());
if (unique_labels.size() < 2) return 1e6; // singleton clusters
#pragma omp parallel for
for (size_t i = 0; i < n; ++i)
{
double a = 0.0;
double b = numeric_limits<double>::max();
int label_i = labels[i];
unordered_map<int, vector<double>> cluster_dists;
for (size_t j = 0; j < n; ++j) if (i != j) cluster_dists[labels[j]].push_back(dist_matrix[i][j]);
if (!cluster_dists[label_i].empty()) {
for (double d : cluster_dists[label_i]) a += d;
a /= cluster_dists[label_i].size();
}
for (auto& [lab, dists] : cluster_dists) {
if (lab == label_i) continue;
double mean_dist = 0.0;
for (double d : dists) mean_dist += d;
mean_dist /= dists.size();
b = min(b, mean_dist);
}
silhouette[i] = (b - a) / max(a, b);
}
double score = 0.0;
for (double s : silhouette) score += s;
return score / n;
}
};
// -------------------------------------------------------------------------
// Paramètres BO et bornes
// -------------------------------------------------------------------------
Parameters bo_params = initialize_parameters_to_default();
bo_params.n_iterations = bo_iterations;
size_t dim = 3 + nevent;
size_t k = 2;
vectord lower(dim), upper(dim);
lower(0) = 0.0;
upper(0) = 1.0;
lower(1) = 0.0; upper(1) = 1.0;
lower(2) = 0.0; upper(2) = 1.0;
for (size_t i = 0; i < nevent; ++i) {
auto const& fname = data.front().features_name[i];
auto const& np = params_feat.at(fname);
lower(3 + i) = 0.0;
upper(3 + i) = 1.0;
}
vector<MetricBO::BORecord> global_history;
vectord global_best_metric(dim);
vector<int> best_global_labels;
double global_best_score = -1e9;
// ===============================
// MULTI EPOCHS
// ===============================
for (size_t epoch = 0; epoch < n_epochs; ++epoch)
{
cout << "epoch," << epoch+1 << "\n";
bo_params.random_seed = rng();
MetricBO optimizer(bo_params, data, pids,
table_ll,
tmp_csv, tmp_png,
dim, k,
max_iter_kmeans,
max_iter_bary,
rng);
optimizer.setBoundingBox(lower, upper);
vectord best_metric_epoch(dim);
optimizer.optimize(best_metric_epoch);
// --- Fichier CSV de l'historique de l'epoch ---
string epoch_history_file = output_dir + "/epoch_" + to_string(epoch+1) + "_history.csv";
ofstream epoch_csv(epoch_history_file);
epoch_csv << "iter,tau,pt,pe";
for (size_t i = 0; i < nevent; ++i) epoch_csv << ",delta" << (i+1);
epoch_csv << ",silhouette\n";
// écrire l'historique de cette epoch et ajouter à l'historique global
for (auto& rec : optimizer.history_)
{
// CSV epoch
epoch_csv << rec.iter << ",";
for (size_t i = 0; i < rec.x.size(); ++i)
{
epoch_csv << rec.x[i];
if (i+1 < rec.x.size()) epoch_csv << ",";
}
epoch_csv << "," << rec.score << "\n";
// ajouter à l'historique global
global_history.push_back(rec);
}
epoch_csv.close();
// --- Mise à jour du meilleur global ---
auto best_it = max_element(
optimizer.history_.begin(),
optimizer.history_.end(),
[](const auto& a, const auto& b){
return a.score < b.score;
});
if (best_it != optimizer.history_.end()
&& best_it->score > global_best_score)
{
global_best_score = best_it->score;
best_global_labels = best_it->labels;
for (size_t i = 0; i < dim; ++i)
global_best_metric(i) = best_it->x[i];
}
}
// ===============================
// FIN DES HISTORIQUES PAR EPOCH
// ===============================
// -------------------------------------------------------------------------
// CSV GLOBAL
// -------------------------------------------------------------------------
string history_path = output_dir + "/history.csv";
ofstream history_csv(history_path);
history_csv << "iter,tau,pt,pe";
for (size_t i = 0; i < nevent; ++i) history_csv << ",delta" << (i+1);
history_csv << ",silhouette\n";
for (auto& rec : global_history) {
history_csv << rec.iter << "," << rec.x[0] << "," << rec.x[1] << "," << rec.x[2];
for (size_t i = 0; i < nevent; ++i) history_csv << "," << rec.x[3 + i];
history_csv << "," << rec.score << "\n";
}
history_csv.close();
// -------------------------------------------------------------------------
// BEST MODEL : UMAP + LIFELINES
// -------------------------------------------------------------------------
vector<double> best_cond(nevent);
for (size_t i = 0; i < nevent; ++i) best_cond[i] = global_best_metric(3 + i);
KMeans bestModel(k, max_iter_kmeans, max_iter_bary,
global_best_metric(0), global_best_metric(1), global_best_metric(2),
1, DROP_DTW, rng);
// ===============================
// UMAP SCRIPT CALL
// ===============================
if (!best_global_labels.empty() && !best_cond.empty() && !data.empty())
{
size_t n = data.size();
int sigma = 1;
vector<vector<double>> dist_matrix(n, vector<double>(n, 0.0));
for (size_t i = 0; i < n; ++i) {
for (size_t j = i; j < n; ++j) {
DROP buf(data[i].len, data[j].len, best_cond);
double d = buf.compute(global_best_metric(0),
global_best_metric(1),
global_best_metric(2),
sigma,
data[i],
data[j]);
dist_matrix[i][j] = d;
dist_matrix[j][i] = d;
}
}
string dist_path = output_dir + "/umap_dist.csv";
string labels_path = output_dir + "/umap_labels.csv";
string ids_path = output_dir + "/umap_ids.csv";
ofstream f_dist(dist_path);
ofstream f_lab(labels_path);
ofstream f_ids(ids_path);
for (size_t i = 0; i < n; ++i) {
for (size_t j = 0; j < n; ++j) {
f_dist << dist_matrix[i][j];
if (j+1 < n) f_dist << ",";
}
f_dist << "\n";
f_lab << best_global_labels[i] << "\n";
f_ids << pids[i] << "\n";
}
f_dist.close();
f_lab.close();
f_ids.close();
// appel du script Python UMAP
string umap_image_path = output_dir + "/" + umap_image_name;
string cmd = "python3 ../python/umap/umapPlot.py "
+ dist_path + " "
+ labels_path + " "
+ ids_path + " "
+ umap_image_path;
cout << "Executing: " << cmd << endl;
int ret = system(cmd.c_str());
if (ret != 0) cerr << "Erreur lors de l'appel du script UMAP\n";
}
else {
cerr << "Erreur: données vides pour UMAP. Vérifiez best_global_labels / best_cond / data." << endl;
}
// Lifelines
// synchroniser labels avec table_ll
vector<int> labels_for_lifelines;
unordered_map<string, size_t> pid_to_idx;
for (size_t i = 0; i < pids.size(); ++i) pid_to_idx[pids[i]] = i;
for (auto& row : table_ll) {
auto it = pid_to_idx.find(row["ID_PATIENT"]);
if (it != pid_to_idx.end())
labels_for_lifelines.push_back(best_global_labels[it->second]);
}
double final_p = lifelines(k, table_ll, labels_for_lifelines,
output_dir + "/lifelines_clusters.csv",
output_dir + "/lifelines_clusters.png",
true);
cout << "BEST_GLOBAL_SILHOUETTE," << global_best_score << "\n";
cout << "FINAL_PVALUE," << final_p << "\n";
// -------------------------------------------------------------------------
// FIN TIMER
// -------------------------------------------------------------------------
auto bo_end = clock::now();
cout << "TOTAL_TIME_SECONDS,"
<< std::chrono::duration<double>(bo_end - bo_start).count()
<< "\n";
}
int main(int argc, char* argv[])
{
if (argc != 8) {
cerr << "Usage: " << argv[0]
<< " <function> <output_dir> <n_epochs> <bo_iterations> <max_iter_kmeans> <max_iter_bary> <umap_image_name>\n";
return 1;
}
string function_name = argv[1];
string output_dir = argv[2];
size_t n_epochs = stoi(argv[3]);
size_t bo_iterations = stoi(argv[4]);
size_t max_iter_kmeans = stoi(argv[5]);
size_t max_iter_bary = stoi(argv[6]);
string umap_image_name = argv[7];
cout << "Starting training with parameters:\n"
<< "Function: " << function_name << "\n"
<< "Output dir: " << output_dir << "\n"
<< "Epochs: " << n_epochs << "\n"
<< "BO iterations: " << bo_iterations << "\n"
<< "KMeans iterations: " << max_iter_kmeans << "\n"
<< "Barycenter iterations: " << max_iter_bary << "\n"
<< "UMAP image: " << umap_image_name << "\n";
if (function_name == "phase1")
{
phase1(
output_dir, n_epochs, bo_iterations,
max_iter_kmeans, max_iter_bary, umap_image_name
);
}
else
{
cerr << "Unknown function: " << function_name << endl;
return 1;
}
return 0;
}
import argparse
import itertools
import numpy as np
import pandas as pd
from lifelines.statistics import logrank_test, multivariate_logrank_test
def ensure_cols(df, cols):
missing = [c for c in cols if c not in df.columns]
if missing:
raise ValueError(f"Colonnes manquantes: {missing}")
def overall(df, time_col, event_col, group_col):
res = multivariate_logrank_test(df[time_col], df[group_col], df[event_col])
return pd.DataFrame([{
"mode": "overall",
"chi2": float(res.test_statistic),
"p_value": float(res.p_value),
"dof": int(len(df[group_col].unique()) - 1)
}])
def k_vs_rest(df, time_col, event_col, group_col, k):
df = df.copy()
df[group_col] = df[group_col].astype(str).str.strip()
k = str(k).strip()
A = df[df[group_col] == k]
B = df[df[group_col] != k]
if len(A) == 0 or len(B) == 0:
return pd.DataFrame([{
"mode": "k_vs_rest",
"k": k,
"n_k": int(len(A)), "n_rest": int(len(B)),
"chi2": np.nan,
"p_value": np.nan
}])
res = logrank_test(A[time_col], B[time_col],
event_observed_A=A[event_col],
event_observed_B=B[event_col])
return pd.DataFrame([{
"mode": "k_vs_rest",
"k": k,
"n_k": int(len(A)), "n_rest": int(len(B)),
"chi2": float(res.test_statistic),
"p_value": float(res.p_value)
}])
def pairwise(df, time_col, event_col, group_col, adjust="holm"):
rows = []
groups = sorted(df[group_col].dropna().unique())
pairs = list(itertools.combinations(groups, 2))
for g1, g2 in pairs:
A = df[df[group_col] == g1]
B = df[df[group_col] == g2]
res = logrank_test(A[time_col], B[time_col], event_observed_A=A[event_col], event_observed_B=B[event_col])
rows.append({"g1": str(g1), "g2": str(g2), "chi2": float(res.test_statistic), "p_raw": float(res.p_value)})
out = pd.DataFrame(rows)
if out.empty:
return out
if adjust:
out = out.sort_values("p_raw").reset_index(drop=True)
m = len(out)
if adjust == "bonferroni":
out["p_adj"] = np.minimum(out["p_raw"] * m, 1.0)
elif adjust == "holm":
out["p_adj"] = [min(max((m - i) * p, 0.0), 1.0) for i, p in enumerate(out["p_raw"])]
out = out.sort_values(["g1","g2"]).reset_index(drop=True)
return out
def main():
ap = argparse.ArgumentParser()
ap.add_argument("--csv", required=True)
ap.add_argument("--time", default="time")
ap.add_argument("--event", default="event")
ap.add_argument("--group", default="group")
ap.add_argument("--mode", choices=["overall","k-vs-rest","pairwise"], default="overall")
ap.add_argument("--k")
ap.add_argument("--out_csv")
args = ap.parse_args()
df = pd.read_csv(args.csv)
ensure_cols(df, [args.time, args.event, args.group])
df = pd.read_csv(args.csv, dtype={args.group: str})
df[args.event] = df[args.event].astype(int)
if args.mode == "overall":
res = overall(df, args.time, args.event, args.group)
elif args.mode == "k-vs-rest":
if args.k is None:
raise SystemExit("--k est requis pour k-vs-rest")
res = k_vs_rest(df, args.time, args.event, args.group, args.k)
else:
res = pairwise(df, args.time, args.event, args.group, adjust="holm")
if args.out_csv:
res.to_csv(args.out_csv, index=False)
else:
print(res.to_csv(index=False))
if __name__ == "__main__":
main()
import argparse
import warnings
# backend non interactif pour l'enregistrement PNG
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from lifelines import KaplanMeierFitter
warnings.filterwarnings("ignore", category=UserWarning)
def ensure_cols(df: pd.DataFrame, cols):
missing = [c for c in cols if c not in df.columns]
if missing:
raise ValueError(f"Colonnes manquantes dans le CSV: {missing}")
def sanitize_df(df: pd.DataFrame, time_col: str, event_col: str, group_col: str) -> pd.DataFrame:
df = df.copy()
df[time_col] = pd.to_numeric(df[time_col], errors="coerce")
df[event_col] = pd.to_numeric(df[event_col], errors="coerce").fillna(0).astype(int)
df[group_col] = df[group_col].astype(str)
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna(subset=[time_col, event_col, group_col])
df = df[df[time_col] >= 0]
if "id" in df.columns:
agg = {
time_col: "max",
event_col: lambda x: int((pd.Series(x).astype(int) > 0).any()),
group_col: lambda x: pd.Series(x).iloc[0],
}
df = df.groupby("id", as_index=False).agg(agg)
return df
def plot_km(df: pd.DataFrame, time_col: str, event_col: str, group_col: str, output_png: str):
groups = sorted(df[group_col].unique(), key=lambda x: (x != "k", x))
kmf = KaplanMeierFitter()
fig, ax = plt.subplots(figsize=(10, 6))
for g in groups:
sub = df[df[group_col] == g]
if sub.empty:
continue
n = len(sub)
e = int((sub[event_col].astype(int) > 0).sum())
kmf.fit(durations=sub[time_col].values,
event_observed=sub[event_col].astype(int).values,
label=f"{g} (n={n}, observed events={e})")
kmf.plot_survival_function(ax=ax, ci_show=True)
ax.set_title("Courbes de survie Kaplan–Meier", pad=12)
ax.set_xlabel("Temps")
ax.set_ylabel("S(t)")
ax.grid(True, alpha=0.3)
ax.legend(loc="best", frameon=True)
plt.tight_layout()
fig.savefig(output_png, dpi=300)
plt.close(fig)
def main():
ap = argparse.ArgumentParser(description="Tracer les courbes de survie Kaplan–Meier par groupe.")
ap.add_argument("input_csv", help="Chemin du CSV d'entrée (colonnes: id,time,event,group)")
ap.add_argument("output_png", help="Chemin de l'image PNG de sortie")
ap.add_argument("--time", default="time", help="Nom de la colonne temps (default: time)")
ap.add_argument("--event", default="event", help="Nom de la colonne événement (default: event)")
ap.add_argument("--group", default="group", help="Nom de la colonne groupe (default: group)")
args = ap.parse_args()
df = pd.read_csv(args.input_csv)
ensure_cols(df, ["id", args.time, args.event, args.group])
df = sanitize_df(df, args.time, args.event, args.group)
if df.empty:
raise SystemExit("Aucune donnée exploitable après nettoyage (vérifiez vos colonnes et valeurs).")
plot_km(df, args.time, args.event, args.group, args.output_png)
if __name__ == "__main__":
main()
#include "ts.hpp"
#include <algorithm>
#include <cstring>
#include <fstream>
#include <limits>
#include <numeric>
#include <ranges>
#include <set>
#include <sstream>
#include <stdexcept>
#include <unordered_map>
#include "distances.hpp"
int ts_id_max = 0;
TimedSequence::TimedSequence()
: TimedSequence(ts_id_max, {}, {}, {}, {}) {
}
TimedSequence::TimedSequence(int ident,
const vector<string>& name_t,
const vector<string>& fn,
const vector<double>& e,
const vector<double>& t)
: id(ident),
names(name_t),
features_name(fn),
events(e),
times(t),
len(times.size()),
nevent(events.empty() ? 0 : features_name.size()) {
if (events.empty()) {
throw invalid_argument("TimedSequence: empty events");
} else if (times.empty()) {
throw invalid_argument("TimedSequence: empty times");
}
t_min = times.front();
t_max = times.back();
}
bool TimedSequence::operator==(const TimedSequence& o) const {
if (features_name != o.features_name || len != o.len || nevent != o.nevent) return false;
if (fabs(t_min - o.t_min) > 1e-6 || fabs(t_max - o.t_max) > 1e-6) return false;
if (times.size() != o.times.size()) return false;
for (size_t i = 0; i < times.size(); ++i) {
if (fabs(times[i] - o.times[i]) > 1e-6)
return false;
}
if (events.size() != o.events.size()) return false;
for (size_t i = 0; i < events.size(); ++i) {
if (nevent != o.nevent) return false;
for (size_t j = 0; j < nevent; ++j) {
if (fabs(events[i * nevent + j] - o.events[i * nevent + j]) > 1e-6)
return false;
}
}
return true;
}
bool TimedSequence::operator!=(const TimedSequence& ts) const {
return !(*this == ts);
}
void TimedSequence::print() const {
cerr << "TimedSequence {\n";
cerr << " id = " << id << "\n";
cerr << " nevent = " << nevent
<< " (features: ";
for (size_t k = 0; k < features_name.size(); ++k) {
cerr << features_name[k];
if (k + 1 < features_name.size()) cerr << ", ";
}
cerr << ")\n";
cerr << " len = " << len << "\n";
cerr << " t_min, t_max = "
<< fixed << setprecision(3)
<< t_min << ", " << t_max << "\n";
cerr << " times = [";
for (size_t i = 0; i < times.size(); ++i) {
cerr << fixed << setprecision(3) << times[i];
if (i + 1 < times.size()) cerr << ", ";
}
cerr << "]\n";
cerr << " events matrix (" << len << "×"
<< (events.empty() ? 0 : nevent) << "):\n";
for (size_t i = 0; i < len; ++i) {
cerr << " [";
for (size_t j = 0; j < nevent; ++j) {
cerr << fixed << setprecision(3) << events[i * nevent + j];
if (j + 1 < nevent) cerr << ", ";
}
cerr << "]\n";
}
cerr << "}\n";
}
SequenceMap build_timed_sequences(const CsvTable& table,
unordered_map<string, NormParam>& params_feat,
NormParam& param_time,
bool normalize) {
if (table.empty())
throw runtime_error("build_timed_sequences: table is empty");
const vector<string> required = {
"id_patient", "feature", "timestamp", "value"};
for (auto const& col : required) {
if (table[0].find(col) == table[0].end())
throw runtime_error("Column not found: " + col);
}
map<string, set<double>> patient_times;
map<string, set<string>> patient_feats;
map<string, string> id_to_name;
unordered_map<string, Stats> stats_feat;
Stats stats_time;
for (auto const& row : table) {
const string& pid = row.at("id_patient");
const string& feat = row.at("feature");
double t = stod(row.at("timestamp"));
double v = stod(row.at("value"));
auto& sf = stats_feat[feat];
sf.sum += v;
sf.sum2 += v * v;
sf.min = min(sf.min, v);
sf.max = max(sf.max, v);
sf.count += 1.0;
stats_time.sum += t;
stats_time.sum2 += t * t;
stats_time.min = min(stats_time.min, t);
stats_time.max = max(stats_time.max, t);
stats_time.count += 1.0;
patient_times[pid].insert(t);
patient_feats[pid].insert(feat);
}
vector<string> all_feats;
all_feats.reserve(stats_feat.size());
for (auto const& p : stats_feat)
all_feats.push_back(p.first);
sort(all_feats.begin(), all_feats.end());
unordered_map<string, size_t> idx_feat_global;
idx_feat_global.reserve(all_feats.size());
for (size_t k = 0; k < all_feats.size(); ++k)
idx_feat_global[all_feats[k]] = k;
params_feat.clear();
params_feat.reserve(stats_feat.size());
for (auto const& [feat, sf] : stats_feat) {
NormParam p;
p.mean = sf.sum / sf.count;
double var = (sf.sum2 / sf.count) - (p.mean * p.mean);
p.std = var > 0.0 ? sqrt(var) : 0.0;
p.min = sf.min;
p.max = sf.max;
params_feat[feat] = p;
}
param_time.mean = stats_time.sum / stats_time.count;
{
double var = (stats_time.sum2 / stats_time.count) - (param_time.mean * param_time.mean);
param_time.std = var > 0.0 ? sqrt(var) : 0.0;
}
param_time.min = stats_time.min;
param_time.max = stats_time.max;
SequenceMap result;
for (auto const& [pid, times_set] : patient_times) {
vector<double> raw_times(times_set.begin(), times_set.end());
vector<double> times;
times.reserve(raw_times.size());
for (double rt : raw_times) {
if (normalize) {
/*times.push_back(param_time.std > 0.0
? (rt - param_time.mean) / param_time.std
: 0.0);*/
double range_t = param_time.max - param_time.min;
times.push_back(
range_t > 0.0
? (rt - param_time.min) / range_t
: 0.0
);
} else {
times.push_back(rt);
}
}
unordered_map<double, size_t> idx_time;
idx_time.reserve(raw_times.size());
for (size_t i = 0; i < raw_times.size(); ++i)
idx_time[raw_times[i]] = i;
size_t len = raw_times.size();
size_t nevent = all_feats.size();
vector<double> agg(len * nevent, 0.0);
for (auto const& row : table) {
if (row.at("id_patient") != pid) continue;
double ts = stod(row.at("timestamp"));
size_t i = idx_time[ts];
auto itf = idx_feat_global.find(row.at("feature"));
if (itf == idx_feat_global.end()) continue;
size_t k = itf->second;
agg[i * nevent + k] += stod(row.at("value"));
}
if (normalize) {
for (size_t i = 0; i < len; ++i) {
for (size_t k = 0; k < nevent; ++k) {
const auto& feat = all_feats[k];
const auto& p = params_feat.at(feat);
/*agg[i * nevent + k] = (p.std > 0.0)
? (agg[i * nevent + k] - p.mean) / p.std
: 0.0;*/
double range = p.max - p.min;
agg[i * nevent + k] =
(range > 0.0)
? (agg[i * nevent + k] - p.min) / range
: 0.0;
}
}
}
int id = stoi(pid);
vector<string> name_v;
name_v.push_back(id_to_name[pid]);
result.emplace(
pid,
TimedSequence(id, name_v, all_feats, agg, times));
ts_id_max = max(ts_id_max, id + 1);
}
return result;
}
#include "utils.hpp"
#include <algorithm>
#include <cmath>
#include <fstream>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_set>
double percentile_mtx(const vector<double>& a, const double p) {
if (p < 0.0 || p > 100.0) {
throw invalid_argument("p must be between 0 and 100.");
}
if (a.empty()) {
throw invalid_argument("a must not be empty.");
}
auto values = a;
sort(values.begin(), values.end());
double idx = (p * 0.01) * (static_cast<double>(values.size()) - 1.0);
size_t lower = static_cast<size_t>(floor(idx));
size_t upper = static_cast<size_t>(ceil(idx));
double frac = idx - static_cast<double>(lower);
if (upper >= values.size()) {
upper = values.size() - 1;
}
double result = (1.0 - frac) * values[lower] + frac * values[upper];
return result;
}
double mean(const vector<double>& a) {
size_t N = a.size();
if (N == 0) {
throw runtime_error("mean : vector empty");
}
double sum = accumulate(a.begin(), a.end(), 0.0);
return sum / N;
}
double stddev(const vector<double>& a) {
size_t N = a.size();
if (N < 2) {
return 0.0;
}
double mu = mean(a);
double sq_diff = 0.0;
for (double s : a) {
double d = s - mu;
sq_diff += d * d;
}
return sqrt(sq_diff / (N - 1));
}
void z_norm(vector<double>& a) {
double mu = mean(a);
double sigma = stddev(a);
for (auto& val : a) {
val = (val - mu) / sigma;
}
}
CsvTable load_csv(const string& filename,
const vector<string>& headers,
char delimiter) {
auto normalize_col = [&](string s) {
auto is_space = [](char c) { return isspace(static_cast<unsigned char>(c)); };
size_t start = 0;
while (start < s.size() && is_space(s[start])) {
start++;
}
size_t end = s.size();
while (end > start && is_space(s[end - 1])) {
end--;
}
s = s.substr(start, end - start);
if (s.size() >= 2 && s.front() == '"' && s.back() == '"') {
s = s.substr(1, s.size() - 2);
}
for (auto& c : s) {
c = static_cast<char>(tolower(static_cast<unsigned char>(c)));
}
return s;
};
vector<string> norm_headers;
for (auto h : headers) norm_headers.push_back(normalize_col(h));
ifstream file(filename);
if (!file.is_open())
throw runtime_error("Failed to open file: " + filename);
string line;
if (!getline(file, line))
throw runtime_error("File empty or missing header: " + filename);
vector<string> file_headers;
stringstream hs(line);
string header_cell;
while (getline(hs, header_cell, delimiter)) {
file_headers.push_back(normalize_col(header_cell));
}
for (auto const& req : norm_headers) {
if (find(file_headers.begin(), file_headers.end(), req) == file_headers.end()) {
throw runtime_error("Incorrect header, missing column: " + req);
}
}
CsvTable table;
while (getline(file, line)) {
if (line.empty()) {
continue;
}
stringstream ss(line);
CsvRow row;
string cell;
size_t idx = 0;
while (getline(ss, cell, delimiter) && idx < file_headers.size()) {
string value = cell;
auto is_space = [](char c) {
return isspace(static_cast<unsigned char>(c));
};
size_t a = 0, b = value.size();
while (a < b && is_space(value[a])) {
a++;
}
while (b > a && is_space(value[b - 1])) {
b--;
}
value = value.substr(a, b - a);
if (value.size() >= 2 && value.front() == '"' && value.back() == '"') {
value = value.substr(1, value.size() - 2);
}
if (file_headers[idx] == "feature") {
value = normalize_col(value);
}
row[file_headers[idx]] = value;
++idx;
}
table.emplace_back(move(row));
}
return table;
}
CsvTable filter_features(const CsvTable& table,
const vector<string>& features_to_keep) {
auto normalize_col = [&](string s) {
auto is_space = [](char c) { return isspace(static_cast<unsigned char>(c)); };
size_t start = 0;
while (start < s.size() && is_space(s[start])) {
start++;
}
size_t end = s.size();
while (end > start && is_space(s[end - 1])) {
end--;
}
s = s.substr(start, end - start);
if (s.size() >= 2 && s.front() == '"' && s.back() == '"') {
s = s.substr(1, s.size() - 2);
}
for (auto& c : s) {
c = static_cast<char>(tolower(static_cast<unsigned char>(c)));
}
return s;
};
vector<string> norm_feats;
for (auto h : features_to_keep) norm_feats.push_back(normalize_col(h));
unordered_set<string> keep_set(
norm_feats.begin(), norm_feats.end());
CsvTable filtered;
filtered.reserve(table.size());
for (const auto& row : table) {
auto it = row.find("feature");
if (it != row.end() && keep_set.count(it->second) > 0) {
filtered.push_back(row);
}
}
return filtered;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment