Commit 7ca451f3 authored by Elias

kmeans train and validation

parent 4b1fb000
@@ -77,8 +77,15 @@ python3 TopMotsParBin.py
```
# Clustering methods
## 1. K-Means clustering
### • Script (inspired by the [GitHub repository of the paper SD-HuBERT: Sentence-Level Self-Distillation Induces Syllabic Organization in HuBERT](https://github.com/cheoljun95/sdhubert))
**km.py :** Trains and validates K-Means on different values of k, as a help for choosing the number of clusters.
```bash
python3 km.py --train_file files/filesTrainWAV.txt --val_file files/filesValWAV.txt --train_dir /path/to/your/SegmentfeaturesTrain/ --val_dir /path/to/your/SegmentfeaturesVal/ --k_list 100,150,200 --save_models saved_models/
```
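Once a model has been saved with `--save_models`, it can be reloaded to label new segment features. This is a minimal sketch, not a script from the repository; the paths and the choice of k=150 are placeholders matching the command above.
```python
# Hypothetical usage (not in the repo): reload a KMeans model saved by km.py
# and assign a cluster ID to each segment feature.
import joblib
import numpy as np

km = joblib.load("saved_models/kmeans_k150.pt")  # written by km.py via joblib.dump
X = np.load("/path/to/your/SegmentfeaturesVal/example_segmentfeature.npy")
if X.ndim == 1:
    X = X[None, :]  # a single vector becomes a 1-row batch, as in load_features
labels = km.predict(X)  # one cluster ID per segment
print(labels)
```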
## 2. A help for choosing the number k for K-Means clustering
### • Method without taking the distance of the last fusion into account
**hierarchiqueSansDerniereDistance.py :** This method uses average and complete linkage. A sketch of the idea follows.
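The script itself is not included in this commit; below is a minimal sketch of the underlying idea using SciPy, under the assumption that it operates on the same stacked segment features as km.py: build average- and complete-linkage dendrograms, drop the very last fusion (as the file name suggests), and read a candidate k off the largest jump between consecutive merge distances.
```python
# Sketch (assumption, not the repository's hierarchiqueSansDerniereDistance.py):
# suggest k from the gaps between hierarchical merge distances.
import numpy as np
from scipy.cluster.hierarchy import linkage

# SciPy linkage is O(n^2) in memory, so use a subsample of the stacked features.
X = np.load("/path/to/your/SegmentfeaturesTrain/stacked_subsample.npy")

for method in ("average", "complete"):
    Z = linkage(X, method=method)        # Z[i, 2] = distance at which merge i happens
    dists = Z[:-1, 2]                    # ignore the distance of the last fusion
    j = int(np.argmax(np.diff(dists)))   # largest jump between consecutive merges
    k = X.shape[0] - (j + 1)             # cutting after merge j leaves n-(j+1) clusters
    print(f"{method} linkage suggests k = {k}")
```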
km.py
import argparse
import numpy as np
from pathlib import Path
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
import joblib
from tqdm import tqdm


def load_features(file_list, segment_dir):
    """Load the per-file segment features and stack them into a single array."""
    features = []
    for file in tqdm(file_list, desc=f"Loading from {segment_dir}"):
        segment_path = Path(segment_dir) / f"{Path(file).stem}_segmentfeature.npy"
        if segment_path.exists():
            data = np.load(segment_path)
            if data.ndim == 1:
                # Single feature vector: keep it as one row.
                features.append(data)
            else:
                # (n_segments, dim) matrix: add each segment as its own row.
                features.extend(data)
        else:
            print(f"Missing file: {segment_path}")
    return np.stack(features)


def main(args):
    with open(args.train_file, 'r') as f:
        train_files = [line.strip() for line in f]
    with open(args.val_file, 'r') as f:
        val_files = [line.strip() for line in f]

    print("Loading training features...")
    X_train = load_features(train_files, args.train_dir)
    print("Loading validation features...")
    X_val = load_features(val_files, args.val_dir)

    results = []
    k_range = [int(k) for k in args.k_list.split(",")]

    if args.save_models:
        model_dir = Path(args.save_models)
        model_dir.mkdir(parents=True, exist_ok=True)

    for k in k_range:
        print(f"\nTraining KMeans (k={k})...")
        km = MiniBatchKMeans(n_clusters=k, batch_size=10000, max_iter=100, n_init=5, random_state=42)
        km.fit(X_train)

        print("Predicting on the validation set...")
        y_pred = km.predict(X_val)

        print("Evaluating...")
        wcss = km.inertia_  # within-cluster sum of squares on the training set
        silhouette = silhouette_score(X_val, y_pred)  # note: quadratic in len(X_val)
        db_index = davies_bouldin_score(X_val, y_pred)
        print(f"k={k} | WCSS={wcss:.2f} | Silhouette={silhouette:.4f} | Davies-Bouldin={db_index:.4f}")
        results.append((k, wcss, silhouette, db_index))

        if args.save_models:
            model_path = model_dir / f"kmeans_k{k}.pt"
            joblib.dump(km, model_path)
            print(f"Model saved to {model_path}")

    print("\nSummary of results:")
    print(f"{'k':>5} | {'WCSS':>12} | {'Silhouette':>10} | {'Davies-Bouldin':>15}")
    for k, wcss, sil, db in results:
        print(f"{k:5d} | {wcss:12.2f} | {sil:10.4f} | {db:15.4f}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_file", type=str, required=True, help="Text file listing the training files (one .wav path per line)")
    parser.add_argument("--val_file", type=str, required=True, help="Text file listing the validation files (one .wav path per line)")
    parser.add_argument("--train_dir", type=str, required=True, help="Directory containing the training .npy segment features")
    parser.add_argument("--val_dir", type=str, required=True, help="Directory containing the validation .npy segment features")
    parser.add_argument("--k_list", type=str, required=True, help="Comma-separated list of k values to test, e.g. 5,10,20")
    parser.add_argument("--save_models", type=str, default=None, help="Directory in which to save the trained KMeans models")
    args = parser.parse_args()
    main(args)
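A note on reading the summary table (a rule of thumb, not from the script): WCSS always decreases as k grows, so look for an elbow rather than a minimum, while a higher silhouette and a lower Davies-Bouldin index both favour that k. For example, picking candidates from a results list in the script's (k, wcss, silhouette, db) format:
```python
# Hypothetical helper using km.py's results format: (k, wcss, silhouette, db).
def candidate_ks(results):
    best_sil = max(results, key=lambda r: r[2])[0]  # silhouette: higher is better
    best_db = min(results, key=lambda r: r[3])[0]   # Davies-Bouldin: lower is better
    return best_sil, best_db

# Illustrative numbers only, not measured results.
print(candidate_ks([(100, 1.2e6, 0.21, 1.9), (150, 9.8e5, 0.25, 1.6), (200, 9.1e5, 0.23, 1.7)]))
```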