Commit 552825ec authored by Elias's avatar Elias

hierarchical method with average and complete linkage

parent db87255b
...@@ -64,3 +64,18 @@ python3 frequency_grouping.py ...@@ -64,3 +64,18 @@ python3 frequency_grouping.py
```bash ```bash
python3 graphics.py python3 graphics.py
``` ```
# Clustering methods
## 1. Help choosing the number of clusters k for K-Means clustering
### • Method taking into account the distance of the last fusion
**hierarchiqueAvecDerniereDistance.py :** This method uses the average and the complete linkage.
```bash
python3 hierarchiqueAvecDerniereDistance.py \
/path/to/your/folder/of/Segmentfeatures \
/path/to/your/folder/of/output_dendrogrammes \
--max_samples 4000 \
--output_name_average dendro_avg.png \
--output_name_complete dendro_complete.png
```
import argparse
from pathlib import Path
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, dendrogram
def load_segment_features(segment_dir):
    """Gather the rows of every 2-D feature file under ``segment_dir``.

    The directory is searched recursively for ``*_segmentfeature.npy`` files.
    Each file is loaded with ``np.load``; the rows of 2-D arrays are
    accumulated, 1-D arrays are skipped without a message, and any other
    rank only triggers a printed notice.

    Args:
        segment_dir: root directory to search.

    Returns:
        ``np.array`` built from all accumulated rows.

    Raises:
        FileNotFoundError: if no matching file exists under ``segment_dir``.
    """
    feature_paths = list(Path(segment_dir).rglob("*_segmentfeature.npy"))
    if not feature_paths:
        raise FileNotFoundError("Pas de fichier *_segmentfeature.npy trouvé")

    rows = []
    progress = tqdm(feature_paths, desc=f"Lecture des features depuis {segment_dir}")
    for path in progress:
        array = np.load(path)
        if array.ndim == 2:
            # One feature vector per row.
            rows.extend(array)
        elif array.ndim != 1:
            # Neither a matrix nor a (skipped) flat vector.
            print(" Format inattendu")
    return np.array(rows)
def sample_features(X, max_samples=5000):
    """Return a random subset of at most ``max_samples`` rows of ``X``.

    When ``X`` has no more than ``max_samples`` rows it is returned as is
    (same object). Otherwise row indices are drawn without replacement
    through ``np.random.choice`` and the selected rows are returned.
    """
    n_rows = len(X)
    if n_rows <= max_samples:
        return X
    chosen = np.random.choice(n_rows, size=max_samples, replace=False)
    return X[chosen]
def plot_and_save_dendrogram(X, method, output_path):
    """Build a dendrogram for ``X``, save it as an image and print an estimated k.

    The estimate is based on the largest jump between consecutive merge
    distances of the linkage matrix (the last merge included): if the jump
    sits between merge ``i`` and merge ``i + 1``, stopping after merge ``i``
    leaves ``len(distances) - i`` clusters.

    Args:
        X: observations passed to ``scipy.cluster.hierarchy.linkage``.
        method: linkage method name, e.g. ``"average"`` or ``"complete"``.
        output_path: destination image file (``str`` or ``Path``). Missing
            parent directories are created.
    """
    print(f" Calcul de la matrice de liaison avec méthode '{method}'...")
    Z = linkage(X, method=method)
    distances = Z[:, 2]  # merge distance of each fusion, one per row of Z

    # Gaps between consecutive fusions (the last one included).
    diffs = np.diff(distances)
    if diffs.size:
        max_gap_index = int(np.argmax(diffs))
        k_estime = len(distances) - max_gap_index
        # max_gap_index <= len(distances) - 2, so this index is always valid.
        max_dist = distances[max_gap_index + 1]
        print(f"Méthode {method} : Distance max marche = {max_dist:.4f} : k estimé ≈ {k_estime}")
    else:
        # A single fusion gives no gap to compare; np.argmax would raise here.
        print(f"Méthode {method} : pas assez de fusions pour estimer k")

    # Create the target directory first so a filesystem error cannot leave
    # an open figure behind.
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    plt.figure(figsize=(15, 6))
    try:
        dendrogram(Z, truncate_mode="level", p=20, leaf_rotation=90., leaf_font_size=8.)
        plt.title(f"Dendrogramme - Méthode : {method}")
        plt.xlabel("Échantillons ou clusters")
        plt.ylabel("Distance")
        plt.savefig(output_path, dpi=300)
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close()
    print(f" Dendrogramme sauvegardé : {output_path}")
def main(args):
    """Run the whole pipeline: load, standardise, subsample, draw dendrograms.

    ``args`` must provide ``input_dir``, ``output_dir``, ``max_samples``,
    ``output_name_average`` and ``output_name_complete``.
    """
    features = load_segment_features(args.input_dir)
    print(f" {len(features)} vecteurs chargés.")

    standardised = StandardScaler().fit_transform(features)

    # Subsample after scaling, as the scaler is fitted on all loaded vectors.
    subset = sample_features(standardised, max_samples=args.max_samples)
    print(f" {len(subset)} vecteurs utilisés pour les dendrogrammes.")

    # One dendrogram per linkage method, average first, then complete.
    outputs = (
        ("average", "output_name_average"),
        ("complete", "output_name_complete"),
    )
    for method, name_attr in outputs:
        target = args.output_dir / getattr(args, name_attr)
        plot_and_save_dendrogram(subset, method=method, output_path=target)
if __name__ == "__main__":
    # Command-line entry point: parse the options, then hand them to main().
    parser = argparse.ArgumentParser(
        description="Clustering hiérarchique et dendrogramme (average & complete)"
    )

    # Positional arguments: source of the features, destination of the images.
    parser.add_argument(
        "input_dir",
        help="Dossier contenant les *_segmentfeature.npy",
        type=Path,
    )
    parser.add_argument(
        "output_dir",
        help="Dossier pour sauvegarder les dendrogrammes",
        type=Path,
    )

    # Optional settings: sample cap and the two image file names.
    parser.add_argument(
        "--max_samples",
        default=5000,
        type=int,
        help="Nombre maximum de vecteurs à utiliser",
    )
    parser.add_argument(
        "--output_name_average",
        default="dendrogram_average.png",
        type=str,
        help="Nom du fichier image pour la méthode Average",
    )
    parser.add_argument(
        "--output_name_complete",
        default="dendrogram_complete.png",
        type=str,
        help="Nom du fichier image pour la méthode Complete",
    )

    args = parser.parse_args()
    main(args)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment