evaluating similarity with metrics

0025ce78 · Elias · 16400369 · 0025ce78 · 0025ce78 · 0025ce78
Commit 0025ce78 authored Sep 26, 2025 by Elias
Hide whitespace changes
Inline Side-by-side

Showing with 189 additions and 0 deletions

NED.py NED.py +84 -0

README.md README.md +15 -0

confusion.py confusion.py +90 -0

No files found.
--- a/NED.py
+++ b/NED.py
+import argparse
+import csv
+from pathlib import Path
+import pandas as pd
+import numpy as np
+
+def levenshtein(a, b):
+    """Return the Levenshtein (edit) distance between two sequences.
+
+    ``a`` and ``b`` are indexable sequences whose items are compared with
+    ``==`` (strings, lists of phoneme labels, ...). Every insertion,
+    deletion and substitution costs 1.
+    """
+    len_a, len_b = len(a), len(b)
+    # Against an empty sequence the distance is the length of the other one.
+    if len_a == 0:
+        return len_b
+    if len_b == 0:
+        return len_a
+
+    # Only two rows of the dynamic-programming table are needed at a time:
+    # ``above`` is the finished row for a[:row - 1], ``current`` is being built.
+    above = list(range(len_b + 1))
+    for row in range(1, len_a + 1):
+        item_a = a[row - 1]
+        current = [row]
+        for col in range(1, len_b + 1):
+            step = 0 if item_a == b[col - 1] else 1
+            deletion = above[col] + 1
+            insertion = current[col - 1] + 1
+            substitution = above[col - 1] + step
+            current.append(min(deletion, insertion, substitution))
+        above = current
+    return above[len_b]
+
+def compute_ned_for_file(csv_file):
+    """Compute the mean normalized edit distance (NED) over one matches CSV.
+
+    Blank lines are ignored and the first remaining row is dropped as a
+    header. The rows after it are consumed two at a time (named "query" and
+    "matched" in the code); a trailing unpaired row is ignored. Each row must
+    hold exactly two columns, the second being a ``-``-separated phoneme
+    string. For every pair, NED is the Levenshtein distance between the two
+    phoneme lists divided by the length of the longer one. Pairs in which
+    either phoneme list is empty are skipped.
+
+    Args:
+        csv_file: path of the CSV file to read (UTF-8).
+
+    Returns:
+        ``(mean_ned, n_pairs)`` where ``n_pairs`` is the number of scored
+        pairs; ``mean_ned`` is ``1.0`` when no pair could be scored.
+
+    Raises:
+        ValueError: if a non-blank data row does not have exactly two columns.
+    """
+    with open(csv_file, newline="", encoding="utf-8") as f:
+        # csv.reader yields [] for blank lines; dropping them keeps the
+        # pairing aligned instead of crashing on the two-value unpacking.
+        rows = [row for row in csv.reader(f) if row]
+    rows = rows[1:]  # skip the header row
+
+    ned_values = []
+    # Zipping even- and odd-indexed rows leaves out a trailing unpaired row.
+    for query_row, matched_row in zip(rows[0::2], rows[1::2]):
+        _, query_phns = query_row
+        _, matched_phns = matched_row
+
+        query_seq = [p for p in query_phns.split("-") if p]
+        matched_seq = [p for p in matched_phns.split("-") if p]
+        if not query_seq or not matched_seq:
+            continue
+
+        # Both lists are non-empty here, so the denominator is at least 1.
+        longest = max(len(query_seq), len(matched_seq))
+        ned_values.append(levenshtein(query_seq, matched_seq) / longest)
+
+    mean_ned = np.mean(ned_values) if ned_values else 1.0
+    return mean_ned, len(ned_values)
+
+
+def main(validation_dir, output_csv):
+    """Compute the mean NED of every CSV in a directory and save a summary.
+
+    Each ``*.csv`` directly inside ``validation_dir`` is scored with
+    ``compute_ned_for_file``. The word label of a file is the part of its
+    stem before the first ``_``. The summary (columns ``word``, ``File``,
+    ``Mean_NED``, ``Num_pairs``) is written to ``output_csv`` and printed.
+
+    Args:
+        validation_dir: directory containing the matches CSV files.
+        output_csv: path of the summary CSV to write.
+
+    Returns:
+        None. Nothing is written when the directory holds no input CSV.
+    """
+    validation_dir = Path(validation_dir)
+    output_path = Path(output_csv).resolve()
+    # Sort so the summary rows come out in a reproducible order, and leave out
+    # the summary itself: when it lives in the input directory, a rerun would
+    # otherwise try to parse the previous summary as a matches file.
+    csv_files = sorted(
+        path for path in validation_dir.glob("*.csv")
+        if path.resolve() != output_path
+    )
+    if not csv_files:
+        print("Aucun CSV trouvé dans le dossier fourni.")
+        return
+
+    results = []
+
+    for csv_file in csv_files:
+        word = csv_file.stem.split("_")[0]
+        mean_ned, n_pairs = compute_ned_for_file(csv_file)
+        results.append({
+            "word": word,
+            "File": csv_file.name,
+            "Mean_NED": mean_ned,
+            "Num_pairs": n_pairs
+        })
+
+    df = pd.DataFrame(results)
+    df.to_csv(output_csv, index=False)
+    print(f"NED par fichier sauvegardé dans {output_csv}")
+    print(df)
+
+if __name__ == "__main__":
+    # Command-line entry point: two positional paths, both parsed as Path.
+    cli = argparse.ArgumentParser(
+        description="Calcul du NED pour chaque CSV de matches."
+    )
+    cli.add_argument(
+        "validation_dir",
+        type=Path,
+        help="Dossier contenant les CSV des mots",
+    )
+    cli.add_argument(
+        "output_csv",
+        type=Path,
+        help="Fichier CSV pour sauvegarder le NED",
+    )
+    options = cli.parse_args()
+
+    main(options.validation_dir, options.output_csv)
--- a/README.md
+++ b/README.md
@@ -129,6 +129,21 @@ python3 matchQueryHS.py /path/to/your/turn/of/speech/npz /path/to/your/queries/n
 python3 transcriptionAfterMatching1_fichiers.py /path/to/your/matched/pairs /output/dir textGrid/
 ```

+### • This script evaluates similarity with the NED (normalized edit distance) metric
+**NED.py :**
+
+```bash
+python3 NED.py /path/to/PhonemesMatching /path/to/output/NED.csv
+```
+
+### • This script evaluates similarity with the Precision, Recall and F1 metrics
+**confusion.py :**
+
+```bash
+python3 confusion.py /path/to/PhonemesMatching /path/to/groundTruth.csv --out /path/to/output/precision_recall_TP-FP-FN_F1.csv
+```
+
+
 # Tools

 ### • Counting the number of segments

--- a/confusion.py
+++ b/confusion.py
+import csv
+from pathlib import Path
+from collections import defaultdict
+
+def analyze_matches(match_dir: Path, gt_csv: Path):
+    """Compute per-word TP, FP, FN, precision, recall and F1 from matches CSVs.
+
+    Every ``*.csv`` directly inside ``match_dir`` is read; the word label of a
+    file is the part of its stem before the first ``_``. In each file, blank
+    lines are ignored, the first remaining row is dropped as a header, and the
+    rows after it are consumed two at a time (query row, then matched row); a
+    trailing unpaired row is ignored. A pair counts as a true positive when
+    the second columns of both rows are exactly equal, otherwise as a false
+    positive. For words present in the ground truth, FN is
+    ``max(0, real_count - TP)``; other words keep ``FN == 0``.
+
+    Args:
+        match_dir: directory containing the matches CSV files (``Path`` or
+            ``str``).
+        gt_csv: ground-truth CSV with the columns ``word`` and
+            ``nb_occ_dans_chaque_Query``. It is skipped if it happens to be
+            located inside ``match_dir``.
+
+    Returns:
+        A list of dicts with the keys ``word``, ``TP``, ``FP``, ``FN``,
+        ``Precision``, ``Recall`` and ``F1``. The three ratios are rounded to
+        3 decimals and are ``0.0`` when their denominator is zero.
+
+    Raises:
+        ValueError: if a non-blank data row does not have exactly two columns,
+            or a ground-truth count is not an integer.
+        KeyError: if the ground-truth CSV lacks one of the expected columns.
+    """
+    match_dir = Path(match_dir)
+    gt_path = Path(gt_csv).resolve()
+
+    # Load the real number of occurrences of each word (ground truth).
+    real_counts = {}
+    with open(gt_csv, newline='', encoding='utf-8') as f:
+        for row in csv.DictReader(f):
+            real_counts[row['word']] = int(row['nb_occ_dans_chaque_Query'])
+
+    results = defaultdict(lambda: {'TP': 0, 'FP': 0, 'FN': 0})
+
+    # Sorted so the output rows come out in a reproducible order.
+    for csv_file in sorted(match_dir.glob("*.csv")):
+        # The ground-truth file is not a matches file; scoring it would add a
+        # bogus word to the results.
+        if csv_file.resolve() == gt_path:
+            continue
+
+        word = csv_file.stem.split("_")[0]
+        with open(csv_file, newline='', encoding='utf-8') as f:
+            # csv.reader yields [] for blank lines; dropping them keeps the
+            # pairing aligned instead of crashing on the unpacking below.
+            rows = [row for row in csv.reader(f) if row]
+        rows = rows[1:]  # skip the header row
+
+        # Zipping even- and odd-indexed rows leaves out a trailing unpaired row.
+        for query_line, matched_line in zip(rows[0::2], rows[1::2]):
+            _, query_phns = query_line
+            _, matched_phns = matched_line
+            outcome = 'TP' if query_phns == matched_phns else 'FP'
+            results[word][outcome] += 1
+
+        # FN is refreshed after every file: when several files share a word,
+        # the value left at the end is based on the cumulative TP.
+        # NOTE(review): ground-truth words without any matches file never get
+        # a row in the output - confirm this is intended.
+        if word in real_counts:
+            results[word]['FN'] = max(0, real_counts[word] - results[word]['TP'])
+
+    # Derive precision, recall and F1 from the counts.
+    output_rows = []
+    for word, counts in results.items():
+        TP = counts['TP']
+        FP = counts['FP']
+        FN = counts['FN']
+        precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
+        recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
+        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
+
+        output_rows.append({
+            'word': word,
+            'TP': TP,
+            'FP': FP,
+            'FN': FN,
+            'Precision': round(precision, 3),
+            'Recall': round(recall, 3),
+            'F1': round(f1, 3)
+        })
+
+    return output_rows
+
+def save_results(output_rows, out_csv):
+    """Write the per-word metric rows to ``out_csv`` as UTF-8 CSV with a header.
+
+    ``output_rows`` is an iterable of dicts whose keys are the column names
+    ``word``, ``TP``, ``FP``, ``FN``, ``Precision``, ``Recall`` and ``F1``.
+    """
+    columns = ['word', 'TP', 'FP', 'FN', 'Precision', 'Recall', 'F1']
+    with open(out_csv, 'w', newline='', encoding='utf-8') as handle:
+        sink = csv.DictWriter(handle, fieldnames=columns)
+        sink.writeheader()
+        sink.writerows(output_rows)
+
+if __name__ == "__main__":
+    import argparse
+
+    # Command-line entry point: matches directory, ground-truth CSV and an
+    # optional output path.
+    cli = argparse.ArgumentParser(
+        description="Calcul TP/FP/FN à partir des matches et ground truth."
+    )
+    cli.add_argument(
+        "match_dir",
+        type=Path,
+        help="Dossier contenant les CSV de matches",
+    )
+    cli.add_argument(
+        "gt_csv",
+        type=Path,
+        help="CSV ground truth avec 'word' et 'nb_occ_dans_chaque_Query'",
+    )
+    cli.add_argument(
+        "--out",
+        type=Path,
+        default="TP_FP_FN_results.csv",
+        help="CSV de sortie",
+    )
+    options = cli.parse_args()
+
+    metric_rows = analyze_matches(options.match_dir, options.gt_csv)
+    save_results(metric_rows, options.out)
+    print(f"Résultats sauvegardés dans {options.out}")
+    # Echo each per-word result on its own line.
+    for metric_row in metric_rows:
+        print(metric_row)