Commit 0025ce78 authored by Elias's avatar Elias

evaluating similarity with metrics

parent 16400369
import argparse
import csv
from pathlib import Path
import pandas as pd
import numpy as np
def levenshtein(a, b):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``a`` into ``b``. Elements
    are compared with ``==``, so strings as well as lists of tokens work.

    Args:
        a: First sequence.
        b: Second sequence.

    Returns:
        The edit distance as a non-negative integer.
    """
    len_a, len_b = len(a), len(b)
    # An empty side can only be reached by inserting/deleting everything.
    if not len_a:
        return len_b
    if not len_b:
        return len_a

    # above[k] is the distance between the prefix of `a` processed so far
    # and the first k elements of `b`.
    above = list(range(len_b + 1))
    for row, item_a in enumerate(a, start=1):
        current = [row]
        for col, item_b in enumerate(b, start=1):
            substitution = above[col - 1] + (0 if item_a == item_b else 1)
            deletion = above[col] + 1
            insertion = current[col - 1] + 1
            current.append(min(deletion, insertion, substitution))
        above = current
    return above[len_b]
def compute_ned_for_file(csv_file):
    """Compute the mean normalized edit distance (NED) of one match CSV.

    After the header row, the file is read as consecutive row pairs: a
    query row followed by its matched row. Each row must have exactly two
    columns, the second being a ``-``-separated phoneme string. For every
    pair, NED is the Levenshtein distance between the two phoneme
    sequences divided by the length of the longer one.

    Blank lines are ignored so they can neither shift the query/match
    pairing nor break the two-column unpacking. A trailing unpaired row is
    dropped, and pairs where either phoneme sequence is empty are skipped.

    Args:
        csv_file: Path of the match CSV to read (UTF-8).

    Returns:
        A ``(mean_ned, n_pairs)`` tuple. ``mean_ned`` is ``1.0`` when no
        usable pair was found; ``n_pairs`` is the number of pairs scored.

    Raises:
        ValueError: If a non-blank row does not have exactly two columns.
    """
    with open(csv_file, newline="", encoding="utf-8") as f:
        # csv.reader returns [] for a blank line; drop those rows up front.
        rows = [row for row in csv.reader(f) if row]
    rows = rows[1:]  # skip the header row

    ned_values = []
    # Even positions hold queries, odd positions their matches; zip()
    # silently drops a trailing row that has no partner.
    for query_row, matched_row in zip(rows[0::2], rows[1::2]):
        _, query_phns = query_row
        _, matched_phns = matched_row
        query_seq = [p for p in query_phns.split("-") if p]
        matched_seq = [p for p in matched_phns.split("-") if p]
        if not query_seq or not matched_seq:
            continue
        distance = levenshtein(query_seq, matched_seq)
        # Both sequences are non-empty here, so the denominator is >= 1.
        ned_values.append(distance / max(len(query_seq), len(matched_seq)))

    mean_ned = float(np.mean(ned_values)) if ned_values else 1.0
    return mean_ned, len(ned_values)
def main(validation_dir, output_csv):
    """Compute the mean NED of every match CSV in a folder and save a summary.

    Each ``*.csv`` file directly inside ``validation_dir`` is scored with
    ``compute_ned_for_file``. The word label is the part of the file stem
    before the first underscore. The summary (columns ``word``, ``File``,
    ``Mean_NED``, ``Num_pairs``) is written to ``output_csv`` and printed.

    Files are processed in sorted order so the summary is reproducible
    from run to run. If ``output_csv`` is located inside ``validation_dir``
    it is excluded from the inputs, so that re-running the script does not
    try to score its own previous output.

    Args:
        validation_dir: Folder containing the match CSV files.
        output_csv: Destination path of the summary CSV.

    Returns:
        None. Prints a message and returns early when no CSV is found.
    """
    validation_dir = Path(validation_dir)
    output_path = Path(output_csv).resolve()
    csv_files = sorted(
        candidate
        for candidate in validation_dir.glob("*.csv")
        if candidate.resolve() != output_path
    )
    if not csv_files:
        print("Aucun CSV trouvé dans le dossier fourni.")
        return

    results = []
    for csv_file in csv_files:
        word = csv_file.stem.split("_")[0]
        mean_ned, n_pairs = compute_ned_for_file(csv_file)
        results.append({
            "word": word,
            "File": csv_file.name,
            "Mean_NED": mean_ned,
            "Num_pairs": n_pairs,
        })

    df = pd.DataFrame(results)
    df.to_csv(output_csv, index=False)
    print(f"NED par fichier sauvegardé dans {output_csv}")
    print(df)
if __name__ == "__main__":
    # Command-line entry point: takes the folder of match CSVs and the
    # path of the summary CSV to write, then delegates to main().
    cli = argparse.ArgumentParser(
        description="Calcul du NED pour chaque CSV de matches."
    )
    cli.add_argument(
        "validation_dir",
        type=Path,
        help="Dossier contenant les CSV des mots",
    )
    cli.add_argument(
        "output_csv",
        type=Path,
        help="Fichier CSV pour sauvegarder le NED",
    )
    options = cli.parse_args()
    main(options.validation_dir, options.output_csv)
......@@ -129,6 +129,21 @@ python3 matchQueryHS.py /path/to/your/turn/of/speech/npz /path/to/your/queries/n
python3 transcriptionAfterMatching1_fichiers.py /path/to/your/matched/pairs /output/dir textGrid/
```
### • This script evaluates similarity with the NED metric
**NED.py :**
```bash
python3 NED.py /path/to/PhonemesMatching /path/to/output/NED.csv
```
### • This script evaluates similarity with the Precision and Recall metrics
**confusion.py :**
```bash
python3 confusion.py /path/to/PhonemesMatching /path/to/groundTruth.csv --out /path/to/output/precision_recall_TP-FP-FN_F1.csv
```
# Tools
### • Counting the number of segments
......
import csv
from pathlib import Path
from collections import defaultdict
def _iter_match_pairs(csv_file):
    """Yield ``(query_phns, matched_phns)`` for each row pair of a match CSV."""
    with open(csv_file, newline='', encoding='utf-8') as f:
        # csv.reader returns [] for a blank line; such rows would break the
        # two-column unpacking and shift the query/match pairing.
        rows = [row for row in csv.reader(f) if row]
    rows = rows[1:]  # skip the header row
    # Even positions hold queries, odd positions their matches; zip()
    # drops a trailing row that has no partner.
    for query_row, matched_row in zip(rows[0::2], rows[1::2]):
        _, query_phns = query_row
        _, matched_phns = matched_row
        yield query_phns, matched_phns


def analyze_matches(match_dir: Path, gt_csv: Path):
    """Compute TP, FP, FN, precision, recall and F1 per word from match CSVs.

    Every ``*.csv`` file directly inside ``match_dir`` is read as a header
    followed by (query row, matched row) pairs of two columns each. The
    word label is the part of the file stem before the first underscore;
    counts from several files sharing a label are accumulated.

    A pair counts as a true positive when the query and matched phoneme
    strings are exactly equal, otherwise as a false positive. For words
    listed in the ground truth, FN is ``max(0, real_count - TP)``; for
    words absent from the ground truth FN stays 0.

    Files are processed in sorted order so the output order is
    reproducible, blank CSV lines are ignored, and the ground-truth file
    itself is skipped if it happens to live inside ``match_dir``.

    Args:
        match_dir: Folder containing the match CSV files.
        gt_csv: Reference CSV with columns ``word`` and
            ``nb_occ_dans_chaque_Query``.

    Returns:
        A list of dicts with keys ``word``, ``TP``, ``FP``, ``FN``,
        ``Precision``, ``Recall`` and ``F1`` (metrics rounded to 3
        decimals), one per word for which at least one pair or a
        ground-truth entry exists.

    Raises:
        ValueError: If a non-blank match row does not have exactly two
            columns, or a ground-truth count is not an integer.
        KeyError: If the ground-truth CSV lacks one of the expected columns.
    """
    # Load the real number of occurrences per word (ground truth).
    real_counts = {}
    with open(gt_csv, newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            real_counts[row['word']] = int(row['nb_occ_dans_chaque_Query'])

    gt_path = Path(gt_csv).resolve()
    results = defaultdict(lambda: {'TP': 0, 'FP': 0, 'FN': 0})

    for csv_file in sorted(Path(match_dir).glob("*.csv")):
        if csv_file.resolve() == gt_path:
            # The ground truth is not a match file; never count it as one.
            continue
        word = csv_file.stem.split("_")[0]

        for query_phns, matched_phns in _iter_match_pairs(csv_file):
            outcome = 'TP' if query_phns == matched_phns else 'FP'
            results[word][outcome] += 1

        # Derive FN from the ground truth; recomputed after each file so
        # the final value reflects the TP accumulated over all files.
        if word in real_counts:
            found = results[word]['TP']
            results[word]['FN'] = max(0, real_counts[word] - found)

    # Compute precision, recall and F1 for each word.
    output_rows = []
    for word, counts in results.items():
        tp, fp, fn = counts['TP'], counts['FP'], counts['FN']
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        if (precision + recall) > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0
        output_rows.append({
            'word': word,
            'TP': tp,
            'FP': fp,
            'FN': fn,
            'Precision': round(precision, 3),
            'Recall': round(recall, 3),
            'F1': round(f1, 3),
        })
    return output_rows
def save_results(output_rows, out_csv):
    """Write the per-word metric rows to a CSV file.

    Args:
        output_rows: Iterable of dicts keyed by ``word``, ``TP``, ``FP``,
            ``FN``, ``Precision``, ``Recall`` and ``F1``.
        out_csv: Destination path; the file is created or overwritten
            (UTF-8) with a header line followed by one line per row.
    """
    columns = ['word', 'TP', 'FP', 'FN', 'Precision', 'Recall', 'F1']
    with open(out_csv, 'w', newline='', encoding='utf-8') as handle:
        sink = csv.DictWriter(handle, fieldnames=columns)
        sink.writeheader()
        sink.writerows(output_rows)
if __name__ == "__main__":
    import argparse

    # Command-line entry point: analyse the match folder against the
    # ground truth, save the metrics, then echo them to stdout.
    cli = argparse.ArgumentParser(
        description="Calcul TP/FP/FN à partir des matches et ground truth."
    )
    cli.add_argument(
        "match_dir",
        type=Path,
        help="Dossier contenant les CSV de matches",
    )
    cli.add_argument(
        "gt_csv",
        type=Path,
        help="CSV ground truth avec 'word' et 'nb_occ_dans_chaque_Query'",
    )
    cli.add_argument(
        "--out",
        type=Path,
        default="TP_FP_FN_results.csv",
        help="CSV de sortie",
    )
    options = cli.parse_args()

    metric_rows = analyze_matches(options.match_dir, options.gt_csv)
    save_results(metric_rows, options.out)
    print(f"Résultats sauvegardés dans {options.out}")
    for metric_row in metric_rows:
        print(metric_row)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment