count number of occurrences of Train in Train/Val/Test

136f4c35 · Elias · c8d1575a · 136f4c35
Commit 136f4c35 authored Jun 30, 2025 by Elias
Hide whitespace changes
Inline Side-by-side

Showing with 60 additions and 0 deletions

TrainValTest4_Random.py TrainValTest4_Random.py +60 -0

No files found.
--- a/TrainValTest4_Random.py
+++ b/TrainValTest4_Random.py
+import os
+import re
+import ast
+import csv
+from collections import defaultdict
+
+# Input files: the Train/Val/Test dump to scan and the list of target words to count.
+fichier_decoupe = "resTrainValTest2_Random_backup.txt"
+fichier_mots = "res3_Random.txt"
+
+# One regex per split: after a "<Name> :" label, capture from "[(" to the first ")]" (non-greedy; NOTE(review): a ")]" inside a text would cut the block short - confirm the input format).
+regex_bloc = {
+    "Train": re.compile(r'Train\s*:\s*(\[\(.*?\)\])', re.DOTALL),
+    "Val": re.compile(r'Val\s*:\s*(\[\(.*?\)\])', re.DOTALL),
+    "Test": re.compile(r'Test\s*:\s*(\[\(.*?\)\])', re.DOTALL),
+}
+
+# Load the target words: one per non-blank line, with surrounding whitespace and double quotes stripped (the file is expected to store them already quoted).
+with open(fichier_mots, "r", encoding="utf-8") as f:
+    mots_cibles = set(line.strip().strip('"') for line in f if line.strip())
+
+# Counters: for each split, a mapping from target word to its number of occurrences.
+compteurs = {
+    "Train": defaultdict(int),
+    "Val": defaultdict(int),
+    "Test": defaultdict(int),
+}
+
+# Read the whole split file at once so the DOTALL regexes can match blocks spanning several lines.
+with open(fichier_decoupe, "r", encoding="utf-8") as f:
+    contenu = f.read()
+
+# For each split: find all of its blocks, parse each one as a Python literal and count the quoted target words.
+for split, regex in regex_bloc.items():
+    blocs = regex.findall(contenu)
+    for bloc in blocs:
+        try:
+            tuples = ast.literal_eval(bloc)
+            for _, _, texte in tuples:  # each entry must unpack into 3 items; only the third one is scanned
+                # Extract every non-empty run of characters enclosed in double quotes.
+                mots = re.findall(r'"([^"]+)"', texte)
+                for mot in mots:
+                    if mot in mots_cibles:
+                        compteurs[split][mot] += 1
+        except Exception as e:  # any parse/unpack error abandons the rest of this block; counts already added are kept
+            print(f"Erreur parsing {split} : {e}")
+
+# Write the CSV report: a header row, then one row per target word (sorted) with its Train/Val/Test counts (0 when never seen).
+with open("res4_Random.csv", mode="w", newline="", encoding="utf-8") as csvfile:
+    writer = csv.writer(csvfile)
+    writer.writerow(["Mot", "Train", "Val", "Test"])
+    for mot in sorted(mots_cibles):
+        writer.writerow([
+            mot,
+            compteurs["Train"].get(mot, 0),
+            compteurs["Val"].get(mot, 0),
+            compteurs["Test"].get(mot, 0)
+        ])
+
+print("Fichier 'res4_Random.csv' généré avec succès.")