delete files renamed

db87255b · Elias · e8de8337 · e8de8337 · e8de8337 · e8de8337
Commit db87255b authored Aug 10, 2025 by Elias
8 changed files
--- a/TrainValTest2_Random.py
+++ b/TrainValTest2_Random.py
-import csv
-import os
-import random
-repertoire = "textGridEnCSV"  # directory scanned for the input .csv files
-# Files assigned directly (in full) to val and test
-fichiers_test_direct = {"1023-FA.csv", "1029-HB2.csv"}
-fichiers_val_direct = {"1014-SZ.csv", "1021_bis-TA.csv", "1021-TA.csv"}
-# Storage
-decoupes = []  # one dict per file with keys "fichier", "Train", "Val", "Test"
-restant = []  # (file name, rows) pairs still to be distributed
-# Stats
-total_lignes_global = 0  # kept rows over all files, directly assigned ones included
-val_direct = []
-test_direct = []
-# Collect all the rows
-for fichier in os.listdir(repertoire):
-    if not fichier.endswith(".csv"):
-        continue
-    chemin = os.path.join(repertoire, fichier)
-    with open(chemin, mode="r", newline="") as f:  # NOTE(review): no encoding= given, so the platform default applies
-        reader = csv.reader(f)
-        next(reader)  # skip the header row
-        lignes = [row for row in reader if row[2].strip()]  # keep only rows whose third column is non-blank
-    total = len(lignes)
-    total_lignes_global += total
-    lignes_bounds = [(float(row[0]), float(row[1]), row[2].strip()) for row in lignes]  # presumably (xmin, xmax, text) - TODO confirm against the CSV header
-    if fichier in fichiers_test_direct:  # the whole file goes to Test
-        decoupes.append({
-            "fichier": fichier,
-            "Train": [],
-            "Val": [],
-            "Test": lignes_bounds
-        })
-        test_direct.extend(lignes_bounds)
-    elif fichier in fichiers_val_direct:  # the whole file goes to Val
-        decoupes.append({
-            "fichier": fichier,
-            "Train": [],
-            "Val": lignes_bounds,
-            "Test": []
-        })
-        val_direct.extend(lignes_bounds)
-    else:
-        # To be distributed later, in the global pool
-        restant.append((fichier, lignes_bounds))
-# Build the global pool
-all_restant = []
-fichier_to_rows = {}
-for fichier, lignes in restant:
-    for ligne in lignes:
-        all_restant.append((fichier, ligne))
-    fichier_to_rows[fichier] = []  # only the keys are read afterwards, to enumerate the files
-# Global shuffle and split
-random.shuffle(all_restant)  # NOTE(review): no seed is set in the visible code, so the split differs between runs
-reste = total_lignes_global - len(val_direct) - len(test_direct)  # rows not already placed directly
-n_val_cible = int(total_lignes_global * 0.10) - len(val_direct)  # 10% of all rows, minus those already in Val; NOTE(review): negative if the direct files exceed 10%
-n_test_cible = int(total_lignes_global * 0.10) - len(test_direct)  # same rule for Test
-n_train_cible = reste - n_val_cible - n_test_cible
-train, val, test = [], [], []
-for fichier, ligne in all_restant:
-    if len(train) < n_train_cible:  # Train is filled first
-        train.append((fichier, ligne))
-    elif len(val) < n_val_cible:  # then Val
-        val.append((fichier, ligne))
-    else:  # Test receives whatever is left
-        test.append((fichier, ligne))
-# Regroup by file
-fichier_to_bounds = {fichier: {"Train": [], "Val": [], "Test": []} for fichier in fichier_to_rows}
-for fichier, ligne in train:
-    fichier_to_bounds[fichier]["Train"].append(ligne)
-for fichier, ligne in val:
-    fichier_to_bounds[fichier]["Val"].append(ligne)
-for fichier, ligne in test:
-    fichier_to_bounds[fichier]["Test"].append(ligne)
-# Add to the final output
-for fichier in fichier_to_bounds:
-    decoupes.append({
-        "fichier": fichier,
-        "Train": fichier_to_bounds[fichier]["Train"],
-        "Val": fichier_to_bounds[fichier]["Val"],
-        "Test": fichier_to_bounds[fichier]["Test"],
-    })
-# Display
-for d in decoupes:
-    print(f"\n{d['fichier']}")
-    print("  Train :", d["Train"])
-    print("  Val   :", d["Val"])
-    print("  Test  :", d["Test"])
-# Statistics
-nb_train = sum(len(d["Train"]) for d in decoupes)
-nb_val = sum(len(d["Val"]) for d in decoupes)
-nb_test = sum(len(d["Test"]) for d in decoupes)
-print("\n--- STATISTIQUES ---")
-print(f"Total lignes       : {total_lignes_global}")
-print(f"Train : {nb_train} lignes ({nb_train / total_lignes_global * 100:.2f}%)")  # NOTE(review): divides by zero when no row was kept
-print(f"Val   : {nb_val} lignes ({nb_val / total_lignes_global * 100:.2f}%)")
-print(f"Test  : {nb_test} lignes ({nb_test / total_lignes_global * 100:.2f}%)")
--- a/TrainValTest3_Random.py
+++ b/TrainValTest3_Random.py
-import re
-import ast
-import os
-from pympi import Praat
-dossier_textgrid = 'textGrid'  # directory holding the .TextGrid files
-fichier_txt = 'resTrainValTest2_Random.txt'  # read first, then rewritten in place at the end
-fichier_txt_backup = 'resTrainValTest2_Random_backup.txt'
-output_mots = set()  # every distinct word found inside a Train phrase
-# Back up the original file
-with open(fichier_txt, 'r', encoding='utf-8') as f:
-    contenu_original = f.read()
-with open(fichier_txt_backup, 'w', encoding='utf-8') as f_backup:
-    f_backup.write(contenu_original)
-# Regex matching each block: .csv name + Train : [(xmin, xmax, text)]
-regex_bloc = re.compile(r'(?P<nom_csv>\S+\.csv)\s*Train\s*:\s*(?P<train_data>\[\(.*?\)\])', re.DOTALL)
-contenu_modifie = contenu_original  # the rewritten text is accumulated in this variable
-# Iterate over every Train block
-for match in regex_bloc.finditer(contenu_original):
-    nom_csv = match.group('nom_csv')
-    train_data_str = match.group('train_data')
-    base_name = nom_csv.split('-')[0]  # the text before the first '-' names the TextGrid file
-    textgrid_path = os.path.join(dossier_textgrid, f"{base_name}.TextGrid")
-    if not os.path.exists(textgrid_path):
-        print(f"Fichier TextGrid manquant : {textgrid_path}")
-        continue
-    # Load the TextGrid
-    tg = Praat.TextGrid(textgrid_path)
-    try:
-        phrases = ast.literal_eval(train_data_str)  # parse the list literal captured by the regex
-    except Exception as e:
-        print(f"Erreur Train pour {nom_csv} : {e}")
-        continue
-    # Access the speaker tiers
-    tier_locuteur1 = tg.tiers[1].intervals  # NOTE(review): tier indices 1 and 4 are hard-coded; presumably the word tiers of each speaker - confirm
-    tier_locuteur2 = tg.tiers[4].intervals
-    nouvelles_phrases = []
-    for xmin_phrase, xmax_phrase, _ in phrases:
-        mots = []
-        def extraire_mots(intervals):  # closure: reads the current phrase bounds and appends to the current mots list
-            for xmin_mot, xmax_mot, mot in intervals:
-                if xmin_mot >= xmin_phrase and xmax_mot <= xmax_phrase:  # the word interval lies entirely inside the phrase
-                    mot = mot.strip()
-                    if mot:
-                        mots.append(f'"{mot}"')  # the double-quoted form goes into the rewritten phrase text
-                        output_mots.add(mot)
-        extraire_mots(tier_locuteur1)
-        extraire_mots(tier_locuteur2)
-        nouvelle_chaine = " ".join(mots)
-        nouvelles_phrases.append((xmin_phrase, xmax_phrase, nouvelle_chaine))
-    nouveau_bloc = f"{nom_csv} Train : {repr(nouvelles_phrases)}"
-    contenu_modifie = contenu_modifie.replace(match.group(0), nouveau_bloc)  # str.replace swaps every occurrence of the matched text
-# Write the modified text back to the file
-with open(fichier_txt, 'w', encoding='utf-8') as f_modif:
-    f_modif.write(contenu_modifie)
-# Write the extracted words to res3_Random.txt
-with open('res3_Random.txt', 'w', encoding='utf-8') as f_mots:
-    for mot in sorted(output_mots):
-        f_mots.write(f'"{mot}"\n')
-print("Fichier mis à jour :", fichier_txt)
-print("Sauvegarde créée :", fichier_txt_backup)
-print("Mots extraits dans res3_Random.txt")
--- a/TrainValTest3_bis_Random.py
+++ b/TrainValTest3_bis_Random.py
-import re
-import ast
-import os
-from pympi import Praat
-dossier_textgrid = 'textGrid'  # directory holding the .TextGrid files
-fichier_txt = 'resTrainValTest2_Random_backup.txt'  # NOTE(review): this file is read here and overwritten at the end of the script
-output_mots = set()  # every distinct word found inside a phrase of any split
-with open(fichier_txt, 'r', encoding='utf-8') as f:
-    contenu_original = f.read()
-# Regex matching each block: .csv name followed by its Train, Val and Test lists
-pattern_fichier_blocs = re.compile(
-    r'(?P<filename>\S+\.csv)\s*?\n\s*Train\s*:\s*(?P<train>\[.*?\])\s*?\n\s*Val\s*:\s*(?P<val>\[.*?\])\s*?\n\s*Test\s*:\s*(?P<test>\[.*?\])',
-    re.DOTALL
-)
-contenu_modifie = contenu_original  # the rewritten text is accumulated in this variable
-for match in pattern_fichier_blocs.finditer(contenu_original):
-    filename = match.group('filename')
-    blocs = {
-        "Train": match.group('train'),
-        "Val": match.group('val'),
-        "Test": match.group('test'),
-    }
-    base_name = filename.split('-')[0]  # the text before the first '-' names the TextGrid file
-    textgrid_path = os.path.join(dossier_textgrid, f"{base_name}.TextGrid")
-    if not os.path.exists(textgrid_path):
-        print(f"Fichier TextGrid manquant : {textgrid_path}")
-        continue
-    tg = Praat.TextGrid(textgrid_path)
-    tier_locuteur1 = tg.tiers[1].intervals  # NOTE(review): tier indices 1 and 4 are hard-coded; presumably the word tiers of each speaker - confirm
-    tier_locuteur2 = tg.tiers[4].intervals
-    nouvelles_lignes = [filename]
-    for split in ["Train", "Val", "Test"]:
-        try:
-            phrases = ast.literal_eval(blocs[split])  # parse the list literal captured by the regex
-        except Exception as e:
-            print(f"Erreur {split} pour {filename} : {e}")
-            nouvelles_lignes.append(f"{split} : {blocs[split]}")  # on a parse failure the split's captured text is kept as is
-            continue
-        nouvelles_phrases = []
-        for xmin_phrase, xmax_phrase, _ in phrases:
-            mots = []
-            def extraire_mots(intervals):  # closure: reads the current phrase bounds and appends to the current mots list
-                for xmin_mot, xmax_mot, mot in intervals:
-                    if xmin_mot >= xmin_phrase and xmax_mot <= xmax_phrase:  # the word interval lies entirely inside the phrase
-                        mot = mot.strip()
-                        if mot:
-                            mots.append(f'"{mot}"')  # the double-quoted form goes into the rewritten phrase text
-                            output_mots.add(mot)
-            extraire_mots(tier_locuteur1)
-            extraire_mots(tier_locuteur2)
-            nouvelle_chaine = " ".join(mots)
-            nouvelles_phrases.append((xmin_phrase, xmax_phrase, nouvelle_chaine))
-        nouvelles_lignes.append(f"{split} : {repr(nouvelles_phrases)}")
-    # Replace the whole block in the text
-    bloc_original = match.group(0)
-    bloc_nouveau = "\n  ".join(nouvelles_lignes)
-    contenu_modifie = contenu_modifie.replace(bloc_original, bloc_nouveau)  # str.replace swaps every occurrence of the matched text
-# Write the modified file
-with open(fichier_txt, 'w', encoding='utf-8') as f_out:
-    f_out.write(contenu_modifie)
-# Save all the extracted words
-with open('res3_Random.txt', 'w', encoding='utf-8') as f_mots:
-    for mot in sorted(output_mots):
-        f_mots.write(f'"{mot}"\n')
-print("Traitement terminé pour Train, Val et Test.")
--- a/TrainValTest4_Random.py
+++ b/TrainValTest4_Random.py
-import os
-import re
-import ast
-import csv
-from collections import defaultdict
-# Input files
-fichier_decoupe = "resTrainValTest2_Random_backup.txt"
-fichier_mots = "res3_Random.txt"
-# Regexes matching the Train / Val / Test blocks
-regex_bloc = {
-    "Train": re.compile(r'Train\s*:\s*(\[\(.*?\)\])', re.DOTALL),
-    "Val": re.compile(r'Val\s*:\s*(\[\(.*?\)\])', re.DOTALL),
-    "Test": re.compile(r'Test\s*:\s*(\[\(.*?\)\])', re.DOTALL),
-}
-# Load the target words (already double-quoted in the file)
-with open(fichier_mots, "r", encoding="utf-8") as f:
-    mots_cibles = set(line.strip().strip('"') for line in f if line.strip())
-# Initialize the counters
-compteurs = {
-    "Train": defaultdict(int),
-    "Val": defaultdict(int),
-    "Test": defaultdict(int),
-}
-# Read the file containing the blocks
-with open(fichier_decoupe, "r", encoding="utf-8") as f:
-    contenu = f.read()
-# Process each split
-for split, regex in regex_bloc.items():
-    blocs = regex.findall(contenu)  # one captured list literal per matching block
-    for bloc in blocs:
-        try:
-            tuples = ast.literal_eval(bloc)
-            for _, _, texte in tuples:
-                # Extract the words between double quotes
-                mots = re.findall(r'"([^"]+)"', texte)
-                for mot in mots:
-                    if mot in mots_cibles:
-                        compteurs[split][mot] += 1
-        except Exception as e:
-            print(f"Erreur parsing {split} : {e}")
-# Write the CSV file
-with open("res4_Random.csv", mode="w", newline="", encoding="utf-8") as csvfile:
-    writer = csv.writer(csvfile)
-    writer.writerow(["Mot", "Train", "Val", "Test"])
-    for mot in sorted(mots_cibles):  # one row per target word, zero counts included
-        writer.writerow([
-            mot,
-            compteurs["Train"].get(mot, 0),
-            compteurs["Val"].get(mot, 0),
-            compteurs["Test"].get(mot, 0)
-        ])
-print("Fichier 'res4_Random.csv' généré avec succès.")
--- a/TrainValTest5_Random.py
+++ b/TrainValTest5_Random.py
-import csv
-fichier_entree = "res4_Random.csv" # input file with columns Mot,Train,Val,Test
-fichier_sortie = "res5_Random.csv" # output file: same columns, sorted by descending occurrence count in the Train column
-donnees = []
-# Read the input CSV file
-with open(fichier_entree, mode="r", encoding="utf-8") as f:
-    reader = csv.DictReader(f)
-    for row in reader:
-        mot = row["Mot"]
-        train = int(row["Train"])
-        val = int(row["Val"])
-        test = int(row["Test"])
-        donnees.append([mot, train, val, test])
-# Sort in descending order on the Train column
-donnees_tries = sorted(donnees, key=lambda x: x[1], reverse=True)  # sorted() is stable, so ties keep their input order
-# Write to a new CSV file
-with open(fichier_sortie, mode="w", newline="", encoding="utf-8") as f_out:
-    writer = csv.writer(f_out)
-    writer.writerow(["Mot", "Train", "Val", "Test"])
-    writer.writerows(donnees_tries)
-# Display the sum of the Train occurrences
-somme_train = sum(x[1] for x in donnees_tries)
-print(f"Fichier '{fichier_sortie}' généré avec succès.")
-print(f"Somme totale des occurrences dans Train : {somme_train}")
--- a/TrainValTest6_Random.py
+++ b/TrainValTest6_Random.py
-import csv
-input_file = "res5_Random.csv"
-output_file = "res6_Random.csv"  # receives the header plus the rows that pass the filter below
-with open(input_file, mode="r", newline="") as f_in, open(output_file, mode="w", newline="") as f_out:  # NOTE(review): no encoding= given on either file, so the platform default applies
-    reader = csv.reader(f_in)
-    writer = csv.writer(f_out)
-    header = next(reader)
-    writer.writerow(header)  # write the header
-    for row in reader:
-        mot, train, val, test = row  # each data row must have exactly four fields
-        if int(train) >= 2 and int(val) >= 2 and int(test) >= 2:  # keep rows whose three counts are all at least 2
-            writer.writerow(row)
-print(f"Fichier filtré sauvegardé dans '{output_file}'")
--- a/TrainValTest7_Random.py
+++ b/TrainValTest7_Random.py
-import csv
-fichier_entree = "res6_Random.csv" # input file, without a Bin column
-fichier_sortie = "res7_Random.csv" # output file, written with the Bin column
-donnees = []
-# Read the data
-with open(fichier_entree, mode="r", encoding="utf-8") as f:
-    reader = csv.DictReader(f)
-    for row in reader:
-        mot = row["Mot"]
-        train = int(row["Train"])
-        val = int(row["Val"])
-        test = int(row["Test"])
-        donnees.append([mot, train, val, test])
-donnees_tries = sorted(donnees, key=lambda x: x[1], reverse=True)  # descending Train count
-# Compute the total
-somme_train = sum(row[1] for row in donnees_tries)
-print(f"Somme totale de Train : {somme_train}")
-# Compute the bins
-cumul = 0  # running sum of Train counts in sorted order
-donnees_binees = []
-for row in donnees_tries:
-    mot, train, val, test = row
-    cumul += train
-    pourcentage = (cumul / somme_train) * 100  # cumulative share of the Train total, in percent
-    bin_pourcent = min(100, ((int(pourcentage) // 10) + 1) * 10)  # 10, 20, ..., 100; NOTE(review): an exact multiple of 10 (e.g. 10.0) lands in the next bin (20)
-    donnees_binees.append([mot, train, val, test, bin_pourcent])
-# Save to a new file
-with open(fichier_sortie, mode="w", newline="", encoding="utf-8") as f_out:
-    writer = csv.writer(f_out)
-    writer.writerow(["Mot", "Train", "Val", "Test", "Bin"])  # header now includes the Bin column
-    writer.writerows(donnees_binees)
-print(f"Fichier '{fichier_sortie}' généré avec succès.")
--- a/TrainValTest8_Random.py
+++ b/TrainValTest8_Random.py
-import csv
-from collections import defaultdict
-import matplotlib.pyplot as plt
-input_file = "res7_Random.csv"
-# Dictionary counting the rows per Bin
-bin_counts = defaultdict(int)
-with open(input_file, mode="r", newline="") as f:  # NOTE(review): no encoding= given, so the platform default applies
-    reader = csv.reader(f)
-    next(reader)  # skip the first row (presumably the header - confirm)
-    for row in reader:
-        bin_value = int(row[4])  # 'Bin' column
-        bin_counts[bin_value] += 1
-# Sort the bins
-bins = sorted(bin_counts.keys())
-counts = [bin_counts[b] for b in bins]
-# Create the bar chart
-plt.figure(figsize=(10, 6))
-bars = plt.bar([str(b) for b in bins], counts, color='skyblue')  # bins passed as strings for the x axis
-# Add the counts above the bars
-for bar in bars:
-    height = bar.get_height()
-    plt.text(bar.get_x() + bar.get_width()/2, height + 0.5, str(height),
-             ha='center', va='bottom', fontsize=10)
-plt.title("Number of words by Bin")
-plt.xlabel("Bin")
-plt.ylabel("Number of words")
-plt.grid(axis='y', linestyle='--', alpha=0.7)
-plt.tight_layout()
-plt.show()