query preparation part 2

05f9e714 · Elias · 41a5c54f · 05f9e714 · 05f9e714
Commit 05f9e714 authored Sep 01, 2025 by Elias
Hide whitespace changes
Inline Side-by-side

Showing with 112 additions and 0 deletions

README.md README.md +6 -0

decouper_wav_mots_uniquement.py decouper_wav_mots_uniquement.py +106 -0

No files found.
--- a/README.md
+++ b/README.md
@@ -89,6 +89,12 @@ python3 hierarchiqueAvecDerniereDistance.py \
 ```bash
 python3 csv_en_mots.py mots.csv
 ```
+### • This script reads the annotation file to find the start and end time of each word given on the command line, cuts the matching query audio out of the wav files, and places the clips in Train, Val and Test folders
+**decouper_wav_mots_uniquement.py :** Output : wav files for all queries in Train, Val and Test folders
+
+```bash
+python3 decouper_wav_mots_uniquement.py annotation_file "mot1" "mot2" "mot3" ...
+```

 # Tools


--- a/decouper_wav_mots_uniquement.py
+++ b/decouper_wav_mots_uniquement.py
+import os
+import re
+import ast
+import sys
+from textgrid import TextGrid
+from pydub import AudioSegment
+
+# At least two arguments are required: the annotation file, then one or
+# more target words.
+if len(sys.argv) < 3:
+    print("Usage: python3 decouper_wav_mots_uniquement.py annotation file mot1 mot2 mot3 ...")
+    sys.exit(1)
+
+# NOTE: despite its name ("dossier" = folder), this is the path of the
+# annotation text file that is opened below.
+dossier_txt = sys.argv[1]
+# Several target words may be given; each one is lower-cased.
+mots_cibles = [arg.lower() for arg in sys.argv[2:]]
+
+# Directory names
+dossier_wav = 'wav'
+dossier_tg = 'textGrid'
+dossier_extraits = 'extraits_wav_mots'
+
+
+with open(dossier_txt, 'r', encoding='utf-8') as f:
+    contenu = f.read()
+
+# One pre-compiled pattern per split; each captures the "[(...)]" text that
+# follows the "<split> :" label on a line.
+motifs_split = {
+    split: re.compile(rf'{split}\s*:\s*(\[\(.*?\)\])')
+    for split in ("Train", "Val", "Test")
+}
+
+# Maps each ".csv" name to {split name: captured "[(...)]" text}.
+bloc_fichiers = {}
+fichier_courant = None
+
+for ligne in map(str.strip, contenu.splitlines()):
+    if ligne.endswith(".csv"):
+        # A line ending in ".csv" opens a new section; its first
+        # whitespace-separated token becomes the key.
+        fichier_courant = ligne.split()[0]
+        bloc_fichiers[fichier_courant] = {}
+        continue
+    if not fichier_courant:
+        # Nothing to attach this line to until a ".csv" line has been seen.
+        continue
+    for split, motif in motifs_split.items():
+        if not ligne.startswith(f"{split} :"):
+            continue
+        match = motif.search(ligne)
+        if match:
+            bloc_fichiers[fichier_courant][split] = match.group(1)
+
+# Process every recording listed in the annotation file.
+# Diagnostics go to stderr so that stdout keeps carrying only the names of the
+# exported files.
+for nom_csv, splits in bloc_fichiers.items():
+    nom_sans_ext = nom_csv.rsplit('.', 1)[0]     # e.g. 1001-AA
+    parties = nom_sans_ext.split('-')
+    base_nom = parties[0]                        # e.g. 1001
+    locuteur = parties[1] if len(parties) > 1 else 'unknown'
+
+    wav_path = os.path.join(dossier_wav, f"{base_nom}.wav")
+    tg_path = os.path.join(dossier_tg, f"{base_nom}.TextGrid")
+
+    # A recording with a missing input is still skipped, but no longer
+    # silently.
+    manquants = [p for p in (wav_path, tg_path) if not os.path.exists(p)]
+    if manquants:
+        print(f"[skip] {nom_csv}: missing {', '.join(manquants)}", file=sys.stderr)
+        continue
+
+    try:
+        audio = AudioSegment.from_wav(wav_path)
+        tg = TextGrid.fromFile(tg_path)
+    except Exception as exc:
+        # Best effort: one unreadable recording must not abort the whole run.
+        print(f"[skip] {nom_csv}: cannot load wav/TextGrid ({exc})", file=sys.stderr)
+        continue
+
+    # Tier selection: index 4 for speaker "YBA", index 1 for everyone else.
+    tier_index = 4 if locuteur == "YBA" else 1
+    try:
+        word_tier = tg[tier_index]
+    except IndexError:
+        # A TextGrid with too few tiers used to crash the script here.
+        print(f"[skip] {nom_csv}: {tg_path} has no tier at index {tier_index}", file=sys.stderr)
+        continue
+
+    def extraire_mot(data, split_type, mot_cible):
+        """Export every occurrence of ``mot_cible`` lying inside the segments of ``data``.
+
+        Args:
+            data: Text of a Python literal list of 3-tuples whose first two
+                fields are the start and end time of a segment, as captured
+                from the annotation file.
+            split_type: Split name ("Train", "Val" or "Test"); used as the
+                last component of the output directory.
+            mot_cible: Lower-cased word to look for in the word tier; also
+                used as a directory name.
+
+        Clips are written to ``<dossier_extraits>/<mot_cible>/<split_type>/``
+        and the name of each written file is printed on stdout. The function
+        reads ``audio``, ``word_tier``, ``base_nom`` and ``locuteur`` from
+        the enclosing loop iteration, so it must be called within that same
+        iteration.
+        """
+        # Output layout: extraits_wav_mots/<word>/<Train|Val|Test>
+        dossier_split = os.path.join(dossier_extraits, mot_cible, split_type)
+        os.makedirs(dossier_split, exist_ok=True)
+
+        try:
+            tuples = ast.literal_eval(data)
+        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as exc:
+            print(f"[skip] {nom_csv} {split_type}: unreadable segment list ({exc})", file=sys.stderr)
+            return
+
+        # Filter the tier once per word instead of rescanning it per segment.
+        candidats = [
+            interval for interval in word_tier.intervals
+            if interval.mark.strip().lower() == mot_cible
+        ]
+        if not candidats:
+            return
+
+        for segment in tuples:
+            try:
+                xmin_seg, xmax_seg, _ = segment
+                xmin_seg = float(xmin_seg)
+                xmax_seg = float(xmax_seg)
+            except (TypeError, ValueError):
+                # Malformed entry: report it and keep going with the others.
+                print(f"[skip] {nom_csv} {split_type}: bad segment {segment!r}", file=sys.stderr)
+                continue
+
+            for interval in candidats:
+                # Keep only words lying entirely inside the segment.
+                if not (xmin_seg <= interval.minTime and interval.maxTime <= xmax_seg):
+                    continue
+
+                # Cut exactly on the word boundaries. round() rather than
+                # int(): float error (1.005 * 1000 == 1004.999...) would
+                # otherwise shift a boundary by one millisecond.
+                start_ms = round(interval.minTime * 1000)
+                end_ms = round(interval.maxTime * 1000)
+                if end_ms <= start_ms:
+                    continue
+
+                extrait = audio[start_ms:end_ms]
+
+                extrait_filename = f"{base_nom}-{locuteur}_query_{interval.minTime:.3f}_{interval.maxTime:.3f}.wav"
+                extrait_path = os.path.join(dossier_split, extrait_filename)
+
+                try:
+                    # export() returns the open output file; close it
+                    # explicitly instead of relying on garbage collection.
+                    extrait.export(extrait_path, format="wav").close()
+                except Exception as exc:
+                    print(f"[fail] {extrait_path}: {exc}", file=sys.stderr)
+                    continue
+                print(extrait_filename)  # file name only
+
+    # Apply to every split that has a non-empty segment list
+    for split in ("Train", "Val", "Test"):
+        data_split = splits.get(split)
+        if not data_split or data_split == "[]":
+            continue
+        for mot in mots_cibles:
+            extraire_mot(data_split, split, mot)
+
+print("\nExtraction terminée...")
+