Commit 05f9e714 authored by Elias's avatar Elias

query preparation part 2

parent 41a5c54f
......@@ -89,6 +89,12 @@ python3 hierarchiqueAvecDerniereDistance.py \
```bash
python3 csv_en_mots.py mots.csv
```
### • This script reads the annotation file to find the start and end time of each word given on the command line, cuts the corresponding query audio out of the wav files, and places the clips in Train, Val and Test folders
**decouper_wav_mots_uniquement.py :** Output : wav files for all queries in Train, Val and Test folders
```bash
python3 decouper_wav_mots_uniquement.py annotation_file "mot1" "mot2" "mot3" ...
```
# Tools
......
import os
import re
import ast
import sys
from textgrid import TextGrid
from pydub import AudioSegment
# Names of the dataset partitions looked up in the annotation file.
SPLITS = ("Train", "Val", "Test")

# Directories, relative to the current working directory.
dossier_wav = 'wav'
dossier_tg = 'textGrid'
dossier_extraits = 'extraits_wav_mots'

# Exceptions ast.literal_eval is documented to raise on malformed input.
ERREURS_LITERAL = (ValueError, TypeError, SyntaxError, MemoryError, RecursionError)


def avertir(message):
    """Print a diagnostic on stderr so stdout only carries exported file names."""
    print(message, file=sys.stderr)


def analyser_annotations(contenu):
    """Group the per-split tuple lists of an annotation file by csv name.

    A stripped line ending in ``.csv`` opens a new entry keyed by its first
    whitespace-separated token. Following lines that start with ``Train``,
    ``Val`` or ``Test`` and a colon contribute the raw text of their
    ``[(...)]`` list. Lines seen before the first csv line are ignored.

    Args:
        contenu: full text of the annotation file.

    Returns:
        dict mapping csv name -> dict mapping split name -> list text (str).
    """
    bloc_fichiers = {}
    fichier_courant = None
    for ligne in contenu.splitlines():
        ligne = ligne.strip()
        if ligne.endswith(".csv"):
            fichier_courant = ligne.split()[0]
            bloc_fichiers[fichier_courant] = {}
        elif fichier_courant:
            for split in SPLITS:
                # Accept any spacing before the colon, like the capture
                # pattern below does ("Train :" as well as "Train:").
                if re.match(rf'{split}\s*:', ligne):
                    match = re.search(rf'{split}\s*:\s*(\[\(.*?\)\])', ligne)
                    if match:
                        bloc_fichiers[fichier_courant][split] = match.group(1)
    return bloc_fichiers


def decomposer_nom(nom_csv):
    """Split a csv name such as ``1001-AA.csv`` into ``("1001", "AA")``.

    The speaker part is ``'unknown'`` when the name contains no dash.
    """
    nom_sans_ext = nom_csv.rsplit('.', 1)[0]  # e.g. 1001-AA
    parties = nom_sans_ext.split('-')
    base_nom = parties[0]  # e.g. 1001
    locuteur = parties[1] if len(parties) > 1 else 'unknown'
    return base_nom, locuteur


def lire_segments(data):
    """Parse the text of a list of tuples into ``(start, end)`` float pairs.

    Only the first two items of each entry are used. Entries that do not
    provide two numeric bounds are reported on stderr and skipped instead of
    aborting the whole run.

    Args:
        data: text such as ``"[(0.0, 2.0, 'label'), ...]"``.

    Returns:
        list of ``(float, float)`` pairs, or ``None`` when ``data`` is not a
        valid Python literal sequence.
    """
    try:
        tuples = ast.literal_eval(data)
    except ERREURS_LITERAL:
        return None
    if not isinstance(tuples, (list, tuple)):
        return None
    segments = []
    for element in tuples:
        try:
            segments.append((float(element[0]), float(element[1])))
        except (TypeError, ValueError, IndexError, KeyError):
            avertir(f"Ignoring malformed segment: {element!r}")
    return segments


def trouver_occurrences(intervals, segments, mot_cible):
    """Return the intervals labelled ``mot_cible`` lying fully inside a segment.

    Labels are compared after stripping and lower-casing. Results are ordered
    segment by segment; an interval contained in several segments appears
    once per segment.

    Args:
        intervals: objects exposing ``mark``, ``minTime`` and ``maxTime``.
        segments: iterable of ``(start, end)`` pairs in the same time unit.
        mot_cible: lower-case word to look for.
    """
    # Normalise each label once instead of once per segment.
    candidats = [
        interval for interval in intervals
        if (interval.mark or "").strip().lower() == mot_cible
    ]
    occurrences = []
    for debut_seg, fin_seg in segments:
        for interval in candidats:
            if debut_seg <= interval.minTime and interval.maxTime <= fin_seg:
                occurrences.append(interval)
    return occurrences


def extraire_mot(segments, split_type, mot_cible, word_tier, audio, base_nom, locuteur):
    """Export every occurrence of ``mot_cible`` found inside ``segments``.

    Clips are written to ``extraits_wav_mots/<mot_cible>/<split_type>/`` and
    the name of each written file is printed on stdout. The output directory
    is created even when nothing ends up being exported.

    Args:
        segments: ``(start, end)`` pairs in seconds, or ``None``.
        split_type: ``"Train"``, ``"Val"`` or ``"Test"``.
        mot_cible: lower-case word to extract.
        word_tier: tier object exposing ``intervals``.
        audio: audio object sliceable in milliseconds with an ``export`` method.
        base_nom: recording identifier used in the file name.
        locuteur: speaker identifier used in the file name.
    """
    # Output layout: extraits_wav_mots/<word>/<Train|Val|Test>
    dossier_split = os.path.join(dossier_extraits, mot_cible, split_type)
    os.makedirs(dossier_split, exist_ok=True)
    if not segments:
        return
    for interval in trouver_occurrences(word_tier.intervals, segments, mot_cible):
        # Exact cut of the word; the audio is sliced in milliseconds.
        start_ms = int(interval.minTime * 1000)
        end_ms = int(interval.maxTime * 1000)
        if end_ms <= start_ms:
            continue
        extrait = audio[start_ms:end_ms]
        extrait_filename = f"{base_nom}-{locuteur}_query_{interval.minTime:.3f}_{interval.maxTime:.3f}.wav"
        extrait_path = os.path.join(dossier_split, extrait_filename)
        try:
            extrait.export(extrait_path, format="wav")
        except Exception as exc:
            # Best effort: report the failure and keep going with other clips.
            avertir(f"Could not export {extrait_path}: {exc}")
            continue
        print(extrait_filename)  # file name only


def traiter_fichier(nom_csv, splits, mots_cibles):
    """Extract all target words for one annotated recording.

    The recording is skipped, with a message on stderr, when its wav or
    TextGrid file is missing or unreadable, or when the expected tier does
    not exist.

    Args:
        nom_csv: csv name from the annotation file, e.g. ``1001-AA.csv``.
        splits: dict mapping split name -> text of its list of tuples.
        mots_cibles: lower-case words to extract.
    """
    base_nom, locuteur = decomposer_nom(nom_csv)
    wav_path = os.path.join(dossier_wav, f"{base_nom}.wav")
    tg_path = os.path.join(dossier_tg, f"{base_nom}.TextGrid")
    for chemin in (wav_path, tg_path):
        if not os.path.exists(chemin):
            avertir(f"Skipping {nom_csv}: missing file {chemin}")
            return
    try:
        audio = AudioSegment.from_wav(wav_path)
        tg = TextGrid.fromFile(tg_path)
    except Exception as exc:
        avertir(f"Skipping {nom_csv}: cannot load audio/TextGrid ({exc})")
        return

    # Tier choice: speaker "YBA" uses tier 4, everyone else tier 1.
    # NOTE(review): presumably these are the word-level tiers - confirm.
    index_tier = 4 if locuteur == "YBA" else 1
    try:
        word_tier = tg[index_tier]
    except IndexError:
        avertir(f"Skipping {nom_csv}: {tg_path} has no tier {index_tier}")
        return

    # Apply to each split.
    for split in SPLITS:
        data = splits.get(split)
        if not data or data == "[]":
            continue
        # Parse once per split rather than once per target word.
        segments = lire_segments(data)
        if segments is None:
            avertir(f"Skipping {split} of {nom_csv}: unreadable segment list")
        for mot in mots_cibles:
            extraire_mot(segments, split, mot, word_tier, audio, base_nom, locuteur)


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        Process exit status: 0 on success, 1 on a usage error or when the
        annotation file cannot be read.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 3:
        print("Usage: python3 decouper_wav_mots_uniquement.py annotation file mot1 mot2 mot3 ...", file=sys.stderr)
        return 1

    dossier_txt = args[1]
    mots_cibles = [m.lower() for m in args[2:]]  # several words allowed

    try:
        with open(dossier_txt, 'r', encoding='utf-8') as f:
            contenu = f.read()
    except (OSError, UnicodeDecodeError) as exc:
        avertir(f"Cannot read annotation file {dossier_txt}: {exc}")
        return 1

    # Process every recording listed in the annotation file.
    for nom_csv, splits in analyser_annotations(contenu).items():
        traiter_fichier(nom_csv, splits, mots_cibles)

    print("\nExtraction terminée...")
    return 0


if __name__ == "__main__":
    sys.exit(main())
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment