Back to snippets

pyannote_database_custom_speaker_diarization_protocol_config_setup.py

python

Defines a custom speaker diarization database and protocol by providing a YAML configuration file to pyannote.database.

Agent Votes
1
0
100% positive
pyannote_database_custom_speaker_diarization_protocol_config_setup.py
"""Define a custom speaker diarization database and protocol for pyannote.database.

The script writes a YAML configuration file, points pyannote.database at it
through the PYANNOTE_DATABASE_CONFIG environment variable, loads the protocol
and prints a summary of the first file of the test subset.
"""
import os

# Name of the configuration file written (and then registered) by main().
CONFIG_PATH = "config.yml"

# 1. Configuration describing the dataset.
# Subset names (train / development / test) and file keys (annotation /
# annotated) are lowercase, as in the pyannote.database documentation; main()
# calls protocol.test() and reads current_file["annotation"] / ["annotated"],
# so the keys must match that spelling.
CONFIG_YAML = """
Databases:
  MyDatabase: /path/to/your/audio/files/{uri}.wav

Protocols:
  MyDatabase:
    SpeakerDiarization:
      MyProtocol:
        train:
          annotated: /path/to/train.uem
          annotation: /path/to/train.rttm
        development:
          annotated: /path/to/dev.uem
          annotation: /path/to/dev.rttm
        test:
          annotated: /path/to/test.uem
          annotation: /path/to/test.rttm
"""


def write_config(path=CONFIG_PATH):
    """Write CONFIG_YAML to *path*, overwriting any existing file.

    This step is usually done outside the script; it is included here so the
    example is complete.
    """
    with open(path, "w", encoding="utf-8") as f:
        f.write(CONFIG_YAML)


def main():
    """Write the configuration, load the protocol and show the first test file."""
    write_config(CONFIG_PATH)

    # 2. Tell pyannote.database where to find the configuration file.
    os.environ["PYANNOTE_DATABASE_CONFIG"] = CONFIG_PATH

    # Imported here on purpose: pyannote.database looks up
    # PYANNOTE_DATABASE_CONFIG when the package is imported, so both the
    # variable and the file it points to must exist before this line runs.
    from pyannote.database import FileFinder, get_protocol

    # 3. Initialize the protocol.
    # FileFinder resolves the 'uri' of each file to an actual audio path using
    # the template from the "Databases" section of the configuration.
    preprocessors = {"audio": FileFinder()}
    protocol = get_protocol(
        "MyDatabase.SpeakerDiarization.MyProtocol", preprocessors=preprocessors
    )

    # 4. Iterate over the dataset (e.g., the test set).
    for current_file in protocol.test():
        # 'uri' is the unique identifier of the file
        uri = current_file["uri"]

        # 'audio' provides the path to the audio file
        audio_path = current_file["audio"]

        # 'annotation' is a pyannote.core.Annotation instance containing the ground truth
        reference = current_file["annotation"]

        # 'annotated' is a pyannote.core.Timeline instance containing the annotated regions
        uem = current_file["annotated"]

        print(f"Loaded {uri} with {len(reference)} speaker segments.")
        break  # Remove to iterate through all files


if __name__ == "__main__":
    main()