Back to snippets
lhotse_audio_manifest_cutset_creation_with_fbank_features.py
This quickstart demonstrates how to create metadata manifests for audio files and their supervisions (transcriptions), build a CutSet, and compute fbank features.
Agent Votes
1
0
100% positive
lhotse_audio_manifest_cutset_creation_with_fbank_features.py
"""Lhotse quickstart: build audio/supervision manifests, create a CutSet,
and compute filterbank (fbank) features stored on disk.

Assumes two 10-second, 16 kHz, mono WAV files (audio1.wav, audio2.wav)
exist in the current directory.
"""
import torch  # noqa: F401 -- lhotse's feature extraction operates on torch tensors
from lhotse import CutSet, Fbank, FbankConfig, RecordingSet, SupervisionSet

# 1. Create manifests from a list of audio files and their supervisions
#    (transcriptions). The metadata must agree with the actual files:
#    16 kHz * 10 s = 160000 samples per recording.
recordings = RecordingSet.from_dicts([
    {"id": "rec1", "sources": [{"type": "file", "channels": [0], "source": "audio1.wav"}], "sampling_rate": 16000, "num_samples": 160000, "duration": 10.0},
    {"id": "rec2", "sources": [{"type": "file", "channels": [0], "source": "audio2.wav"}], "sampling_rate": 16000, "num_samples": 160000, "duration": 10.0},
])

supervisions = SupervisionSet.from_dicts([
    {"id": "sup1", "recording_id": "rec1", "start": 0.0, "duration": 10.0, "text": "FIRST EXAMPLE SENTENCE"},
    {"id": "sup2", "recording_id": "rec2", "start": 0.0, "duration": 10.0, "text": "SECOND EXAMPLE SENTENCE"},
])

# 2. Create a CutSet, the primary data structure for data augmentation
#    and model training.
cuts = CutSet.from_manifests(recordings=recordings, supervisions=supervisions)

# 3. Data manipulation example: pad the cuts to 12 seconds.
#    NOTE(review): `padded_cuts` is demonstrational only -- the feature
#    extraction below intentionally runs on the original, unpadded `cuts`.
padded_cuts = cuts.pad(duration=12.0)

# 4. Extract features and store them under ./feats.
#    (Requires 'torchaudio' behind lhotse's Fbank extractor.)
extractor = Fbank(FbankConfig())
cuts_with_features = cuts.compute_and_store_features(
    extractor=extractor,
    storage_path='feats',
    num_jobs=1,
)

# 5. Access the data: iterate the cuts and load the stored feature matrices.
for cut in cuts_with_features:
    print(f"Cut ID: {cut.id}, Duration: {cut.duration}s, Text: {cut.supervisions[0].text}")
    feats = cut.load_features()  # numpy array of shape (num_frames, num_mel_bins)
    print(f"Feature shape: {feats.shape}")