From 6e74865573df6c70efe19c04ba88a461e5a884c5 Mon Sep 17 00:00:00 2001
From: Blinorot
Date: Sat, 19 Nov 2022 23:50:01 +0300
Subject: [PATCH 1/3] Russia Datasets Added

---
 hw_asr/datasets/__init__.py               |   6 +-
 hw_asr/datasets/ljspeech_dataset.py       |   2 +-
 hw_asr/datasets/ru_commonvoice_dataset.py | 142 ++++++++++++++++++++++
 hw_asr/datasets/ru_golos_dataset.py       | 140 +++++++++++++++++++++
 4 files changed, 288 insertions(+), 2 deletions(-)
 create mode 100644 hw_asr/datasets/ru_commonvoice_dataset.py
 create mode 100644 hw_asr/datasets/ru_golos_dataset.py

diff --git a/hw_asr/datasets/__init__.py b/hw_asr/datasets/__init__.py
index 2644f29..dc65069 100644
--- a/hw_asr/datasets/__init__.py
+++ b/hw_asr/datasets/__init__.py
@@ -2,10 +2,14 @@
 from hw_asr.datasets.custom_dir_audio_dataset import CustomDirAudioDataset
 from hw_asr.datasets.librispeech_dataset import LibrispeechDataset
 from hw_asr.datasets.ljspeech_dataset import LJspeechDataset
+from hw_asr.datasets.ru_commonvoice_dataset import RuCommonVoiceDataset
+from hw_asr.datasets.ru_golos_dataset import GolosDataset
 
 __all__ = [
     "LibrispeechDataset",
     "CustomDirAudioDataset",
     "CustomAudioDataset",
-    "LJspeechDataset"
+    "LJspeechDataset",
+    "RuCommonVoiceDataset",
+    "GolosDataset"
 ]
diff --git a/hw_asr/datasets/ljspeech_dataset.py b/hw_asr/datasets/ljspeech_dataset.py
index adf9430..bc8f575 100644
--- a/hw_asr/datasets/ljspeech_dataset.py
+++ b/hw_asr/datasets/ljspeech_dataset.py
@@ -81,7 +81,7 @@ def _create_index(self, part):
                     w_id = line.split('|')[0]
                     w_text = " ".join(line.split('|')[1:]).strip()
                     wav_path = wav_dir / f"{w_id}.wav"
-                    if not wav_path.exists():  # elem in another part
+                    if not wav_path.exists():  # elem is in another part
                         continue
                     t_info = torchaudio.info(str(wav_path))
                     length = t_info.num_frames / t_info.sample_rate
diff --git a/hw_asr/datasets/ru_commonvoice_dataset.py b/hw_asr/datasets/ru_commonvoice_dataset.py
new file mode 100644
index 0000000..8119d7d
--- /dev/null
+++ b/hw_asr/datasets/ru_commonvoice_dataset.py
@@ -0,0 +1,142 @@
+import concurrent.futures as cf
+import json
+import logging
+import os
+import shutil
+from asyncio import as_completed
+from pathlib import Path
+
+import pandas as pd
+import torch
+import torchaudio
+from hw_asr.base.base_dataset import BaseDataset
+from hw_asr.utils import ROOT_PATH
+from speechbrain.utils.data_utils import download_file
+from tqdm import tqdm
+
+logger = logging.getLogger(__name__)
+
+
+class RuCommonVoiceDataset(BaseDataset):
+    """Russian Common Voice 11.0 dataset built from a cached JSON index."""
+
+    def __init__(self, part, data_dir=None, use_vad=False, *args, **kwargs):
+        """
+        :param part: which part of dataset to use
+        :param data_dir: Path object with the path to data folder
+        :param use_vad: whether to preprocess all audios with Voice Activity Detector
+            in order to cut silence at the beginning and end of the audio
+        """
+        if data_dir is None:
+            data_dir = ROOT_PATH / "data" / "datasets" / "ru_commonvoice"
+            data_dir.mkdir(exist_ok=True, parents=True)
+        self._data_dir = data_dir
+        index = self._get_or_load_index(part, use_vad)
+
+        super().__init__(index, *args, **kwargs)
+
+    def _load_part(self, part):
+        """Move every clip listed in `{part}.tsv` from `clips/` into `{part}/`."""
+        df = pd.read_csv(str(self._data_dir / f'{part}.tsv'), sep='\t')
+        for _, row in df.iterrows():
+            f_name = row['path']
+            file_path = self._data_dir / 'clips' / f_name
+            shutil.move(str(file_path), str(self._data_dir / part / f_name))
+
+    def _load_dataset(self):
+        """Unpack the manually downloaded archive and split clips by part."""
+        arch_path = self._data_dir / "cv-corpus-11.0-2022-09-21-ru.tar.gz"
+
+        # url wget is not supported due to email confirmation needed
+        assert arch_path.exists(), "please download RU Common Voice 11.0 from the official website"
+        print("Loading RU Common Voice 11.0")
+
+        shutil.unpack_archive(arch_path, self._data_dir)
+        for fpath in (self._data_dir / "cv-corpus-11.0-2022-09-21/ru").iterdir():
+            shutil.move(str(fpath), str(self._data_dir / fpath.name))
+        os.remove(str(arch_path))
+        shutil.rmtree(str(self._data_dir / "cv-corpus-11.0-2022-09-21"))
+
+        for part in ("train", "dev", "test"):
+            (self._data_dir / part).mkdir(exist_ok=True, parents=True)
+        for part in ("train", "dev", "test"):
+            self._load_part(part)
+
+        # clips listed in the three split files were moved out above; drop the rest
+        shutil.rmtree(str(self._data_dir / "clips"))
+
+    def _get_or_load_index(self, part, use_vad):
+        """Return the cached index for `part`, creating and saving it if absent."""
+        if use_vad:
+            index_path = self._data_dir / f"{part}_vad_index.json"
+        else:
+            index_path = self._data_dir / f"{part}_index.json"
+        if index_path.exists():
+            with index_path.open() as f:
+                index = json.load(f)
+        else:
+            index = self._create_index(part, use_vad)
+            with index_path.open("w") as f:
+                json.dump(index, f, indent=2)
+        return index
+
+    def _create_index(self, part, use_vad):
+        """Build the list of `{path, text, audio_len}` entries for `part`."""
+        index = []
+        split_dir = self._data_dir / part
+        if not split_dir.exists():
+            self._load_dataset()
+
+        mp3_dirs = set()
+        for dirpath, dirnames, filenames in os.walk(str(split_dir)):
+            if any(f.endswith(".mp3") for f in filenames):
+                mp3_dirs.add(dirpath)
+        for mp3_dir in tqdm(
+                list(mp3_dirs), desc=f"Preparing ru common voice folders: {part}"
+        ):
+            torchaudio.set_audio_backend('sox_io')
+            mp3_dir = Path(mp3_dir)
+            trans_path = self._data_dir / f"{part}.tsv"
+            df = pd.read_csv(trans_path, sep='\t')
+            # index rows concurrently; results arrive in completion order
+            with cf.ThreadPoolExecutor(max_workers=100) as executor:
+                futures = [
+                    executor.submit(add_to_index, mp3_dir, row, use_vad)
+                    for _, row in df.iterrows()
+                ]
+                for future in cf.as_completed(futures):
+                    index.append(future.result())
+        return index
+
+
+def add_to_index(mp3_dir, row, use_vad):
+    """
+    Build one index entry for a transcript row.
+
+    :param mp3_dir: Path object with the folder that holds the clip
+    :param row: row of the split .tsv file with 'path' and 'sentence' fields
+    :param use_vad: whether to trim the clip with VAD and index the trimmed copy
+    :return: dict with absolute 'path', lower-cased 'text' and 'audio_len' in seconds
+    """
+    m_id = row['path']
+    m_text = row['sentence'].strip()
+    mp3_path = mp3_dir / m_id
+    if use_vad:
+        audio_tensor, sr = torchaudio.load(str(mp3_path))
+        # Common Voice has too much noise and silence at the start and end
+        audio_tensor = torchaudio.functional.vad(audio_tensor, sr, pre_trigger_time=0.15)  # cut leading silence
+        # vad only trims the front, so reverse the audio to trim the tail
+        audio_tensor = torch.flip(audio_tensor, [0, 1])
+        audio_tensor = torchaudio.functional.vad(audio_tensor, sr, pre_trigger_time=0.15)  # cut ending silence
+        audio_tensor = torch.flip(audio_tensor, [0, 1])
+        mp3_path = Path(str(mp3_path)[:-4] + "_vad.mp3")
+        torchaudio.save(str(mp3_path), audio_tensor, sr)
+
+    t_info = torchaudio.info(str(mp3_path))
+    length = t_info.num_frames / t_info.sample_rate
+    res_dict = {
+        "path": str(mp3_path.absolute().resolve()),
+        "text": m_text.lower(),
+        "audio_len": length,
+    }
+    return res_dict
diff --git a/hw_asr/datasets/ru_golos_dataset.py b/hw_asr/datasets/ru_golos_dataset.py
new file mode 100644
index 0000000..6c696b7
--- /dev/null
+++ b/hw_asr/datasets/ru_golos_dataset.py
@@ -0,0 +1,140 @@
+import json
+import logging
+import os
+import shutil
+from pathlib import Path
+
+import jsonlines
+import pandas as pd
+import torchaudio
+from hw_asr.base.base_dataset import BaseDataset
+from hw_asr.utils import ROOT_PATH
+from regex import R
+from speechbrain.utils.data_utils import download_file
+from tqdm import tqdm
+
+logger = logging.getLogger(__name__)
+
+URL_LINKS = {
+    "farfield": "https://sc.link/1Z3",
+    "train_1": "https://sc.link/MvQ",
+    "train_2": "https://sc.link/NwL",
+    "train_3": "https://sc.link/Oxg",
+    "train_4": "https://sc.link/Pyz",
+    "train_5": "https://sc.link/Qz7",
+    "train_6": "https://sc.link/RAL",
+    "train_7": "https://sc.link/VG5",
+    "train_8": "https://sc.link/WJW",
+    "train_9": "https://sc.link/XKk",
+}
+
+
+class GolosDataset(BaseDataset):
+    """Golos dataset (train split) built from a cached JSON index."""
+
+    def __init__(self, part, names=None, data_dir=None, *args, **kwargs):
+        """
+        :param part: which part of dataset to use (only train is supported)
+        :param names: which part of train split to use (crowd{i} or farfield),
+            crowd0 is not supported; defaults to crowd7, crowd8 and crowd9
+        :param data_dir: Path object with the path to data folder
+        """
+        if names is None:
+            # created per call: a list default would be shared between instances
+            names = ["crowd7", "crowd8", "crowd9"]
+        if data_dir is None:
+            data_dir = ROOT_PATH / "data" / "datasets" / "ru_golos"
+            data_dir.mkdir(exist_ok=True, parents=True)
+        self._data_dir = data_dir
+        index = self._get_or_load_index(part, names)
+
+        super().__init__(index, *args, **kwargs)
+
+    def _load_dataset(self, name):
+        """Download (if needed) and unpack the archive that contains `name`."""
+        print(f"Loading GOLOS_{name}")
+
+        if (self._data_dir / "train" / name).exists():
+            return
+        if (self._data_dir / "train" / "crowd" / f"{name[-1]}").exists():
+            return
+
+        if name == "farfield":
+            url_name = name
+        else:
+            url_name = f"train_{name[-1]}"
+
+        arch_path = self._data_dir / f"{url_name}.tar"
+        if not arch_path.exists():
+            download_file(URL_LINKS[url_name], arch_path)
+        shutil.unpack_archive(arch_path, self._data_dir)
+        if name[-1] == "9":
+            # _create_index reads the manifest from the dataset root
+            shutil.move(
+                str(self._data_dir / "train" / "manifest.jsonl"),
+                str(self._data_dir / "manifest.jsonl"),
+            )
+        os.remove(str(arch_path))
+
+    def _get_or_load_index(self, part, names):
+        """Return the cached index for `part` and `names`, creating it if absent."""
+        index_path = self._data_dir / f"{part}_{'_'.join(names)}_index.json"
+        if index_path.exists():
+            with index_path.open() as f:
+                index = json.load(f)
+        else:
+            index = self._create_index(part, names)
+            with index_path.open("w") as f:
+                json.dump(index, f, indent=2)
+        return index
+
+    def _create_index(self, part, names):
+        """Build the list of `{path, text, audio_len}` entries for `names`."""
+        index = []
+        split_dir = self._data_dir / part
+        for name in names:
+            if name == "farfield":
+                if not (split_dir / name).exists():
+                    self._load_dataset(name)
+            elif not (split_dir / "crowd" / f"{name[-1]}").exists():
+                self._load_dataset(name)
+
+        wav_dirs = set()
+        for dirpath, dirnames, filenames in os.walk(str(split_dir)):
+            if any(f.endswith(".wav") for f in filenames):
+                wav_dirs.add(dirpath)
+        for wav_dir in tqdm(
+                list(wav_dirs), desc=f"Preparing golos folders: {part}"
+        ):
+            wav_dir = Path(wav_dir)
+            trans_path = self._data_dir / "manifest.jsonl"
+            assert trans_path.exists(), "download crowd9 first"
+
+            # the subset a folder belongs to does not depend on the manifest
+            # entry, so resolve it once per folder instead of once per entry
+            if "farfield" not in str(wav_dir):
+                path_check = f"crowd/{str(wav_dir)[-1]}"
+                requested = f"crowd{str(wav_dir)[-1]}" in names
+            else:
+                path_check = "farfield"
+                requested = "farfield" in names
+            if not requested:
+                continue
+
+            with jsonlines.open(str(trans_path)) as reader:
+                for obj in reader.iter(type=dict):
+                    if path_check not in obj["audio_filepath"]:
+                        continue
+                    w_id = obj['id'] + ".wav"
+                    w_text = obj['text'].strip()
+                    wav_path = wav_dir / w_id
+                    t_info = torchaudio.info(str(wav_path))
+                    length = t_info.num_frames / t_info.sample_rate
+                    index.append(
+                        {
+                            "path": str(wav_path.absolute().resolve()),
+                            "text": w_text.lower(),
+                            "audio_len": length,
+                        }
+                    )
+        return index

From eb9260d0a8646b183aabbbdd4e3f114ed4d8edf5 Mon Sep 17 00:00:00 2001
From: Blinorot
Date: Sun, 20 Nov 2022 00:18:56 +0300
Subject: [PATCH 2/3] Requirements Fix

---
 hw_asr/datasets/ru_golos_dataset.py | 1 -
 requirements.txt                    | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw_asr/datasets/ru_golos_dataset.py b/hw_asr/datasets/ru_golos_dataset.py
index 6c696b7..1a89bce 100644
--- a/hw_asr/datasets/ru_golos_dataset.py
+++ b/hw_asr/datasets/ru_golos_dataset.py
@@ -9,7 +9,6 @@
 import torchaudio
 from hw_asr.base.base_dataset import BaseDataset
 from hw_asr.utils import ROOT_PATH
-from regex import R
 from speechbrain.utils.data_utils import download_file
 from tqdm import tqdm
 
diff --git a/requirements.txt b/requirements.txt
index 3ff2b33..be6856f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,3 +14,4 @@ wandb
 pyctcdecode
 torchaudio~=0.11.0
 pillow
+jsonlines
\ No newline at end of file

From 85a78fb6064e309e4375756fdd9beddc6a76a1fd Mon Sep 17 00:00:00 2001
From: Blinorot
Date: Sun, 20 Nov 2022 00:20:46 +0300
Subject: [PATCH 3/3] Init Fix

---
 hw_asr/datasets/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw_asr/datasets/__init__.py b/hw_asr/datasets/__init__.py
index dc65069..934639e 100644
--- a/hw_asr/datasets/__init__.py
+++ b/hw_asr/datasets/__init__.py
@@ -1,3 +1,4 @@
+from hw_asr.datasets.common_voice import CommonVoiceDataset
 from hw_asr.datasets.custom_audio_dataset import CustomAudioDataset
 from hw_asr.datasets.custom_dir_audio_dataset import CustomDirAudioDataset
 from hw_asr.datasets.librispeech_dataset import LibrispeechDataset
@@ -12,4 +13,5 @@
     "LJspeechDataset",
     "RuCommonVoiceDataset",
-    "GolosDataset"
+    "GolosDataset",
+    "CommonVoiceDataset",
 ]