From 6e74865573df6c70efe19c04ba88a461e5a884c5 Mon Sep 17 00:00:00 2001
From: Blinorot <pmgrin.work@gmail.com>
Date: Sat, 19 Nov 2022 23:50:01 +0300
Subject: [PATCH 1/3] Russian Datasets Added

---
 hw_asr/datasets/__init__.py               |   6 +-
 hw_asr/datasets/ljspeech_dataset.py       |   2 +-
 hw_asr/datasets/ru_commonvoice_dataset.py | 126 ++++++++++++++++++++++
 hw_asr/datasets/ru_golos_dataset.py       | 124 +++++++++++++++++++++
 4 files changed, 256 insertions(+), 2 deletions(-)
 create mode 100644 hw_asr/datasets/ru_commonvoice_dataset.py
 create mode 100644 hw_asr/datasets/ru_golos_dataset.py

diff --git a/hw_asr/datasets/__init__.py b/hw_asr/datasets/__init__.py
index 2644f29..dc65069 100644
--- a/hw_asr/datasets/__init__.py
+++ b/hw_asr/datasets/__init__.py
@@ -2,10 +2,14 @@
 from hw_asr.datasets.custom_dir_audio_dataset import CustomDirAudioDataset
 from hw_asr.datasets.librispeech_dataset import LibrispeechDataset
 from hw_asr.datasets.ljspeech_dataset import LJspeechDataset
+from hw_asr.datasets.ru_commonvoice_dataset import RuCommonVoiceDataset
+from hw_asr.datasets.ru_golos_dataset import GolosDataset
 
 __all__ = [
     "LibrispeechDataset",
     "CustomDirAudioDataset",
     "CustomAudioDataset",
-    "LJspeechDataset"
+    "LJspeechDataset",
+    "RuCommonVoiceDataset",
+    "GolosDataset"
 ]
diff --git a/hw_asr/datasets/ljspeech_dataset.py b/hw_asr/datasets/ljspeech_dataset.py
index adf9430..bc8f575 100644
--- a/hw_asr/datasets/ljspeech_dataset.py
+++ b/hw_asr/datasets/ljspeech_dataset.py
@@ -81,7 +81,7 @@ def _create_index(self, part):
                     w_id = line.split('|')[0]
                     w_text = " ".join(line.split('|')[1:]).strip()
                     wav_path = wav_dir / f"{w_id}.wav"
-                    if not wav_path.exists(): # elem in another part
+                    if not wav_path.exists(): # elem is in another part
                         continue
                     t_info = torchaudio.info(str(wav_path))
                     length = t_info.num_frames / t_info.sample_rate
diff --git a/hw_asr/datasets/ru_commonvoice_dataset.py b/hw_asr/datasets/ru_commonvoice_dataset.py
new file mode 100644
index 0000000..8119d7d
--- /dev/null
+++ b/hw_asr/datasets/ru_commonvoice_dataset.py
@@ -0,0 +1,126 @@
+import concurrent.futures as cf
+import json
+import logging
+import os
+import shutil
+from asyncio import as_completed
+from pathlib import Path
+
+import pandas as pd
+import torch
+import torchaudio
+from hw_asr.base.base_dataset import BaseDataset
+from hw_asr.utils import ROOT_PATH
+from speechbrain.utils.data_utils import download_file
+from tqdm import tqdm
+
+logger = logging.getLogger(__name__)
+
+class RuCommonVoiceDataset(BaseDataset):
+    def __init__(self, part, data_dir=None, use_vad=False, *args, **kwargs):
+        """
+        :param part: which part of dataset to use
+        :param data_dir: Path object with the path to data folder
+        :param use_vad: whether to preprocess all audios with Voice Activity Detector
+            in order to cut silence at the beginning and end of the audio
+        """
+        if data_dir is None:
+            data_dir = ROOT_PATH / "data" / "datasets" / "ru_commonvoice"
+            data_dir.mkdir(exist_ok=True, parents=True)
+        self._data_dir = data_dir
+        index = self._get_or_load_index(part, use_vad)
+
+        super().__init__(index, *args, **kwargs)
+
+    def _load_part(self, part):
+        df = pd.read_csv(str(self._data_dir / f'{part}.tsv'), sep='\t')
+        for _, row in df.iterrows():
+            f_name = row['path']
+            file_path = self._data_dir / 'clips' / f_name
+            shutil.move(str(file_path), str(self._data_dir / part / f_name))
+
+    def _load_dataset(self):
+        arch_path = self._data_dir / "cv-corpus-11.0-2022-09-21-ru.tar.gz"
+
+        # url wget is not supported due to email confirmation needed
+        assert arch_path.exists(), "please download RU Common Voice 11.0 from the official website"
+        print(f"Loading RU Common Voice 11.0")
+
+        shutil.unpack_archive(arch_path, self._data_dir)
+        for fpath in (self._data_dir / "cv-corpus-11.0-2022-09-21/ru").iterdir():
+            shutil.move(str(fpath), str(self._data_dir / fpath.name))
+        os.remove(str(arch_path))
+        shutil.rmtree(str(self._data_dir / "cv-corpus-11.0-2022-09-21"))
+
+        (self._data_dir / "train").mkdir(exist_ok=True, parents=True)
+        (self._data_dir / "dev").mkdir(exist_ok=True, parents=True)
+        (self._data_dir / "test").mkdir(exist_ok=True, parents=True)
+        
+        self._load_part("train")
+        self._load_part("dev")
+        self._load_part("test")
+
+        shutil.rmtree(str(self._data_dir / "clips"))
+
+
+    def _get_or_load_index(self, part, use_vad):
+        if use_vad:
+            index_path = self._data_dir / f"{part}_vad_index.json"
+        else:
+            index_path = self._data_dir / f"{part}_index.json"
+        if index_path.exists():
+            with index_path.open() as f:
+                index = json.load(f)
+        else:
+            index = self._create_index(part, use_vad)
+            with index_path.open("w") as f:
+                json.dump(index, f, indent=2)
+        return index
+
+    def _create_index(self, part, use_vad):
+        index = []
+        split_dir = self._data_dir / part
+        if not split_dir.exists():
+            self._load_dataset()
+
+        mp3_dirs = set()
+        for dirpath, dirnames, filenames in os.walk(str(split_dir)):
+            if any([f.endswith(".mp3") for f in filenames]):
+                mp3_dirs.add(dirpath)
+        for mp3_dir in tqdm(
+                list(mp3_dirs), desc=f"Preparing ru common voice folders: {part}"
+        ):
+            torchaudio.set_audio_backend('sox_io')
+            mp3_dir = Path(mp3_dir)
+            trans_path = self._data_dir / f"{part}.tsv"
+            df = pd.read_csv(trans_path, sep='\t')
+            with cf.ThreadPoolExecutor(max_workers=100) as executor: 
+                future_to_dict =  {executor.submit(add_to_index, mp3_dir, row, use_vad): row\
+                                   for _, row in df.iterrows()}
+                for future in cf.as_completed(future_to_dict):
+                    index.append(future.result())
+        return index    
+
+
+def add_to_index(mp3_dir, row, use_vad):
+    m_id = row['path']
+    m_text = row['sentence'].strip()
+    mp3_path = mp3_dir / m_id
+    if use_vad:
+        audio_tensor, sr = torchaudio.load(str(mp3_path))
+        # Common voice has too much noise and silence at the start and end
+        audio_tensor = torchaudio.functional.vad(audio_tensor, sr, pre_trigger_time=0.15) # cut leading silence
+        audio_tensor = torch.flip(audio_tensor, [0, 1])
+        audio_tensor = torchaudio.functional.vad(audio_tensor, sr, pre_trigger_time=0.15) # cut ending silence
+        audio_tensor = torch.flip(audio_tensor, [0, 1])
+        mp3_path = Path(str(mp3_path)[:-4] + "_vad.mp3")
+        torchaudio.save(str(mp3_path), audio_tensor, sr)
+
+    t_info = torchaudio.info(str(mp3_path))
+    length = t_info.num_frames / t_info.sample_rate
+    res_dict= {
+              "path": str(mp3_path.absolute().resolve()),
+              "text": m_text.lower(),
+              "audio_len": length,
+    }
+    return res_dict
diff --git a/hw_asr/datasets/ru_golos_dataset.py b/hw_asr/datasets/ru_golos_dataset.py
new file mode 100644
index 0000000..6c696b7
--- /dev/null
+++ b/hw_asr/datasets/ru_golos_dataset.py
@@ -0,0 +1,124 @@
+import json
+import logging
+import os
+import shutil
+from pathlib import Path
+
+import jsonlines
+import pandas as pd
+import torchaudio
+from hw_asr.base.base_dataset import BaseDataset
+from hw_asr.utils import ROOT_PATH
+from regex import R
+from speechbrain.utils.data_utils import download_file
+from tqdm import tqdm
+
+logger = logging.getLogger(__name__)
+
+URL_LINKS = {
+    "farfield": "https://sc.link/1Z3",
+    "train_1": "https://sc.link/MvQ",
+    "train_2": "https://sc.link/NwL",
+    "train_3": "https://sc.link/Oxg",
+    "train_4": "https://sc.link/Pyz",
+    "train_5": "https://sc.link/Qz7",
+    "train_6": "https://sc.link/RAL",
+    "train_7": "https://sc.link/VG5",
+    "train_8": "https://sc.link/WJW",
+    "train_9": "https://sc.link/XKk", 
+}
+
+class GolosDataset(BaseDataset):
+    def __init__(self, part, names=("crowd7", "crowd8", "crowd9"), data_dir=None, *args, **kwargs):
+        """
+        :param part: which part of dataset to use (only train is supported)
+        :param names: which parts of the train split to use (crowd{i} or farfield),
+            crowd0 is not supported; immutable default avoids a shared mutable argument
+        :param data_dir: Path object with the path to data folder
+        """
+        if data_dir is None:
+            data_dir = ROOT_PATH / "data" / "datasets" / "ru_golos"
+            data_dir.mkdir(exist_ok=True, parents=True)
+        self._data_dir = data_dir
+        index = self._get_or_load_index(part, names)
+
+        super().__init__(index, *args, **kwargs)
+
+    def _load_dataset(self, name):
+        """
+        Download (if the archive is absent) and unpack one GOLOS train archive, then delete the archive.
+        :param name: "farfield" or "crowd{i}"; for crowd parts the last character picks the archive
+        """
+        print(f"Loading GOLOS_{name}")
+
+        if (self._data_dir / "train" / name).exists():
+            return
+        if (self._data_dir / "train" / "crowd" / f"{name[-1]}").exists():
+            return
+
+        url_name = name if name == "farfield" else f"train_{name[-1]}"
+        arch_path = self._data_dir / f"{url_name}.tar"
+        if not arch_path.exists():
+            download_file(URL_LINKS[url_name], arch_path)
+        # always unpack: a pre-downloaded archive must not be removed below without being extracted
+        shutil.unpack_archive(arch_path, self._data_dir)
+        if name[-1] == "9":  # presumably only the crowd9 archive carries manifest.jsonl; move it to the data root
+            shutil.move(str(self._data_dir / "train" / "manifest.jsonl"), str(self._data_dir / "manifest.jsonl"))
+        os.remove(str(arch_path))
+
+    def _get_or_load_index(self, part, names):
+        index_path = self._data_dir / f"{part}_{'_'.join(names)}_index.json"
+        if index_path.exists():
+            with index_path.open() as f:
+                index = json.load(f)
+        else:
+            index = self._create_index(part, names)
+            with index_path.open("w") as f:
+                json.dump(index, f, indent=2)
+        return index
+
+    def _create_index(self, part, names):
+        index = []
+        split_dir = self._data_dir / part
+        for name in names:
+            if name == "farfield":
+                if not (split_dir / name).exists():
+                    self._load_dataset(name)
+            elif not (split_dir / "crowd" / f"{name[-1]}").exists():
+                self._load_dataset(name)
+
+        wav_dirs = set()
+        for dirpath, dirnames, filenames in os.walk(str(split_dir)):
+            if any([f.endswith(".wav") for f in filenames]):
+                wav_dirs.add(dirpath)
+        for wav_dir in tqdm(
+                list(wav_dirs), desc=f"Preparing golos folders: {part}"
+        ):
+            wav_dir = Path(wav_dir)
+            trans_path = self._data_dir / "manifest.jsonl"
+            assert trans_path.exists(), "download crowd9 first"
+            with jsonlines.open(str(trans_path)) as reader:
+                for obj in reader.iter(type=dict):
+                    if "farfield" not in str(wav_dir):
+                        path_check = f"crowd/{str(wav_dir)[-1]}"
+                        if f"crowd{str(wav_dir)[-1]}" not in names:
+                            continue
+                    else:
+                        path_check = "farfield"
+                        if "farfield" not in names:
+                            continue
+                    if  path_check not in obj["audio_filepath"]:
+                        continue
+                    w_id = obj['id'] + ".wav"
+                    w_text = obj['text'].strip()
+                    wav_path = wav_dir / w_id
+                    t_info = torchaudio.info(str(wav_path))
+                    length = t_info.num_frames / t_info.sample_rate
+                    index.append(
+                        {
+                            "path": str(wav_path.absolute().resolve()),
+                            "text": w_text.lower(),
+                            "audio_len": length,
+                        }
+                    )
+        return index

From eb9260d0a8646b183aabbbdd4e3f114ed4d8edf5 Mon Sep 17 00:00:00 2001
From: Blinorot <pmgrin.work@gmail.com>
Date: Sun, 20 Nov 2022 00:18:56 +0300
Subject: [PATCH 2/3] Requirements Fix

---
 hw_asr/datasets/ru_golos_dataset.py | 1 -
 requirements.txt                    | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw_asr/datasets/ru_golos_dataset.py b/hw_asr/datasets/ru_golos_dataset.py
index 6c696b7..1a89bce 100644
--- a/hw_asr/datasets/ru_golos_dataset.py
+++ b/hw_asr/datasets/ru_golos_dataset.py
@@ -9,7 +9,6 @@
 import torchaudio
 from hw_asr.base.base_dataset import BaseDataset
 from hw_asr.utils import ROOT_PATH
-from regex import R
 from speechbrain.utils.data_utils import download_file
 from tqdm import tqdm
 
diff --git a/requirements.txt b/requirements.txt
index 3ff2b33..be6856f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,3 +14,4 @@ wandb
 pyctcdecode
 torchaudio~=0.11.0
 pillow
+jsonlines
\ No newline at end of file

From 85a78fb6064e309e4375756fdd9beddc6a76a1fd Mon Sep 17 00:00:00 2001
From: Blinorot <pmgrin.work@gmail.com>
Date: Sun, 20 Nov 2022 00:20:46 +0300
Subject: [PATCH 3/3] Init Fix

---
 hw_asr/datasets/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw_asr/datasets/__init__.py b/hw_asr/datasets/__init__.py
index dc65069..934639e 100644
--- a/hw_asr/datasets/__init__.py
+++ b/hw_asr/datasets/__init__.py
@@ -1,3 +1,4 @@
+from hw_asr.datasets.common_voice import CommonVoiceDataset
 from hw_asr.datasets.custom_audio_dataset import CustomAudioDataset
 from hw_asr.datasets.custom_dir_audio_dataset import CustomDirAudioDataset
 from hw_asr.datasets.librispeech_dataset import LibrispeechDataset
@@ -12,4 +13,5 @@
     "LJspeechDataset",
     "RuCommonVoiceDataset",
-    "GolosDataset"
+    "GolosDataset",
+    "CommonVoiceDataset"
 ]