From f1e0600d581a0756c80c8974522e87950dc63506 Mon Sep 17 00:00:00 2001
From: Philippe Remy <premy.enseirb@gmail.com>
Date: Wed, 29 Apr 2020 17:14:07 +0900
Subject: [PATCH 1/9] fusion of two models

---
 batcher.py      |  5 ++--
 cli.py          | 10 +++++--
 eval_metrics.py |  5 ----
 test.py         | 77 ++++++++++++++++++++++++++++++++++---------------
 train.py        | 18 ++++++++----
 utils.py        | 14 +++++++++
 6 files changed, 90 insertions(+), 39 deletions(-)

diff --git a/batcher.py b/batcher.py
index 78fe129..c932ffc 100644
--- a/batcher.py
+++ b/batcher.py
@@ -147,8 +147,9 @@ def __init__(self, working_dir: str, max_length: int, model: DeepSpeakerModel):
         self.history_model_inputs = None
 
         self.batch_count = 0
-        for _ in tqdm(range(self.history_length), desc='Initializing the batcher'):  # init history.
-            self.update_triplets_history()
+        if self.model is not None:
+            for _ in tqdm(range(self.history_length), desc='Initializing the batcher'):  # init history.
+                self.update_triplets_history()
 
     def update_triplets_history(self):
         model_inputs = []
diff --git a/cli.py b/cli.py
index e0a1a4b..ceb7b4c 100644
--- a/cli.py
+++ b/cli.py
@@ -3,6 +3,7 @@
 
 import logging
 import os
+import sys
 
 import click
 
@@ -22,7 +23,9 @@
 
 @click.group()
 def cli():
-    logging.basicConfig(format='%(asctime)12s - %(levelname)s - %(message)s', level=logging.INFO)
+    logging.basicConfig(format='%(asctime)12s - %(levelname)s - %(message)s',
+                        level=logging.INFO,
+                        stream=sys.stdout)
     init_pandas()
 
 
@@ -54,8 +57,8 @@ def build_keras_inputs(working_dir, counts_per_speaker):
 
 @cli.command('test-model', short_help='Test a Keras model.')
 @click.option('--working_dir', required=True, type=Ct.input_dir())
-@click.option('--model_name', required=True, type=click.Choice([RES_CNN_NAME, GRU_NAME]))
-@click.option('--checkpoint_file', required=True, type=Ct.input_file())
+@click.option('--model_name', multiple=True, type=click.Choice([RES_CNN_NAME, GRU_NAME]))
+@click.option('--checkpoint_file', multiple=True, required=True, type=Ct.input_file())
 def test_model(working_dir, model_name, checkpoint_file):
     # export CUDA_VISIBLE_DEVICES=0; python cli.py test-model
     # --working_dir /home/philippe/ds-test/triplet-training/
@@ -66,6 +69,7 @@ def test_model(working_dir, model_name, checkpoint_file):
     # --working_dir /home/philippe/ds-test/triplet-training/
     # --checkpoint_file ../ds-test/checkpoints-triplets/ResCNN_checkpoint_175.h5
     # f-measure = 0.849, true positive rate = 0.798, accuracy = 0.997, equal error rate = 0.025
+    assert len(model_name) == len(checkpoint_file)
     test(working_dir, model_name, checkpoint_file)
 
 
diff --git a/eval_metrics.py b/eval_metrics.py
index f9ed8f2..c244b20 100644
--- a/eval_metrics.py
+++ b/eval_metrics.py
@@ -10,7 +10,6 @@ def evaluate(sims, labels):
 
 
 def calculate_roc(thresholds, sims, labels):
-    nrof_pairs = min(len(labels), len(sims))
     nrof_thresholds = len(thresholds)
 
     tprs = np.zeros((nrof_thresholds))
@@ -18,10 +17,6 @@ def calculate_roc(thresholds, sims, labels):
     acc_train = np.zeros((nrof_thresholds))
     precisions = np.zeros((nrof_thresholds))
     fms = np.zeros((nrof_thresholds))
-    accuracy = 0.0
-
-    indices = np.arange(nrof_pairs)
-
     # Find the best threshold for the fold
 
     for threshold_idx, threshold in enumerate(thresholds):
diff --git a/test.py b/test.py
index 3e01af0..a2692d2 100644
--- a/test.py
+++ b/test.py
@@ -1,4 +1,5 @@
 import logging
+from typing import List
 
 import numpy as np
 from tqdm import tqdm
@@ -6,12 +7,15 @@
 from audio import Audio
 from batcher import LazyTripletBatcher
 from constants import NUM_FBANKS, NUM_FRAMES, BATCH_SIZE
-from models import ResCNNModel, select_model_class
 from eval_metrics import evaluate
-from utils import enable_deterministic
+from models import ResCNNModel, select_model_class
+from utils import enable_deterministic, score_fusion, embedding_fusion
 
 logger = logging.getLogger(__name__)
 
+EMBEDDING_FUSION = 0
+SCORE_FUSION = 1
+
 
 def batch_cosine_similarity(x1, x2):
     # https://en.wikipedia.org/wiki/Cosine_similarity
@@ -25,38 +29,63 @@ def batch_cosine_similarity(x1, x2):
     return s
 
 
-def eval_model(working_dir: str, model: ResCNNModel):
+def eval_models(working_dir: str, models: List[ResCNNModel]):
+    if len(models) > 1:  # multiple models -> fusion of results.
+        y_pred_score_fusion = score_fusion(*[run_speaker_verification_task(working_dir, m) for m in models])
+        y_pred_emb_fusion = run_speaker_verification_task(working_dir, models)
+        assert y_pred_score_fusion.shape == y_pred_emb_fusion.shape
+        y_true = np.zeros_like(y_pred_score_fusion)  # positive is at index 0.
+        y_true[:, 0] = 1.0
+        fm_1, tpr_1, acc_1, eer_1 = evaluate(y_pred_score_fusion, y_true)
+        fm_2, tpr_2, acc_2, eer_2 = evaluate(y_pred_emb_fusion, y_true)
+        logger.info(f'[score fusion] f-measure = {fm_1:.3f}, true positive rate = {tpr_1:.3f}, '
+                    f'accuracy = {acc_1:.3f}, equal error rate = {eer_1:.3f}')
+        logger.info(f'[emb fusion] f-measure = {fm_2:.3f}, true positive rate = {tpr_2:.3f}, '
+                    f'accuracy = {acc_2:.3f}, equal error rate = {eer_2:.3f}')
+    else:
+        y_pred = run_speaker_verification_task(working_dir, models)
+        y_true = np.zeros_like(y_pred)  # positive is at index 0.
+        y_true[:, 0] = 1.0
+        fm, tpr, acc, eer = evaluate(y_pred, y_true)
+        logger.info(f'f-measure = {fm:.3f}, true positive rate = {tpr:.3f}, '
+                    f'accuracy = {acc:.3f}, equal error rate = {eer:.3f}')
+
+
+def run_speaker_verification_task(working_dir, model):
     enable_deterministic()
+    embeddings_fusion_cond = isinstance(model, list)
+    if embeddings_fusion_cond:
+        assert len(model) == 2
     audio = Audio(working_dir)
-    batcher = LazyTripletBatcher(working_dir, NUM_FRAMES, model)
-    speakers_list = list(audio.speakers_to_utterances.keys())
+    batcher = LazyTripletBatcher(working_dir, NUM_FRAMES, model=None)
     num_negative_speakers = 99
-    num_speakers = len(speakers_list)
+    num_speakers = len(audio.speaker_ids)
     y_pred = np.zeros(shape=(num_speakers, num_negative_speakers + 1))  # negatives + positive
-    for i, positive_speaker in tqdm(enumerate(speakers_list), desc='test', total=num_speakers):
+    for i, positive_speaker in tqdm(enumerate(audio.speaker_ids), desc='test', total=num_speakers):
         # convention id[0] is anchor speaker, id[1] is positive, id[2:] are negative.
         input_data = batcher.get_speaker_verification_data(positive_speaker, num_negative_speakers)
         # batch size is not relevant. just making sure we don't push too much on the GPU.
-        predictions = model.m.predict(input_data, batch_size=BATCH_SIZE)
+        if embeddings_fusion_cond:
+            predictions = embedding_fusion(model[0].m.predict(input_data, batch_size=BATCH_SIZE),
+                                           model[1].m.predict(input_data, batch_size=BATCH_SIZE))
+        else:
+            predictions = model.m.predict(input_data, batch_size=BATCH_SIZE)
         anchor_embedding = predictions[0]
         for j, other_than_anchor_embedding in enumerate(predictions[1:]):  # positive + negatives
             y_pred[i][j] = batch_cosine_similarity([anchor_embedding], [other_than_anchor_embedding])[0]
-    y_true = np.zeros_like(y_pred)  # positive is at index 0.
-    y_true[:, 0] = 1.0
-    fm, tpr, acc, eer = evaluate(y_pred, y_true)
-    return fm, tpr, acc, eer
+    return y_pred
 
 
-def test(working_dir, model_name, checkpoint_file):
+def test(working_dir, model_names: tuple, checkpoint_files: tuple):
     batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1]
-    dsm = select_model_class(model_name)(batch_input_shape)
-    if checkpoint_file is not None:
-        logger.info(f'Found checkpoint [{checkpoint_file}]. Loading weights...')
-        dsm.m.load_weights(checkpoint_file, by_name=True)
-    else:
-        logger.info(f'Could not find any checkpoint in {checkpoint_file}.')
-        exit(1)
-
-    fm, tpr, acc, eer = eval_model(working_dir, model=dsm)
-    logger.info(f'f-measure = {fm:.3f}, true positive rate = {tpr:.3f}, '
-                f'accuracy = {acc:.3f}, equal error rate = {eer:.3f}')
+    models = []
+    for checkpoint_file, model_name in zip(checkpoint_files, model_names):
+        dsm = select_model_class(model_name)(batch_input_shape)
+        if checkpoint_file is not None:
+            logger.info(f'Found checkpoint [{checkpoint_file}] for [{model_name}]. Loading weights...')
+            dsm.m.load_weights(checkpoint_file, by_name=True)
+        else:
+            logger.info(f'Could not find any checkpoint in {checkpoint_file}.')
+            exit(1)
+        models.append(dsm)
+    eval_models(working_dir, models)
diff --git a/train.py b/train.py
index c9f2695..50202b1 100644
--- a/train.py
+++ b/train.py
@@ -7,7 +7,7 @@
 
 from batcher import KerasFormatConverter, LazyTripletBatcher
 from constants import BATCH_SIZE, CHECKPOINTS_SOFTMAX_DIR, CHECKPOINTS_TRIPLET_DIR, NUM_FRAMES, NUM_FBANKS
-from models import ResCNNModel, DeepSpeakerModel, select_model_class, RES_CNN_NAME
+from models import DeepSpeakerModel, select_model_class, RES_CNN_NAME
 from triplet_loss import deep_speaker_loss
 from utils import load_best_checkpoint, ensures_dir
 
@@ -17,7 +17,8 @@
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 
 
-def fit_model(dsm: DeepSpeakerModel, working_dir: str, max_length: int = NUM_FRAMES, batch_size=BATCH_SIZE):
+def fit_model(dsm: DeepSpeakerModel, working_dir: str, max_length: int = NUM_FRAMES,
+              batch_size: int = BATCH_SIZE, initial_epoch: int = 0):
     batcher = LazyTripletBatcher(working_dir, max_length, dsm)
 
     # build small test set.
@@ -37,9 +38,10 @@ def train_generator():
     checkpoint_name = dsm.m.name + '_checkpoint'
     checkpoint_filename = os.path.join(CHECKPOINTS_TRIPLET_DIR, checkpoint_name + '_{epoch}.h5')
     checkpoint = ModelCheckpoint(monitor='val_loss', filepath=checkpoint_filename, save_best_only=True)
+    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-4, verbose=1)
     dsm.m.fit(x=train_generator(), y=None, steps_per_epoch=2000, shuffle=False,
               epochs=1000, validation_data=test_generator(), validation_steps=len(test_batches),
-              callbacks=[checkpoint])
+              callbacks=[reduce_lr, checkpoint], initial_epoch=initial_epoch)
 
 
 def fit_model_softmax(dsm: DeepSpeakerModel, kx_train, ky_train, kx_test, ky_test,
@@ -83,7 +85,7 @@ def start_training(working_dir, model_name, pre_training_phase=True):
         num_speakers_softmax = len(kc.categorical_speakers.speaker_ids)
         dsm = model_class(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax)
         # ResCNN can train with default Adam LR of 0.001. GRU is more sensitive.
-        lr = 0.001 if model_name == RES_CNN_NAME else 0.00005
+        lr = 0.001 if model_name == RES_CNN_NAME else 0.00003
         logger.info(f'Initial learning rate set to {lr}.')
         dsm.m.compile(optimizer=Adam(learning_rate=lr), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
         pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
@@ -102,12 +104,18 @@ def start_training(working_dir, model_name, pre_training_phase=True):
         pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR)
         if triplet_checkpoint is not None:
             logger.info(f'Loading triplet checkpoint: {triplet_checkpoint}.')
+            initial_epoch = int(triplet_checkpoint.split('/')[-1].split('.')[0].split('_')[-1])
             dsm.m.load_weights(triplet_checkpoint)
         elif pre_training_checkpoint is not None:
             logger.info(f'Loading pre-training checkpoint: {pre_training_checkpoint}.')
             # If `by_name` is True, weights are loaded into layers only if they share the
             # same name. This is useful for fine-tuning or transfer-learning models where
             # some of the layers have changed.
+            initial_epoch = 0
             dsm.m.load_weights(pre_training_checkpoint, by_name=True)
+        else:
+            initial_epoch = 0
+        dsm.m.summary()
+        # TODO: should replace by SGD(learning_rate=0.05, momentum=0.99)
         dsm.m.compile(optimizer=SGD(), loss=deep_speaker_loss)
-        fit_model(dsm, working_dir, NUM_FRAMES)
+        fit_model(dsm, working_dir, max_length=NUM_FRAMES, initial_epoch=initial_epoch)
diff --git a/utils.py b/utils.py
index 6eb35e1..b55b3d9 100644
--- a/utils.py
+++ b/utils.py
@@ -118,3 +118,17 @@ def train_test_sp_to_utt(audio, is_test):
         train_test_sep = int(len(utterances_files) * TRAIN_TEST_RATIO)
         sp_to_utt[speaker_id] = utterances_files[train_test_sep:] if is_test else utterances_files[:train_test_sep]
     return sp_to_utt
+
+
+def embedding_fusion(embeddings_1: np.array, embeddings_2: np.array):
+    assert len(embeddings_1.shape) == 2  # (batch_size, 512).
+    assert embeddings_1.shape == embeddings_2.shape
+    fusion = np.linalg.norm(embeddings_1 + embeddings_2, ord=2, axis=1)
+    return fusion
+
+
+def score_fusion(scores_1: np.array, scores_2: np.array):
+    def normalize_scores(m, epsilon=1e-12):
+        return (m - np.mean(m)) / max(np.std(m), epsilon)
+
+    return normalize_scores(scores_1) + normalize_scores(scores_2)

From aa6cfcdae07c86e3c78b4e4f7e68c0a47da7ceec Mon Sep 17 00:00:00 2001
From: Philippe Remy <premy.enseirb@gmail.com>
Date: Thu, 30 Apr 2020 11:19:07 +0900
Subject: [PATCH 2/9] better fusion

---
 test.py  | 17 ++++++++++-------
 utils.py | 14 +++++++++++---
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/test.py b/test.py
index a2692d2..5a31a51 100644
--- a/test.py
+++ b/test.py
@@ -30,7 +30,7 @@ def batch_cosine_similarity(x1, x2):
 
 
 def eval_models(working_dir: str, models: List[ResCNNModel]):
-    if len(models) > 1:  # multiple models -> fusion of results.
+    if isinstance(models, list) and len(models) > 1:  # multiple models -> fusion of results.
         y_pred_score_fusion = score_fusion(*[run_speaker_verification_task(working_dir, m) for m in models])
         y_pred_emb_fusion = run_speaker_verification_task(working_dir, models)
         assert y_pred_score_fusion.shape == y_pred_emb_fusion.shape
@@ -38,17 +38,17 @@ def eval_models(working_dir: str, models: List[ResCNNModel]):
         y_true[:, 0] = 1.0
         fm_1, tpr_1, acc_1, eer_1 = evaluate(y_pred_score_fusion, y_true)
         fm_2, tpr_2, acc_2, eer_2 = evaluate(y_pred_emb_fusion, y_true)
-        logger.info(f'[score fusion] f-measure = {fm_1:.3f}, true positive rate = {tpr_1:.3f}, '
-                    f'accuracy = {acc_1:.3f}, equal error rate = {eer_1:.3f}')
-        logger.info(f'[emb fusion] f-measure = {fm_2:.3f}, true positive rate = {tpr_2:.3f}, '
-                    f'accuracy = {acc_2:.3f}, equal error rate = {eer_2:.3f}')
+        logger.info(f'[score fusion] f-measure = {fm_1:.5f}, true positive rate = {tpr_1:.5f}, '
+                    f'accuracy = {acc_1:.5f}, equal error rate = {eer_1:.5f}')
+        logger.info(f'[emb fusion] f-measure = {fm_2:.5f}, true positive rate = {tpr_2:.5f}, '
+                    f'accuracy = {acc_2:.5f}, equal error rate = {eer_2:.5f}')
     else:
         y_pred = run_speaker_verification_task(working_dir, models)
         y_true = np.zeros_like(y_pred)  # positive is at index 0.
         y_true[:, 0] = 1.0
         fm, tpr, acc, eer = evaluate(y_pred, y_true)
-        logger.info(f'f-measure = {fm:.3f}, true positive rate = {tpr:.3f}, '
-                    f'accuracy = {acc:.3f}, equal error rate = {eer:.3f}')
+        logger.info(f'[single] f-measure = {fm:.5f}, true positive rate = {tpr:.5f}, '
+                    f'accuracy = {acc:.5f}, equal error rate = {eer:.5f}')
 
 
 def run_speaker_verification_task(working_dir, model):
@@ -89,3 +89,6 @@ def test(working_dir, model_names: tuple, checkpoint_files: tuple):
             exit(1)
         models.append(dsm)
     eval_models(working_dir, models)
+    if len(models) > 1:
+        for model in models:
+            eval_models(working_dir, model)
diff --git a/utils.py b/utils.py
index b55b3d9..8d7aa68 100644
--- a/utils.py
+++ b/utils.py
@@ -123,12 +123,20 @@ def train_test_sp_to_utt(audio, is_test):
 def embedding_fusion(embeddings_1: np.array, embeddings_2: np.array):
     assert len(embeddings_1.shape) == 2  # (batch_size, 512).
     assert embeddings_1.shape == embeddings_2.shape
-    fusion = np.linalg.norm(embeddings_1 + embeddings_2, ord=2, axis=1)
+    embeddings_sum = embeddings_1 + embeddings_2
+    fusion = embeddings_sum / np.linalg.norm(embeddings_sum, ord=2, axis=1, keepdims=True)
+    assert np.all((-1 <= fusion) & (fusion <= 1))
+    assert np.all(abs(np.sum(fusion ** 2, axis=1) - 1) < 1e-6)
     return fusion
 
-
 def score_fusion(scores_1: np.array, scores_2: np.array):
     def normalize_scores(m, epsilon=1e-12):
         return (m - np.mean(m)) / max(np.std(m), epsilon)
 
-    return normalize_scores(scores_1) + normalize_scores(scores_2)
+    # score has to be between -1 and 1.
+    return np.tanh(np.sum(normalize_scores(np.stack((scores_1, scores_2), axis=2)), axis=2))
+
+
+if __name__ == '__main__':
+    score_fusion(np.ones((5, 100)), np.ones((5, 100)))
+

From c535362af41ca238602a7c2ea1a2f8fd5daf035b Mon Sep 17 00:00:00 2001
From: Philippe Remy <premy.enseirb@gmail.com>
Date: Thu, 30 Apr 2020 12:21:58 +0900
Subject: [PATCH 3/9] seed speaker verification sampling and add sklearn-based evaluate2

---
 batcher.py      |  9 ++++++---
 eval_metrics.py | 26 ++++++++++++++++++++++++++
 test.py         | 20 ++++++++++----------
 utils.py        | 11 +++++------
 4 files changed, 47 insertions(+), 19 deletions(-)

diff --git a/batcher.py b/batcher.py
index c932ffc..145856a 100644
--- a/batcher.py
+++ b/batcher.py
@@ -12,7 +12,7 @@
 from audio import pad_mfcc, Audio
 from constants import NUM_FRAMES, NUM_FBANKS
 from models import DeepSpeakerModel
-from utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt
+from utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt, enable_deterministic
 
 logger = logging.getLogger(__name__)
 
@@ -317,14 +317,17 @@ def get_batch_train(self, batch_size):
 
         return batch_x, batch_y
 
-    def get_speaker_verification_data(self, anchor_speaker, num_different_speakers):
-        speakers = list(self.audio.speakers_to_utterances.keys())
+    def get_speaker_verification_data(self, anchor_speaker, num_different_speakers, seed=123):
+        speakers = list(self.audio.speaker_ids)
         anchor_utterances = []
         positive_utterances = []
         negative_utterances = []
+        np.random.seed(seed)
         negative_speakers = np.random.choice(list(set(speakers) - {anchor_speaker}), size=num_different_speakers)
         assert [negative_speaker != anchor_speaker for negative_speaker in negative_speakers]
+        np.random.seed(seed)
         pos_utterances = np.random.choice(self.sp_to_utt_test[anchor_speaker], 2, replace=False)
+        np.random.seed(seed)
         neg_utterances = [np.random.choice(self.sp_to_utt_test[neg], 1, replace=True)[0] for neg in negative_speakers]
         anchor_utterances.append(pos_utterances[0])
         positive_utterances.append(pos_utterances[1])
diff --git a/eval_metrics.py b/eval_metrics.py
index c244b20..7e42f6d 100644
--- a/eval_metrics.py
+++ b/eval_metrics.py
@@ -1,4 +1,30 @@
 import numpy as np
+from scipy.interpolate import interp1d
+from scipy.optimize import brentq
+from sklearn.metrics import roc_curve, f1_score, precision_score, accuracy_score
+
+
+def evaluate2(y_pred, y_true):
+    fpr, tpr, threshold = roc_curve(y_true, y_pred, pos_label=1)
+    fnr = 1 - tpr
+    eer1 = fpr[np.nanargmin(np.absolute((fnr - fpr)))]
+    eer2 = fnr[np.nanargmin(np.absolute((fnr - fpr)))]
+    eer3 = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
+
+    fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=1)
+
+    thresholds = np.arange(-1, 1, 0.01)
+    best_index = np.argmax([f1_score(y_true, y_pred > t) for t in thresholds])
+    t = thresholds[best_index]
+    f1 = f1_score(y_true, y_pred > t)
+    precision = precision_score(y_true, y_pred > t)
+    # roc = roc_auc_score(y_true, y_pred > t)
+    acc = accuracy_score(y_true, y_pred > t)
+    # recall = recall_score(y_true, y_pred > t)
+
+    assert abs(eer1 - eer2) <= 1e-2
+    assert abs(eer2 - eer3) <= 1e-2
+    return f1, precision, acc, eer1
 
 
 def evaluate(sims, labels):
diff --git a/test.py b/test.py
index 5a31a51..fe75d72 100644
--- a/test.py
+++ b/test.py
@@ -7,9 +7,9 @@
 from audio import Audio
 from batcher import LazyTripletBatcher
 from constants import NUM_FBANKS, NUM_FRAMES, BATCH_SIZE
-from eval_metrics import evaluate
+from eval_metrics import evaluate2
 from models import ResCNNModel, select_model_class
-from utils import enable_deterministic, score_fusion, embedding_fusion
+from utils import score_fusion, embedding_fusion
 
 logger = logging.getLogger(__name__)
 
@@ -29,30 +29,30 @@ def batch_cosine_similarity(x1, x2):
     return s
 
 
-def eval_models(working_dir: str, models: List[ResCNNModel]):
+def eval_models(working_dir: str, models: List[ResCNNModel], eval=evaluate2):
     if isinstance(models, list) and len(models) > 1:  # multiple models -> fusion of results.
         y_pred_score_fusion = score_fusion(*[run_speaker_verification_task(working_dir, m) for m in models])
         y_pred_emb_fusion = run_speaker_verification_task(working_dir, models)
         assert y_pred_score_fusion.shape == y_pred_emb_fusion.shape
         y_true = np.zeros_like(y_pred_score_fusion)  # positive is at index 0.
         y_true[:, 0] = 1.0
-        fm_1, tpr_1, acc_1, eer_1 = evaluate(y_pred_score_fusion, y_true)
-        fm_2, tpr_2, acc_2, eer_2 = evaluate(y_pred_emb_fusion, y_true)
+        fm_1, tpr_1, acc_1, eer_1 = eval(y_pred_score_fusion, y_true)
+        fm_2, tpr_2, acc_2, eer_2 = eval(y_pred_emb_fusion, y_true)
         logger.info(f'[score fusion] f-measure = {fm_1:.5f}, true positive rate = {tpr_1:.5f}, '
-                    f'accuracy = {acc_1:.5f}, equal error rate = {eer_1:.5f}')
+                    f'accuracy = {acc_1:.3f}, equal error rate = {eer_1:.3f}')
         logger.info(f'[emb fusion] f-measure = {fm_2:.5f}, true positive rate = {tpr_2:.5f}, '
-                    f'accuracy = {acc_2:.5f}, equal error rate = {eer_2:.5f}')
+                    f'accuracy = {acc_2:.3f}, equal error rate = {eer_2:.3f}')
     else:
         y_pred = run_speaker_verification_task(working_dir, models)
         y_true = np.zeros_like(y_pred)  # positive is at index 0.
         y_true[:, 0] = 1.0
-        fm, tpr, acc, eer = evaluate(y_pred, y_true)
+        fm, tpr, acc, eer = eval(y_pred, y_true)
         logger.info(f'[single] f-measure = {fm:.5f}, true positive rate = {tpr:.5f}, '
                     f'accuracy = {acc:.5f}, equal error rate = {eer:.5f}')
 
 
 def run_speaker_verification_task(working_dir, model):
-    enable_deterministic()
+    seed = 123
     embeddings_fusion_cond = isinstance(model, list)
     if embeddings_fusion_cond:
         assert len(model) == 2
@@ -63,7 +63,7 @@ def run_speaker_verification_task(working_dir, model):
     y_pred = np.zeros(shape=(num_speakers, num_negative_speakers + 1))  # negatives + positive
     for i, positive_speaker in tqdm(enumerate(audio.speaker_ids), desc='test', total=num_speakers):
         # convention id[0] is anchor speaker, id[1] is positive, id[2:] are negative.
-        input_data = batcher.get_speaker_verification_data(positive_speaker, num_negative_speakers)
+        input_data = batcher.get_speaker_verification_data(positive_speaker, num_negative_speakers, seed=i * seed)
         # batch size is not relevant. just making sure we don't push too much on the GPU.
         if embeddings_fusion_cond:
             predictions = embedding_fusion(model[0].m.predict(input_data, batch_size=BATCH_SIZE),
diff --git a/utils.py b/utils.py
index 8d7aa68..b28c230 100644
--- a/utils.py
+++ b/utils.py
@@ -90,10 +90,9 @@ def delete_older_checkpoints(checkpoint_dir, max_to_keep=5):
             os.remove(checkpoint)
 
 
-def enable_deterministic():
-    print('Deterministic mode enabled.')
-    np.random.seed(123)
-    random.seed(123)
+def enable_deterministic(seed=123):
+    np.random.seed(seed)
+    random.seed(seed)
 
 
 def load_pickle(file):
@@ -129,6 +128,7 @@ def embedding_fusion(embeddings_1: np.array, embeddings_2: np.array):
     assert np.all(abs(np.sum(fusion ** 2, axis=1) - 1) < 1e-6)
     return fusion
 
+
 def score_fusion(scores_1: np.array, scores_2: np.array):
     def normalize_scores(m, epsilon=1e-12):
         return (m - np.mean(m)) / max(np.std(m), epsilon)
@@ -138,5 +138,4 @@ def normalize_scores(m, epsilon=1e-12):
 
 
 if __name__ == '__main__':
-    score_fusion(np.ones((5, 100)), np.ones((5, 100)))
-
+    score_fusion(np.random.uniform(low=-1, high=1, size=(5, 100)), np.random.uniform(low=-1, high=1, size=(5, 100)))

From 2c260d7f1b7ab1bbe1757a33a7f30c3c67594108 Mon Sep 17 00:00:00 2001
From: Philippe Remy <premy.enseirb@gmail.com>
Date: Thu, 30 Apr 2020 12:24:55 +0900
Subject: [PATCH 4/9] pass seed to MFCC window sampling in batcher

---
 batcher.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/batcher.py b/batcher.py
index 145856a..bb56431 100644
--- a/batcher.py
+++ b/batcher.py
@@ -1,8 +1,8 @@
 import json
 import logging
 import os
+import random
 from collections import deque, Counter
-from random import choice
 from time import time
 
 import dill
@@ -12,7 +12,7 @@
 from audio import pad_mfcc, Audio
 from constants import NUM_FRAMES, NUM_FBANKS
 from models import DeepSpeakerModel
-from utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt, enable_deterministic
+from utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt
 
 logger = logging.getLogger(__name__)
 
@@ -21,18 +21,19 @@ def extract_speaker(utt_file):
     return utt_file.split('/')[-1].split('_')[0]
 
 
-def sample_from_mfcc(mfcc, max_length):
+def sample_from_mfcc(mfcc, max_length, seed=None):
     if mfcc.shape[0] >= max_length:
-        r = choice(range(0, len(mfcc) - max_length + 1))
+        random.seed(seed)
+        r = random.choice(range(0, len(mfcc) - max_length + 1))
         s = mfcc[r:r + max_length]
     else:
         s = pad_mfcc(mfcc, max_length)
     return np.expand_dims(s, axis=-1)
 
 
-def sample_from_mfcc_file(utterance_file, max_length):
+def sample_from_mfcc_file(utterance_file, max_length, seed):
     mfcc = np.load(utterance_file)
-    return sample_from_mfcc(mfcc, max_length)
+    return sample_from_mfcc(mfcc, max_length, seed)
 
 
 class KerasFormatConverter:
@@ -340,9 +341,9 @@ def get_speaker_verification_data(self, anchor_speaker, num_different_speakers,
             [extract_speaker(s) for s in anc_pos[1, :]]))
 
         batch_x = np.vstack([
-            [sample_from_mfcc_file(u, self.max_length) for u in anchor_utterances],
-            [sample_from_mfcc_file(u, self.max_length) for u in positive_utterances],
-            [sample_from_mfcc_file(u, self.max_length) for u in negative_utterances]
+            [sample_from_mfcc_file(u, self.max_length, seed) for u in anchor_utterances],
+            [sample_from_mfcc_file(u, self.max_length, seed) for u in positive_utterances],
+            [sample_from_mfcc_file(u, self.max_length, seed) for u in negative_utterances]
         ])
 
         batch_y = np.zeros(shape=(len(batch_x), 1))  # dummy. sparse softmax needs something.

From 0c18c272b544efbee7c524d41236e438fcf3a2f6 Mon Sep 17 00:00:00 2001
From: Philippe Remy <premy.enseirb@gmail.com>
Date: Thu, 30 Apr 2020 13:05:06 +0900
Subject: [PATCH 5/9] default seed to None, drop duplicate roc_curve, use evaluate in eval_models

---
 batcher.py      | 2 +-
 eval_metrics.py | 2 --
 test.py         | 4 ++--
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/batcher.py b/batcher.py
index bb56431..9a8ad1c 100644
--- a/batcher.py
+++ b/batcher.py
@@ -31,7 +31,7 @@ def sample_from_mfcc(mfcc, max_length, seed=None):
     return np.expand_dims(s, axis=-1)
 
 
-def sample_from_mfcc_file(utterance_file, max_length, seed):
+def sample_from_mfcc_file(utterance_file, max_length, seed=None):
     mfcc = np.load(utterance_file)
     return sample_from_mfcc(mfcc, max_length, seed)
 
diff --git a/eval_metrics.py b/eval_metrics.py
index 7e42f6d..cdbc9b6 100644
--- a/eval_metrics.py
+++ b/eval_metrics.py
@@ -11,8 +11,6 @@ def evaluate2(y_pred, y_true):
     eer2 = fnr[np.nanargmin(np.absolute((fnr - fpr)))]
     eer3 = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
 
-    fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=1)
-
     thresholds = np.arange(-1, 1, 0.01)
     best_index = np.argmax([f1_score(y_true, y_pred > t) for t in thresholds])
     t = thresholds[best_index]
diff --git a/test.py b/test.py
index fe75d72..f31d90e 100644
--- a/test.py
+++ b/test.py
@@ -7,7 +7,7 @@
 from audio import Audio
 from batcher import LazyTripletBatcher
 from constants import NUM_FBANKS, NUM_FRAMES, BATCH_SIZE
-from eval_metrics import evaluate2
+from eval_metrics import evaluate2, evaluate
 from models import ResCNNModel, select_model_class
 from utils import score_fusion, embedding_fusion
 
@@ -29,7 +29,7 @@ def batch_cosine_similarity(x1, x2):
     return s
 
 
-def eval_models(working_dir: str, models: List[ResCNNModel], eval=evaluate2):
+def eval_models(working_dir: str, models: List[ResCNNModel], eval=evaluate):
     if isinstance(models, list) and len(models) > 1:  # multiple models -> fusion of results.
         y_pred_score_fusion = score_fusion(*[run_speaker_verification_task(working_dir, m) for m in models])
         y_pred_emb_fusion = run_speaker_verification_task(working_dir, models)

From 3779765bc37883ef4cf576648bc6dd6789d0aa8a Mon Sep 17 00:00:00 2001
From: Philippe Remy <premy.enseirb@gmail.com>
Date: Thu, 30 Apr 2020 13:41:49 +0900
Subject: [PATCH 6/9] log fusion accuracy and EER with 5 digits

---
 test.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test.py b/test.py
index f31d90e..aa5a09a 100644
--- a/test.py
+++ b/test.py
@@ -7,7 +7,7 @@
 from audio import Audio
 from batcher import LazyTripletBatcher
 from constants import NUM_FBANKS, NUM_FRAMES, BATCH_SIZE
-from eval_metrics import evaluate2, evaluate
+from eval_metrics import evaluate
 from models import ResCNNModel, select_model_class
 from utils import score_fusion, embedding_fusion
 
@@ -39,9 +39,9 @@ def eval_models(working_dir: str, models: List[ResCNNModel], eval=evaluate):
         fm_1, tpr_1, acc_1, eer_1 = eval(y_pred_score_fusion, y_true)
         fm_2, tpr_2, acc_2, eer_2 = eval(y_pred_emb_fusion, y_true)
         logger.info(f'[score fusion] f-measure = {fm_1:.5f}, true positive rate = {tpr_1:.5f}, '
-                    f'accuracy = {acc_1:.3f}, equal error rate = {eer_1:.3f}')
+                    f'accuracy = {acc_1:.5f}, equal error rate = {eer_1:.5f}')
         logger.info(f'[emb fusion] f-measure = {fm_2:.5f}, true positive rate = {tpr_2:.5f}, '
-                    f'accuracy = {acc_2:.3f}, equal error rate = {eer_2:.3f}')
+                    f'accuracy = {acc_2:.5f}, equal error rate = {eer_2:.5f}')
     else:
         y_pred = run_speaker_verification_task(working_dir, models)
         y_true = np.zeros_like(y_pred)  # positive is at index 0.

From 7001e93bed0e5a1990468dc0661df6b976f3a048 Mon Sep 17 00:00:00 2001
From: Philippe Remy <premy.enseirb@gmail.com>
Date: Thu, 30 Apr 2020 14:14:34 +0900
Subject: [PATCH 7/9] return only batch_x and assert verification data is reproducible

---
 batcher.py | 3 +--
 test.py    | 2 ++
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/batcher.py b/batcher.py
index 9a8ad1c..b8ee77f 100644
--- a/batcher.py
+++ b/batcher.py
@@ -346,8 +346,7 @@ def get_speaker_verification_data(self, anchor_speaker, num_different_speakers,
             [sample_from_mfcc_file(u, self.max_length, seed) for u in negative_utterances]
         ])
 
-        batch_y = np.zeros(shape=(len(batch_x), 1))  # dummy. sparse softmax needs something.
-        return batch_x, batch_y
+        return batch_x
 
 
 class TripletBatcher:
diff --git a/test.py b/test.py
index aa5a09a..dd2846c 100644
--- a/test.py
+++ b/test.py
@@ -64,6 +64,8 @@ def run_speaker_verification_task(working_dir, model):
     for i, positive_speaker in tqdm(enumerate(audio.speaker_ids), desc='test', total=num_speakers):
         # convention id[0] is anchor speaker, id[1] is positive, id[2:] are negative.
         input_data = batcher.get_speaker_verification_data(positive_speaker, num_negative_speakers, seed=i * seed)
+        input_data_2 = batcher.get_speaker_verification_data(positive_speaker, num_negative_speakers, seed=i * seed)
+        np.testing.assert_array_equal(input_data, input_data_2)
         # batch size is not relevant. just making sure we don't push too much on the GPU.
         if embeddings_fusion_cond:
             predictions = embedding_fusion(model[0].m.predict(input_data, batch_size=BATCH_SIZE),

From 74723319e0a1dac32b8d29d729501f65318a8de1 Mon Sep 17 00:00:00 2001
From: Philippe Remy <premy.enseirb@gmail.com>
Date: Thu, 30 Apr 2020 15:03:37 +0900
Subject: [PATCH 8/9] small refactoring

---
 train.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/train.py b/train.py
index 209ec4f..fb181c9 100644
--- a/train.py
+++ b/train.py
@@ -21,9 +21,12 @@ def fit_model(dsm: DeepSpeakerModel, working_dir: str, max_length: int = NUM_FRA
               batch_size: int = BATCH_SIZE, initial_epoch: int = 0):
     batcher = LazyTripletBatcher(working_dir, max_length, dsm)
 
+    steps_per_epoch = 2000  # arbitrary.
     # build small test set.
+    steps_train_test_ratio = 5
+
     test_batches = []
-    for _ in tqdm(range(200), desc='Build test set'):
+    for _ in tqdm(range(steps_per_epoch // steps_train_test_ratio), desc='Build test set'):
         test_batches.append(batcher.get_batch_test(batch_size))
 
     def test_generator():
@@ -33,13 +36,13 @@ def test_generator():
 
     def train_generator():
         while True:
-            yield batcher.get_random_batch(batch_size, is_test=False)
+            yield batcher.get_batch_train(batch_size)
 
     checkpoint_name = dsm.m.name + '_checkpoint'
     checkpoint_filename = os.path.join(CHECKPOINTS_TRIPLET_DIR, checkpoint_name + '_{epoch}.h5')
     checkpoint = ModelCheckpoint(monitor='val_loss', filepath=checkpoint_filename, save_best_only=True)
     reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-4, verbose=1)
-    dsm.m.fit(x=train_generator(), y=None, steps_per_epoch=2000, shuffle=False,
+    dsm.m.fit(x=train_generator(), y=None, steps_per_epoch=steps_per_epoch, shuffle=False,
               epochs=1000, validation_data=test_generator(), validation_steps=len(test_batches),
               callbacks=[reduce_lr, checkpoint], initial_epoch=initial_epoch)
 
@@ -119,4 +122,3 @@ def start_training(working_dir, model_name, pre_training_phase=True):
         dsm.m.summary()
         dsm.m.compile(optimizer=SGD(), loss=deep_speaker_loss)
         fit_model(dsm, working_dir, max_length=NUM_FRAMES, initial_epoch=initial_epoch)
-

From 41d21364672b57d87da5d79f1706d0e121c370cf Mon Sep 17 00:00:00 2001
From: Philippe Remy <premy.enseirb@gmail.com>
Date: Thu, 30 Apr 2020 15:28:57 +0900
Subject: [PATCH 9/9] big refactoring

---
 audio.py        | 72 +++++++++++++++++++------------------------------
 batcher.py      | 13 +++++++--
 cli.py          | 17 +++++-------
 eval_metrics.py |  1 +
 example.py      |  5 ++--
 models.py       | 46 ++++++++++++++++---------------
 test.py         |  3 +--
 triplet_loss.py |  6 ++---
 utils.py        | 43 -----------------------------
 9 files changed, 77 insertions(+), 129 deletions(-)

diff --git a/audio.py b/audio.py
index e301362..f9d4800 100644
--- a/audio.py
+++ b/audio.py
@@ -14,23 +14,42 @@
 logger = logging.getLogger(__name__)
 
 
+def pad_mfcc(mfcc: np.array, max_length: int):
+    # Pad the MFCC with zeros. With max_length = 160 (default settings), less than 1.6s of speech requires padding.
+    if len(mfcc) < max_length:
+        mfcc = np.vstack((mfcc, np.tile(np.zeros(mfcc.shape[1]), (max_length - len(mfcc), 1))))
+    return mfcc
+
+
+def mfcc_fbank(signal: np.array, sample_rate: int):  # 1D signal array.
+    assert len(signal.shape) == 1
+    # Returns normalized filter-bank features with shape (num_frames, n_filters); the delta stacking below is disabled.
+    filter_banks, energies = fbank(signal, samplerate=sample_rate, nfilt=NUM_FBANKS)
+    frames_features = normalize_mfcc_frames(filter_banks)
+    # delta_1 = delta(filter_banks, N=1)
+    # delta_2 = delta(delta_1, N=1)
+    # frames_features = np.transpose(np.stack([filter_banks, delta_1, delta_2]), (1, 2, 0))
+    return np.array(frames_features, dtype=np.float32)  # Float32 precision is enough here.
+
+
+def normalize_mfcc_frames(m: np.array, epsilon=1e-12):
+    return [(v - np.mean(v)) / max(np.std(v), epsilon) for v in m]
+
+
 def read_mfcc(input_filename, sample_rate):
     audio = Audio.read(input_filename, sample_rate)
+    # TODO: replace this percentile-based energy threshold with a better VAD.
     energy = np.abs(audio)
     silence_threshold = np.percentile(energy, 95)
     offsets = np.where(energy > silence_threshold)[0]
-    # left_blank_duration_ms = (1000.0 * offsets[0]) // self.sample_rate  # frame_id to duration (ms)
-    # right_blank_duration_ms = (1000.0 * (len(audio) - offsets[-1])) // self.sample_rate
-    # TODO: could use trim_silence() here or a better VAD.
     audio_voice_only = audio[offsets[0]:offsets[-1]]
     mfcc = mfcc_fbank(audio_voice_only, sample_rate)
     return mfcc
 
 
-def extract_speaker_and_utterance_ids(filename: str):  # LIBRI.
+def extract_speaker_and_utterance_ids(libri_filename: str):  # LIBRI.
     # 'audio/dev-other/116/288045/116-288045-0000.flac'
-    speaker, _, basename = Path(filename).parts[-3:]
-    filename.split('-')
+    speaker, _, basename = Path(libri_filename).parts[-3:]
     utterance = os.path.splitext(basename.split('-', 1)[-1])[0]
     assert basename.split('-')[0] == speaker
     return speaker, utterance
@@ -54,23 +73,6 @@ def __init__(self, cache_dir: str, audio_dir: str = None, sample_rate: int = SAM
     def speaker_ids(self):
         return sorted(self.speakers_to_utterances)
 
-    @staticmethod
-    def trim_silence(audio, threshold):
-        """Removes silence at the beginning and end of a sample."""
-        energy = librosa.feature.rms(audio)
-        frames = np.nonzero(np.array(energy > threshold))
-        indices = librosa.core.frames_to_samples(frames)[1]
-
-        # Note: indices can be an empty array, if the whole audio was silence.
-        audio_trim = audio[0:0]
-        left_blank = audio[0:0]
-        right_blank = audio[0:0]
-        if indices.size:
-            audio_trim = audio[indices[0]:indices[-1]]
-            left_blank = audio[:indices[0]]  # slice before.
-            right_blank = audio[indices[-1]:]  # slice after.
-        return audio_trim, left_blank, right_blank
-
     @staticmethod
     def read(filename, sample_rate=SAMPLE_RATE):
         audio, sr = librosa.load(filename, sr=sample_rate, mono=True, dtype=np.float32)
@@ -78,8 +80,8 @@ def read(filename, sample_rate=SAMPLE_RATE):
         return audio
 
     def build_cache(self, audio_dir, sample_rate):
-        logger.info(f'audio_dir: {audio_dir}.')
-        logger.info(f'sample_rate: {sample_rate:,} hz.')
+        logger.info(f'Audio directory : {audio_dir}.')
+        logger.info(f'Sample rate     : {sample_rate:,} hz.')
         audio_files = find_files(audio_dir, ext=self.ext)
         audio_files_count = len(audio_files)
         assert audio_files_count != 0, f'Could not find any {self.ext} files in {audio_dir}.'
@@ -98,23 +100,3 @@ def cache_audio_file(self, input_filename, sample_rate):
                 np.save(cache_filename, mfcc)
             except librosa.util.exceptions.ParameterError as e:
                 logger.error(e)
-
-
-def pad_mfcc(mfcc, max_length):  # num_frames, nfilt=64.
-    if len(mfcc) < max_length:
-        mfcc = np.vstack((mfcc, np.tile(np.zeros(mfcc.shape[1]), (max_length - len(mfcc), 1))))
-    return mfcc
-
-
-def mfcc_fbank(signal: np.array, sample_rate: int):  # 1D signal array.
-    # Returns MFCC with shape (num_frames, n_filters, 3).
-    filter_banks, energies = fbank(signal, samplerate=sample_rate, nfilt=NUM_FBANKS)
-    frames_features = normalize_frames(filter_banks)
-    # delta_1 = delta(filter_banks, N=1)
-    # delta_2 = delta(delta_1, N=1)
-    # frames_features = np.transpose(np.stack([filter_banks, delta_1, delta_2]), (1, 2, 0))
-    return np.array(frames_features, dtype=np.float32)  # Float32 precision is enough here.
-
-
-def normalize_frames(m, epsilon=1e-12):
-    return [(v - np.mean(v)) / max(np.std(v), epsilon) for v in m]
diff --git a/batcher.py b/batcher.py
index b8ee77f..3121203 100644
--- a/batcher.py
+++ b/batcher.py
@@ -10,9 +10,9 @@
 from tqdm import tqdm
 
 from audio import pad_mfcc, Audio
-from constants import NUM_FRAMES, NUM_FBANKS
+from constants import NUM_FRAMES, NUM_FBANKS, TRAIN_TEST_RATIO
 from models import DeepSpeakerModel
-from utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt
+from utils import ensures_dir, load_pickle, load_npy
 
 logger = logging.getLogger(__name__)
 
@@ -36,6 +36,15 @@ def sample_from_mfcc_file(utterance_file, max_length, seed=None):
     return sample_from_mfcc(mfcc, max_length, seed)
 
 
+def train_test_sp_to_utt(audio, is_test):
+    sp_to_utt = {}
+    for speaker_id, utterances in audio.speakers_to_utterances.items():
+        utterances_files = sorted(utterances.values())
+        train_test_sep = int(len(utterances_files) * TRAIN_TEST_RATIO)
+        sp_to_utt[speaker_id] = utterances_files[train_test_sep:] if is_test else utterances_files[:train_test_sep]
+    return sp_to_utt
+
+
 class KerasFormatConverter:
 
     def __init__(self, working_dir, load_test_only=False):
diff --git a/cli.py b/cli.py
index ceb7b4c..2f84842 100644
--- a/cli.py
+++ b/cli.py
@@ -18,14 +18,12 @@
 
 logger = logging.getLogger(__name__)
 
-VERSION = '3.0b'
+VERSION = '4.0a'
 
 
 @click.group()
 def cli():
-    logging.basicConfig(format='%(asctime)12s - %(levelname)s - %(message)s',
-                        level=logging.INFO,
-                        stream=sys.stdout)
+    logging.basicConfig(format='%(asctime)12s - %(levelname)s - %(message)s', level=logging.INFO, stream=sys.stdout)
     init_pandas()
 
 
@@ -38,7 +36,7 @@ def version():
 @click.option('--working_dir', required=True, type=Ct.output_dir())
 @click.option('--audio_dir', default=None)
 @click.option('--sample_rate', default=SAMPLE_RATE, show_default=True, type=int)
-def build_audio_cache(working_dir, audio_dir, sample_rate):
+def build_audio_cache(working_dir: str, audio_dir: str, sample_rate: int):
     ensures_dir(working_dir)
     if audio_dir is None:
         audio_dir = os.path.join(working_dir, 'LibriSpeech')
@@ -48,7 +46,7 @@ def build_audio_cache(working_dir, audio_dir, sample_rate):
 @cli.command('build-keras-inputs', short_help='Build inputs to Keras.')
 @click.option('--working_dir', required=True, type=Ct.input_dir())
 @click.option('--counts_per_speaker', default='600,100', show_default=True, type=str)  # train,test
-def build_keras_inputs(working_dir, counts_per_speaker):
+def build_keras_inputs(working_dir: str, counts_per_speaker: str):
     counts_per_speaker = [int(b) for b in counts_per_speaker.split(',')]
     kc = KerasFormatConverter(working_dir)
     kc.generate(max_length=NUM_FRAMES, counts_per_speaker=counts_per_speaker)
@@ -59,7 +57,7 @@ def build_keras_inputs(working_dir, counts_per_speaker):
 @click.option('--working_dir', required=True, type=Ct.input_dir())
 @click.option('--model_name', multiple=True, type=click.Choice([RES_CNN_NAME, GRU_NAME]))
 @click.option('--checkpoint_file', multiple=True, required=True, type=Ct.input_file())
-def test_model(working_dir, model_name, checkpoint_file):
+def test_model(working_dir: str, model_name: tuple, checkpoint_file: tuple):
     # export CUDA_VISIBLE_DEVICES=0; python cli.py test-model
     # --working_dir /home/philippe/ds-test/triplet-training/
     # --checkpoint_file ../ds-test/checkpoints-softmax/ResCNN_checkpoint_102.h5
@@ -77,11 +75,10 @@ def test_model(working_dir, model_name, checkpoint_file):
 @click.option('--working_dir', required=True, type=Ct.input_dir())
 @click.option('--model_name', required=True, type=click.Choice([RES_CNN_NAME, GRU_NAME]))
 @click.option('--pre_training_phase/--no_pre_training_phase', default=False, show_default=True)
-def train_model(working_dir, model_name, pre_training_phase):
+def train_model(working_dir: str, model_name: str, pre_training_phase: bool):
     # PRE TRAINING
     # LibriSpeech train-clean-data360 (600, 100). 0.991 on test set (enough for pre-training).
-
-    # TRIPLET TRAINING
+    # TRIPLET TRAINING with ResCNN.
     # [...]
     # Epoch 175/1000
     # 2000/2000 [==============================] - 919s 459ms/step - loss: 0.0077 - val_loss: 0.0058
diff --git a/eval_metrics.py b/eval_metrics.py
index cdbc9b6..6f3df02 100644
--- a/eval_metrics.py
+++ b/eval_metrics.py
@@ -5,6 +5,7 @@
 
 
 def evaluate2(y_pred, y_true):
+    # TODO: still not perfect.
     fpr, tpr, threshold = roc_curve(y_true, y_pred, pos_label=1)
     fnr = 1 - tpr
     eer1 = fpr[np.nanargmin(np.absolute((fnr - fpr)))]
diff --git a/example.py b/example.py
index 41391db..226dbb4 100644
--- a/example.py
+++ b/example.py
@@ -8,8 +8,9 @@
 from models import ResCNNModel
 from test import batch_cosine_similarity
 
-np.random.seed(123)
-random.seed(123)
+seed = 123
+np.random.seed(seed)
+random.seed(seed)
 
 model = ResCNNModel()
 model.m.load_weights('/Users/premy/deep-speaker/checkpoints/ResCNN_triplet_training_checkpoint_175.h5', by_name=True)
diff --git a/models.py b/models.py
index 03103cf..7954516 100644
--- a/models.py
+++ b/models.py
@@ -1,6 +1,7 @@
 import abc
 import logging
 
+import numpy as np
 import tensorflow.keras.backend as K
 from tensorflow.keras import layers
 from tensorflow.keras import regularizers
@@ -30,6 +31,24 @@ def select_model_class(name: str):
         raise Exception(f'Unknown model name: {name}.')
 
 
+def embedding_fusion(embeddings_1: np.array, embeddings_2: np.array):
+    assert len(embeddings_1.shape) == 2  # (batch_size, 512).
+    assert embeddings_1.shape == embeddings_2.shape
+    embeddings_sum = embeddings_1 + embeddings_2
+    fusion = embeddings_sum / np.linalg.norm(embeddings_sum, ord=2, axis=1, keepdims=True)
+    assert np.all((-1 <= fusion) & (fusion <= 1))
+    assert np.all(abs(np.sum(fusion ** 2, axis=1) - 1) < 1e-6)
+    return fusion
+
+
+def score_fusion(scores_1: np.array, scores_2: np.array):
+    def normalize_scores(m, epsilon=1e-12):
+        return (m - np.mean(m)) / max(np.std(m), epsilon)
+
+    # score has to be between -1 and 1.
+    return np.tanh(np.sum(normalize_scores(np.stack((scores_1, scores_2), axis=2)), axis=2))
+
+
 class DeepSpeakerModel:
 
     def __init__(self,
@@ -68,26 +87,11 @@ def graph_with_avg_softmax_and_ln(self, inputs):
             x = Lambda(lambda y: K.l2_normalize(y, axis=1), name='ln')(x)
         return x
 
-    def keras_model(self):
-        return self.m
-
-    def get_weights(self):
-        w = self.m.get_weights()
-        if self.include_softmax:
-            w.pop()  # last 2 are the W_softmax and b_softmax.
-            w.pop()
-        return w
-
     def clipped_relu(self, inputs):
         relu = Lambda(lambda y: K.minimum(K.maximum(y, 0), 20), name=f'clipped_relu_{self.clipped_relu_count}')(inputs)
         self.clipped_relu_count += 1
         return relu
 
-    def set_weights(self, w):
-        for layer, layer_w in zip(self.m.layers, w):
-            layer.set_weights(layer_w)
-            logger.info(f'Setting weights for [{layer.name}]...')
-
 
 class ResCNNModel(DeepSpeakerModel):
 
@@ -137,7 +141,6 @@ def identity_block(self, input_tensor, kernel_size, filters, stage, block):
 
     def conv_and_res_block(self, inp, filters, stage):
         conv_name = 'conv{}-s'.format(filters)
-        # TODO: why kernel_regularizer?
         o = Conv2D(filters,
                    kernel_size=5,
                    strides=2,
@@ -163,8 +166,8 @@ def __init__(self,
     def graph(self, inputs):
         x = Conv2D(64, kernel_size=5, strides=2, padding='same', kernel_initializer='glorot_uniform',
                    name='conv1', kernel_regularizer=regularizers.l2(l=0.0001))(inputs)
-        # shape = (BATCH_SIZE , num_frames/2, 64/2, 64)
-        x = BatchNormalization(name='bn1')(x) # does it work with BN?
+        # shape = (batch_size, num_frames / 2, 64 / 2 = 32, 64)
+        x = BatchNormalization(name='bn1')(x)
         x = self.clipped_relu(x)
 
         # 4d -> 3d.
@@ -172,13 +175,12 @@ def graph(self, inputs):
         x = Reshape((frames_dim, fbank_dim * conv_output_dim))(x)
         x = Reshape((frames_dim, fbank_dim * conv_output_dim))(x)
 
-        # shape = (BATCH_SIZE , num_frames/2, 1024)
+        # shape = (batch_size, num_frames / 2, 1024)
         x = GRU(1024, name='GRU1', return_sequences=True)(x)
-        if self.include_softmax:
+        if self.include_softmax:  # to prevent overfitting during pre-training.
             x = Dropout(0.2)(x)
         x = GRU(1024, name='GRU2', return_sequences=True)(x)
-        if self.include_softmax:
+        if self.include_softmax:  # to prevent overfitting during pre-training.
             x = Dropout(0.2)(x)
         x = GRU(1024, name='GRU3', return_sequences=True)(x)
         return x
-
diff --git a/test.py b/test.py
index dd2846c..58ed214 100644
--- a/test.py
+++ b/test.py
@@ -8,8 +8,7 @@
 from batcher import LazyTripletBatcher
 from constants import NUM_FBANKS, NUM_FRAMES, BATCH_SIZE
 from eval_metrics import evaluate
-from models import ResCNNModel, select_model_class
-from utils import score_fusion, embedding_fusion
+from models import ResCNNModel, select_model_class, embedding_fusion, score_fusion
 
 logger = logging.getLogger(__name__)
 
diff --git a/triplet_loss.py b/triplet_loss.py
index 4179560..f2dd35f 100644
--- a/triplet_loss.py
+++ b/triplet_loss.py
@@ -1,6 +1,6 @@
 import keras.backend as K
 
-ALPHA = 0.1  # used in Deep Speaker.
+ALPHA = 0.1
 
 
 def batch_cosine_similarity(x1, x2):
@@ -13,8 +13,8 @@ def batch_cosine_similarity(x1, x2):
 
 def deep_speaker_loss(y_true, y_pred, alpha=ALPHA):
     # y_true is not used. we respect this convention:
-    # y_true.shape = (batch_size, embedding_size) [not used]
-    # y_pred.shape = (batch_size, embedding_size)
+    # y_true.shape = (batch_size * 3, embedding_size) [not used]
+    # y_pred.shape = (batch_size * 3, embedding_size)
     # EXAMPLE:
     # _____________________________________________________
     # ANCHOR 1 (512,)
diff --git a/utils.py b/utils.py
index b28c230..7a08323 100644
--- a/utils.py
+++ b/utils.py
@@ -1,6 +1,5 @@
 import logging
 import os
-import random
 import shutil
 from glob import glob
 
@@ -10,8 +9,6 @@
 import pandas as pd
 from natsort import natsorted
 
-from constants import TRAIN_TEST_RATIO
-
 logger = logging.getLogger(__name__)
 
 
@@ -32,10 +29,6 @@ def create_new_empty_dir(directory: str):
     os.makedirs(directory)
 
 
-def ensure_dir_for_filename(filename: str):
-    ensures_dir(os.path.dirname(filename))
-
-
 def ensures_dir(directory: str):
     if len(directory) > 0 and not os.path.exists(directory):
         os.makedirs(directory)
@@ -90,11 +83,6 @@ def delete_older_checkpoints(checkpoint_dir, max_to_keep=5):
             os.remove(checkpoint)
 
 
-def enable_deterministic(seed=123):
-    np.random.seed(seed)
-    random.seed(seed)
-
-
 def load_pickle(file):
     if not os.path.exists(file):
         return None
@@ -108,34 +96,3 @@ def load_npy(file):
         return None
     logger.info(f'Loading NPY file: {file}.')
     return np.load(file)
-
-
-def train_test_sp_to_utt(audio, is_test):
-    sp_to_utt = {}
-    for speaker_id, utterances in audio.speakers_to_utterances.items():
-        utterances_files = sorted(utterances.values())
-        train_test_sep = int(len(utterances_files) * TRAIN_TEST_RATIO)
-        sp_to_utt[speaker_id] = utterances_files[train_test_sep:] if is_test else utterances_files[:train_test_sep]
-    return sp_to_utt
-
-
-def embedding_fusion(embeddings_1: np.array, embeddings_2: np.array):
-    assert len(embeddings_1.shape) == 2  # (batch_size, 512).
-    assert embeddings_1.shape == embeddings_2.shape
-    embeddings_sum = embeddings_1 + embeddings_2
-    fusion = embeddings_sum / np.linalg.norm(embeddings_sum, ord=2, axis=1, keepdims=True)
-    assert np.all((-1 <= fusion) & (fusion <= 1))
-    assert np.all(abs(np.sum(fusion ** 2, axis=1) - 1) < 1e-6)
-    return fusion
-
-
-def score_fusion(scores_1: np.array, scores_2: np.array):
-    def normalize_scores(m, epsilon=1e-12):
-        return (m - np.mean(m)) / max(np.std(m), epsilon)
-
-    # score has to be between -1 and 1.
-    return np.tanh(np.sum(normalize_scores(np.stack((scores_1, scores_2), axis=2)), axis=2))
-
-
-if __name__ == '__main__':
-    score_fusion(np.random.uniform(low=-1, high=1, size=(5, 100)), np.random.uniform(low=-1, high=1, size=(5, 100)))