From f1e0600d581a0756c80c8974522e87950dc63506 Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Wed, 29 Apr 2020 17:14:07 +0900 Subject: [PATCH 1/9] fusion of two models --- batcher.py | 5 ++-- cli.py | 10 +++++-- eval_metrics.py | 5 ---- test.py | 77 ++++++++++++++++++++++++++++++++++--------------- train.py | 18 ++++++++---- utils.py | 14 +++++++++ 6 files changed, 90 insertions(+), 39 deletions(-) diff --git a/batcher.py b/batcher.py index 78fe129..c932ffc 100644 --- a/batcher.py +++ b/batcher.py @@ -147,8 +147,9 @@ def __init__(self, working_dir: str, max_length: int, model: DeepSpeakerModel): self.history_model_inputs = None self.batch_count = 0 - for _ in tqdm(range(self.history_length), desc='Initializing the batcher'): # init history. - self.update_triplets_history() + if self.model is not None: + for _ in tqdm(range(self.history_length), desc='Initializing the batcher'): # init history. + self.update_triplets_history() def update_triplets_history(self): model_inputs = [] diff --git a/cli.py b/cli.py index e0a1a4b..ceb7b4c 100644 --- a/cli.py +++ b/cli.py @@ -3,6 +3,7 @@ import logging import os +import sys import click @@ -22,7 +23,9 @@ @click.group() def cli(): - logging.basicConfig(format='%(asctime)12s - %(levelname)s - %(message)s', level=logging.INFO) + logging.basicConfig(format='%(asctime)12s - %(levelname)s - %(message)s', + level=logging.INFO, + stream=sys.stdout) init_pandas() @@ -54,8 +57,8 @@ def build_keras_inputs(working_dir, counts_per_speaker): @cli.command('test-model', short_help='Test a Keras model.') @click.option('--working_dir', required=True, type=Ct.input_dir()) -@click.option('--model_name', required=True, type=click.Choice([RES_CNN_NAME, GRU_NAME])) -@click.option('--checkpoint_file', required=True, type=Ct.input_file()) +@click.option('--model_name', multiple=True, type=click.Choice([RES_CNN_NAME, GRU_NAME])) +@click.option('--checkpoint_file', multiple=True, required=True, type=Ct.input_file()) def test_model(working_dir, model_name, checkpoint_file): # export CUDA_VISIBLE_DEVICES=0; python cli.py test-model # --working_dir /home/philippe/ds-test/triplet-training/ @@ -66,6 +69,7 @@ def test_model(working_dir, model_name, checkpoint_file): # --working_dir /home/philippe/ds-test/triplet-training/ # --checkpoint_file ../ds-test/checkpoints-triplets/ResCNN_checkpoint_175.h5 # f-measure = 0.849, true positive rate = 0.798, accuracy = 0.997, equal error rate = 0.025 + assert len(model_name) == len(checkpoint_file) test(working_dir, model_name, checkpoint_file) diff --git a/eval_metrics.py b/eval_metrics.py index f9ed8f2..c244b20 100644 --- a/eval_metrics.py +++ b/eval_metrics.py @@ -10,7 +10,6 @@ def evaluate(sims, labels): def calculate_roc(thresholds, sims, labels): - nrof_pairs = min(len(labels), len(sims)) nrof_thresholds = len(thresholds) tprs = np.zeros((nrof_thresholds)) @@ -18,10 +17,6 @@ def calculate_roc(thresholds, sims, labels): acc_train = np.zeros((nrof_thresholds)) precisions = np.zeros((nrof_thresholds)) fms = np.zeros((nrof_thresholds)) - accuracy = 0.0 - - indices = np.arange(nrof_pairs) - # Find the best threshold for the fold for threshold_idx, threshold in enumerate(thresholds): diff --git a/test.py b/test.py index 3e01af0..a2692d2 100644 --- a/test.py +++ b/test.py @@ -1,4 +1,5 @@ import logging +from typing import List import numpy as np from tqdm import tqdm @@ -6,12 +7,15 @@ from audio import Audio from batcher import LazyTripletBatcher from constants import NUM_FBANKS, NUM_FRAMES, BATCH_SIZE -from models import ResCNNModel, select_model_class from eval_metrics import evaluate -from utils import enable_deterministic +from models import ResCNNModel, select_model_class +from utils import enable_deterministic, score_fusion, embedding_fusion logger = logging.getLogger(__name__) +EMBEDDING_FUSION = 0 +SCORE_FUSION = 1 + def batch_cosine_similarity(x1, x2): # https://en.wikipedia.org/wiki/Cosine_similarity @@ -25,38 +29,63 @@ def batch_cosine_similarity(x1, x2): return s -def eval_model(working_dir: str, model: ResCNNModel): +def eval_models(working_dir: str, models: List[ResCNNModel]): + if len(models) > 1: # multiple models -> fusion of results. + y_pred_score_fusion = score_fusion(*[run_speaker_verification_task(working_dir, m) for m in models]) + y_pred_emb_fusion = run_speaker_verification_task(working_dir, models) + assert y_pred_score_fusion.shape == y_pred_emb_fusion.shape + y_true = np.zeros_like(y_pred_score_fusion) # positive is at index 0. + y_true[:, 0] = 1.0 + fm_1, tpr_1, acc_1, eer_1 = evaluate(y_pred_score_fusion, y_true) + fm_2, tpr_2, acc_2, eer_2 = evaluate(y_pred_emb_fusion, y_true) + logger.info(f'[score fusion] f-measure = {fm_1:.3f}, true positive rate = {tpr_1:.3f}, ' + f'accuracy = {acc_1:.3f}, equal error rate = {eer_1:.3f}') + logger.info(f'[emb fusion] f-measure = {fm_2:.3f}, true positive rate = {tpr_2:.3f}, ' + f'accuracy = {acc_2:.3f}, equal error rate = {eer_2:.3f}') + else: + y_pred = run_speaker_verification_task(working_dir, models) + y_true = np.zeros_like(y_pred) # positive is at index 0. + y_true[:, 0] = 1.0 + fm, tpr, acc, eer = evaluate(y_pred, y_true) + logger.info(f'f-measure = {fm:.3f}, true positive rate = {tpr:.3f}, ' + f'accuracy = {acc:.3f}, equal error rate = {eer:.3f}') + + +def run_speaker_verification_task(working_dir, model): enable_deterministic() + embeddings_fusion_cond = isinstance(model, list) + if embeddings_fusion_cond: + assert len(model) == 2 audio = Audio(working_dir) - batcher = LazyTripletBatcher(working_dir, NUM_FRAMES, model) - speakers_list = list(audio.speakers_to_utterances.keys()) + batcher = LazyTripletBatcher(working_dir, NUM_FRAMES, model=None) num_negative_speakers = 99 - num_speakers = len(speakers_list) + num_speakers = len(audio.speaker_ids) y_pred = np.zeros(shape=(num_speakers, num_negative_speakers + 1)) # negatives + positive - for i, positive_speaker in tqdm(enumerate(speakers_list), desc='test', total=num_speakers): + for i, positive_speaker in tqdm(enumerate(audio.speaker_ids), desc='test', total=num_speakers): # convention id[0] is anchor speaker, id[1] is positive, id[2:] are negative. input_data = batcher.get_speaker_verification_data(positive_speaker, num_negative_speakers) # batch size is not relevant. just making sure we don't push too much on the GPU. - predictions = model.m.predict(input_data, batch_size=BATCH_SIZE) + if embeddings_fusion_cond: + predictions = embedding_fusion(model[0].m.predict(input_data, batch_size=BATCH_SIZE), + model[1].m.predict(input_data, batch_size=BATCH_SIZE)) + else: + predictions = model.m.predict(input_data, batch_size=BATCH_SIZE) anchor_embedding = predictions[0] for j, other_than_anchor_embedding in enumerate(predictions[1:]): # positive + negatives y_pred[i][j] = batch_cosine_similarity([anchor_embedding], [other_than_anchor_embedding])[0] - y_true = np.zeros_like(y_pred) # positive is at index 0. - y_true[:, 0] = 1.0 - fm, tpr, acc, eer = evaluate(y_pred, y_true) - return fm, tpr, acc, eer + return y_pred -def test(working_dir, model_name, checkpoint_file): +def test(working_dir, model_names: tuple, checkpoint_files: tuple): batch_input_shape = [None, NUM_FRAMES, NUM_FBANKS, 1] - dsm = select_model_class(model_name)(batch_input_shape) - if checkpoint_file is not None: - logger.info(f'Found checkpoint [{checkpoint_file}]. Loading weights...') - dsm.m.load_weights(checkpoint_file, by_name=True) - else: - logger.info(f'Could not find any checkpoint in {checkpoint_file}.') - exit(1) - - fm, tpr, acc, eer = eval_model(working_dir, model=dsm) - logger.info(f'f-measure = {fm:.3f}, true positive rate = {tpr:.3f}, ' - f'accuracy = {acc:.3f}, equal error rate = {eer:.3f}') + models = [] + for checkpoint_file, model_name in zip(checkpoint_files, model_names): + dsm = select_model_class(model_name)(batch_input_shape) + if checkpoint_file is not None: + logger.info(f'Found checkpoint [{checkpoint_file}] for [{model_name}]. Loading weights...') + dsm.m.load_weights(checkpoint_file, by_name=True) + else: + logger.info(f'Could not find any checkpoint in {checkpoint_file}.') + exit(1) + models.append(dsm) + eval_models(working_dir, models) diff --git a/train.py b/train.py index c9f2695..50202b1 100644 --- a/train.py +++ b/train.py @@ -7,7 +7,7 @@ from batcher import KerasFormatConverter, LazyTripletBatcher from constants import BATCH_SIZE, CHECKPOINTS_SOFTMAX_DIR, CHECKPOINTS_TRIPLET_DIR, NUM_FRAMES, NUM_FBANKS -from models import ResCNNModel, DeepSpeakerModel, select_model_class, RES_CNN_NAME +from models import DeepSpeakerModel, select_model_class, RES_CNN_NAME from triplet_loss import deep_speaker_loss from utils import load_best_checkpoint, ensures_dir @@ -17,7 +17,8 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' -def fit_model(dsm: DeepSpeakerModel, working_dir: str, max_length: int = NUM_FRAMES, batch_size=BATCH_SIZE): +def fit_model(dsm: DeepSpeakerModel, working_dir: str, max_length: int = NUM_FRAMES, + batch_size: int = BATCH_SIZE, initial_epoch: int = 0): batcher = LazyTripletBatcher(working_dir, max_length, dsm) # build small test set. @@ -37,9 +38,10 @@ def train_generator(): checkpoint_name = dsm.m.name + '_checkpoint' checkpoint_filename = os.path.join(CHECKPOINTS_TRIPLET_DIR, checkpoint_name + '_{epoch}.h5') checkpoint = ModelCheckpoint(monitor='val_loss', filepath=checkpoint_filename, save_best_only=True) + reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-4, verbose=1) dsm.m.fit(x=train_generator(), y=None, steps_per_epoch=2000, shuffle=False, epochs=1000, validation_data=test_generator(), validation_steps=len(test_batches), - callbacks=[checkpoint]) + callbacks=[reduce_lr, checkpoint], initial_epoch=initial_epoch) def fit_model_softmax(dsm: DeepSpeakerModel, kx_train, ky_train, kx_test, ky_test, @@ -83,7 +85,7 @@ def start_training(working_dir, model_name, pre_training_phase=True): num_speakers_softmax = len(kc.categorical_speakers.speaker_ids) dsm = model_class(batch_input_shape, include_softmax=True, num_speakers_softmax=num_speakers_softmax) # ResCNN can train with default Adam LR of 0.001. GRU is more sensitive. - lr = 0.001 if model_name == RES_CNN_NAME else 0.00005 + lr = 0.001 if model_name == RES_CNN_NAME else 0.00003 logger.info(f'Initial learning rate set to {lr}.') dsm.m.compile(optimizer=Adam(learning_rate=lr), loss='sparse_categorical_crossentropy', metrics=['accuracy']) pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR) @@ -102,12 +104,18 @@ def start_training(working_dir, model_name, pre_training_phase=True): pre_training_checkpoint = load_best_checkpoint(CHECKPOINTS_SOFTMAX_DIR) if triplet_checkpoint is not None: logger.info(f'Loading triplet checkpoint: {triplet_checkpoint}.') + initial_epoch = int(triplet_checkpoint.split('/')[-1].split('.')[0].split('_')[-1]) dsm.m.load_weights(triplet_checkpoint) elif pre_training_checkpoint is not None: logger.info(f'Loading pre-training checkpoint: {pre_training_checkpoint}.') # If `by_name` is True, weights are loaded into layers only if they share the # same name. This is useful for fine-tuning or transfer-learning models where # some of the layers have changed. + initial_epoch = 0 dsm.m.load_weights(pre_training_checkpoint, by_name=True) + else: + initial_epoch = 0 + dsm.m.summary() + # TODO: should replace by SGD(learning_rate=0.05, momentum=0.99) dsm.m.compile(optimizer=SGD(), loss=deep_speaker_loss) - fit_model(dsm, working_dir, NUM_FRAMES) + fit_model(dsm, working_dir, max_length=NUM_FRAMES, initial_epoch=initial_epoch) diff --git a/utils.py b/utils.py index 6eb35e1..b55b3d9 100644 --- a/utils.py +++ b/utils.py @@ -118,3 +118,17 @@ def train_test_sp_to_utt(audio, is_test): train_test_sep = int(len(utterances_files) * TRAIN_TEST_RATIO) sp_to_utt[speaker_id] = utterances_files[train_test_sep:] if is_test else utterances_files[:train_test_sep] return sp_to_utt + + +def embedding_fusion(embeddings_1: np.array, embeddings_2: np.array): + assert len(embeddings_1.shape) == 2 # (batch_size, 512). + assert embeddings_1.shape == embeddings_2.shape + fusion = np.linalg.norm(embeddings_1 + embeddings_2, ord=2, axis=1) + return fusion + + +def score_fusion(scores_1: np.array, scores_2: np.array): + def normalize_scores(m, epsilon=1e-12): + return (m - np.mean(m)) / max(np.std(m), epsilon) + + return normalize_scores(scores_1) + normalize_scores(scores_2) From aa6cfcdae07c86e3c78b4e4f7e68c0a47da7ceec Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Thu, 30 Apr 2020 11:19:07 +0900 Subject: [PATCH 2/9] better fusion --- test.py | 17 ++++++++++------- utils.py | 14 +++++++++++--- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/test.py b/test.py index a2692d2..5a31a51 100644 --- a/test.py +++ b/test.py @@ -30,7 +30,7 @@ def batch_cosine_similarity(x1, x2): def eval_models(working_dir: str, models: List[ResCNNModel]): - if len(models) > 1: # multiple models -> fusion of results. + if isinstance(models, list) and len(models) > 1: # multiple models -> fusion of results. y_pred_score_fusion = score_fusion(*[run_speaker_verification_task(working_dir, m) for m in models]) y_pred_emb_fusion = run_speaker_verification_task(working_dir, models) assert y_pred_score_fusion.shape == y_pred_emb_fusion.shape @@ -38,17 +38,17 @@ def eval_models(working_dir: str, models: List[ResCNNModel]): y_true[:, 0] = 1.0 fm_1, tpr_1, acc_1, eer_1 = evaluate(y_pred_score_fusion, y_true) fm_2, tpr_2, acc_2, eer_2 = evaluate(y_pred_emb_fusion, y_true) - logger.info(f'[score fusion] f-measure = {fm_1:.3f}, true positive rate = {tpr_1:.3f}, ' - f'accuracy = {acc_1:.3f}, equal error rate = {eer_1:.3f}') - logger.info(f'[emb fusion] f-measure = {fm_2:.3f}, true positive rate = {tpr_2:.3f}, ' - f'accuracy = {acc_2:.3f}, equal error rate = {eer_2:.3f}') + logger.info(f'[score fusion] f-measure = {fm_1:.5f}, true positive rate = {tpr_1:.5f}, ' + f'accuracy = {acc_1:.5f}, equal error rate = {eer_1:.5f}') + logger.info(f'[emb fusion] f-measure = {fm_2:.5f}, true positive rate = {tpr_2:.5f}, ' + f'accuracy = {acc_2:.5f}, equal error rate = {eer_2:.5f}') else: y_pred = run_speaker_verification_task(working_dir, models) y_true = np.zeros_like(y_pred) # positive is at index 0. y_true[:, 0] = 1.0 fm, tpr, acc, eer = evaluate(y_pred, y_true) - logger.info(f'f-measure = {fm:.3f}, true positive rate = {tpr:.3f}, ' - f'accuracy = {acc:.3f}, equal error rate = {eer:.3f}') + logger.info(f'[single] f-measure = {fm:.5f}, true positive rate = {tpr:.5f}, ' + f'accuracy = {acc:.5f}, equal error rate = {eer:.5f}') def run_speaker_verification_task(working_dir, model): @@ -89,3 +89,6 @@ def test(working_dir, model_names: tuple, checkpoint_files: tuple): exit(1) models.append(dsm) eval_models(working_dir, models) + if len(models) > 1: + for model in models: + eval_models(working_dir, model) diff --git a/utils.py b/utils.py index b55b3d9..8d7aa68 100644 --- a/utils.py +++ b/utils.py @@ -123,12 +123,20 @@ def train_test_sp_to_utt(audio, is_test): def embedding_fusion(embeddings_1: np.array, embeddings_2: np.array): assert len(embeddings_1.shape) == 2 # (batch_size, 512). assert embeddings_1.shape == embeddings_2.shape - fusion = np.linalg.norm(embeddings_1 + embeddings_2, ord=2, axis=1) + embeddings_sum = embeddings_1 + embeddings_2 + fusion = embeddings_sum / np.linalg.norm(embeddings_sum, ord=2, axis=1, keepdims=True) + assert np.all((-1 <= fusion) & (fusion <= 1)) + assert np.all(abs(np.sum(fusion ** 2, axis=1) - 1) < 1e-6) return fusion - def score_fusion(scores_1: np.array, scores_2: np.array): def normalize_scores(m, epsilon=1e-12): return (m - np.mean(m)) / max(np.std(m), epsilon) - return normalize_scores(scores_1) + normalize_scores(scores_2) + # score has to be between -1 and 1. + return np.tanh(np.sum(normalize_scores(np.stack((scores_1, scores_2), axis=2)), axis=2)) + + +if __name__ == '__main__': + score_fusion(np.ones((5, 100)), np.ones((5, 100))) + From c535362af41ca238602a7c2ea1a2f8fd5daf035b Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Thu, 30 Apr 2020 12:21:58 +0900 Subject: [PATCH 3/9] fix test+utils --- batcher.py | 9 ++++++--- eval_metrics.py | 26 ++++++++++++++++++++++++++ test.py | 20 ++++++++++---------- utils.py | 11 +++++------ 4 files changed, 47 insertions(+), 19 deletions(-) diff --git a/batcher.py b/batcher.py index c932ffc..145856a 100644 --- a/batcher.py +++ b/batcher.py @@ -12,7 +12,7 @@ from audio import pad_mfcc, Audio from constants import NUM_FRAMES, NUM_FBANKS from models import DeepSpeakerModel -from utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt +from utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt, enable_deterministic logger = logging.getLogger(__name__) @@ -317,14 +317,17 @@ def get_batch_train(self, batch_size): return batch_x, batch_y - def get_speaker_verification_data(self, anchor_speaker, num_different_speakers): - speakers = list(self.audio.speakers_to_utterances.keys()) + def get_speaker_verification_data(self, anchor_speaker, num_different_speakers, seed=123): + speakers = list(self.audio.speaker_ids) anchor_utterances = [] positive_utterances = [] negative_utterances = [] + np.random.seed(seed) negative_speakers = np.random.choice(list(set(speakers) - {anchor_speaker}), size=num_different_speakers) assert [negative_speaker != anchor_speaker for negative_speaker in negative_speakers] + np.random.seed(seed) pos_utterances = np.random.choice(self.sp_to_utt_test[anchor_speaker], 2, replace=False) + np.random.seed(seed) neg_utterances = [np.random.choice(self.sp_to_utt_test[neg], 1, replace=True)[0] for neg in negative_speakers] anchor_utterances.append(pos_utterances[0]) positive_utterances.append(pos_utterances[1]) diff --git a/eval_metrics.py b/eval_metrics.py index c244b20..7e42f6d 100644 --- a/eval_metrics.py +++ b/eval_metrics.py @@ -1,4 +1,30 @@ import numpy as np +from scipy.interpolate import interp1d +from scipy.optimize import brentq +from sklearn.metrics import roc_curve, f1_score, precision_score, accuracy_score + + +def evaluate2(y_pred, y_true): + fpr, tpr, threshold = roc_curve(y_true, y_pred, pos_label=1) + fnr = 1 - tpr + eer1 = fpr[np.nanargmin(np.absolute((fnr - fpr)))] + eer2 = fnr[np.nanargmin(np.absolute((fnr - fpr)))] + eer3 = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.) + + fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=1) + + thresholds = np.arange(-1, 1, 0.01) + best_index = np.argmax([f1_score(y_true, y_pred > t) for t in thresholds]) + t = thresholds[best_index] + f1 = f1_score(y_true, y_pred > t) + precision = precision_score(y_true, y_pred > t) + # roc = roc_auc_score(y_true, y_pred > t) + acc = accuracy_score(y_true, y_pred > t) + # recall = recall_score(y_true, y_pred > t) + + assert abs(eer1 - eer2) <= 1e-2 + assert abs(eer2 - eer3) <= 1e-2 + return f1, precision, acc, eer1 def evaluate(sims, labels): diff --git a/test.py b/test.py index 5a31a51..fe75d72 100644 --- a/test.py +++ b/test.py @@ -7,9 +7,9 @@ from audio import Audio from batcher import LazyTripletBatcher from constants import NUM_FBANKS, NUM_FRAMES, BATCH_SIZE -from eval_metrics import evaluate +from eval_metrics import evaluate2 from models import ResCNNModel, select_model_class -from utils import enable_deterministic, score_fusion, embedding_fusion +from utils import score_fusion, embedding_fusion logger = logging.getLogger(__name__) @@ -29,30 +29,30 @@ def batch_cosine_similarity(x1, x2): return s -def eval_models(working_dir: str, models: List[ResCNNModel]): +def eval_models(working_dir: str, models: List[ResCNNModel], eval=evaluate2): if isinstance(models, list) and len(models) > 1: # multiple models -> fusion of results. y_pred_score_fusion = score_fusion(*[run_speaker_verification_task(working_dir, m) for m in models]) y_pred_emb_fusion = run_speaker_verification_task(working_dir, models) assert y_pred_score_fusion.shape == y_pred_emb_fusion.shape y_true = np.zeros_like(y_pred_score_fusion) # positive is at index 0. y_true[:, 0] = 1.0 - fm_1, tpr_1, acc_1, eer_1 = evaluate(y_pred_score_fusion, y_true) - fm_2, tpr_2, acc_2, eer_2 = evaluate(y_pred_emb_fusion, y_true) + fm_1, tpr_1, acc_1, eer_1 = eval(y_pred_score_fusion, y_true) + fm_2, tpr_2, acc_2, eer_2 = eval(y_pred_emb_fusion, y_true) logger.info(f'[score fusion] f-measure = {fm_1:.5f}, true positive rate = {tpr_1:.5f}, ' - f'accuracy = {acc_1:.5f}, equal error rate = {eer_1:.5f}') + f'accuracy = {acc_1:.3f}, equal error rate = {eer_1:.3f}') logger.info(f'[emb fusion] f-measure = {fm_2:.5f}, true positive rate = {tpr_2:.5f}, ' - f'accuracy = {acc_2:.5f}, equal error rate = {eer_2:.5f}') + f'accuracy = {acc_2:.3f}, equal error rate = {eer_2:.3f}') else: y_pred = run_speaker_verification_task(working_dir, models) y_true = np.zeros_like(y_pred) # positive is at index 0. y_true[:, 0] = 1.0 - fm, tpr, acc, eer = evaluate(y_pred, y_true) + fm, tpr, acc, eer = eval(y_pred, y_true) logger.info(f'[single] f-measure = {fm:.5f}, true positive rate = {tpr:.5f}, ' f'accuracy = {acc:.5f}, equal error rate = {eer:.5f}') def run_speaker_verification_task(working_dir, model): - enable_deterministic() + seed = 123 embeddings_fusion_cond = isinstance(model, list) if embeddings_fusion_cond: assert len(model) == 2 @@ -63,7 +63,7 @@ def run_speaker_verification_task(working_dir, model): y_pred = np.zeros(shape=(num_speakers, num_negative_speakers + 1)) # negatives + positive for i, positive_speaker in tqdm(enumerate(audio.speaker_ids), desc='test', total=num_speakers): # convention id[0] is anchor speaker, id[1] is positive, id[2:] are negative. - input_data = batcher.get_speaker_verification_data(positive_speaker, num_negative_speakers) + input_data = batcher.get_speaker_verification_data(positive_speaker, num_negative_speakers, seed=i * seed) # batch size is not relevant. just making sure we don't push too much on the GPU. if embeddings_fusion_cond: predictions = embedding_fusion(model[0].m.predict(input_data, batch_size=BATCH_SIZE), diff --git a/utils.py b/utils.py index 8d7aa68..b28c230 100644 --- a/utils.py +++ b/utils.py @@ -90,10 +90,9 @@ def delete_older_checkpoints(checkpoint_dir, max_to_keep=5): os.remove(checkpoint) -def enable_deterministic(): - print('Deterministic mode enabled.') - np.random.seed(123) - random.seed(123) +def enable_deterministic(seed=123): + np.random.seed(seed) + random.seed(seed) def load_pickle(file): @@ -129,6 +128,7 @@ def embedding_fusion(embeddings_1: np.array, embeddings_2: np.array): assert np.all(abs(np.sum(fusion ** 2, axis=1) - 1) < 1e-6) return fusion + def score_fusion(scores_1: np.array, scores_2: np.array): def normalize_scores(m, epsilon=1e-12): return (m - np.mean(m)) / max(np.std(m), epsilon) @@ -138,5 +138,4 @@ def normalize_scores(m, epsilon=1e-12): if __name__ == '__main__': - score_fusion(np.ones((5, 100)), np.ones((5, 100))) - + score_fusion(np.random.uniform(low=-1, high=1, size=(5, 100)), np.random.uniform(low=-1, high=1, size=(5, 100))) From 2c260d7f1b7ab1bbe1757a33a7f30c3c67594108 Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Thu, 30 Apr 2020 12:24:55 +0900 Subject: [PATCH 4/9] seed --- batcher.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/batcher.py b/batcher.py index 145856a..bb56431 100644 --- a/batcher.py +++ b/batcher.py @@ -1,8 +1,8 @@ import json import logging import os +import random from collections import deque, Counter -from random import choice from time import time import dill @@ -12,7 +12,7 @@ from audio import pad_mfcc, Audio from constants import NUM_FRAMES, NUM_FBANKS from models import DeepSpeakerModel -from utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt, enable_deterministic +from utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt logger = logging.getLogger(__name__) @@ -21,18 +21,19 @@ def extract_speaker(utt_file): return utt_file.split('/')[-1].split('_')[0] -def sample_from_mfcc(mfcc, max_length): +def sample_from_mfcc(mfcc, max_length, seed=None): if mfcc.shape[0] >= max_length: - r = choice(range(0, len(mfcc) - max_length + 1)) + random.seed(seed) + r = random.choice(range(0, len(mfcc) - max_length + 1)) s = mfcc[r:r + max_length] else: s = pad_mfcc(mfcc, max_length) return np.expand_dims(s, axis=-1) -def sample_from_mfcc_file(utterance_file, max_length): +def sample_from_mfcc_file(utterance_file, max_length, seed): mfcc = np.load(utterance_file) - return sample_from_mfcc(mfcc, max_length) + return sample_from_mfcc(mfcc, max_length, seed) class KerasFormatConverter: @@ -340,9 +341,9 @@ def get_speaker_verification_data(self, anchor_speaker, num_different_speakers, [extract_speaker(s) for s in anc_pos[1, :]])) batch_x = np.vstack([ - [sample_from_mfcc_file(u, self.max_length) for u in anchor_utterances], - [sample_from_mfcc_file(u, self.max_length) for u in positive_utterances], - [sample_from_mfcc_file(u, self.max_length) for u in negative_utterances] + [sample_from_mfcc_file(u, self.max_length, seed) for u in anchor_utterances], + [sample_from_mfcc_file(u, self.max_length, seed) for u in positive_utterances], + [sample_from_mfcc_file(u, self.max_length, seed) for u in negative_utterances] ]) batch_y = np.zeros(shape=(len(batch_x), 1)) # dummy. sparse softmax needs something. From 0c18c272b544efbee7c524d41236e438fcf3a2f6 Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Thu, 30 Apr 2020 13:05:06 +0900 Subject: [PATCH 5/9] seed --- batcher.py | 2 +- eval_metrics.py | 2 -- test.py | 4 ++-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/batcher.py b/batcher.py index bb56431..9a8ad1c 100644 --- a/batcher.py +++ b/batcher.py @@ -31,7 +31,7 @@ def sample_from_mfcc(mfcc, max_length, seed=None): return np.expand_dims(s, axis=-1) -def sample_from_mfcc_file(utterance_file, max_length, seed): +def sample_from_mfcc_file(utterance_file, max_length, seed=None): mfcc = np.load(utterance_file) return sample_from_mfcc(mfcc, max_length, seed) diff --git a/eval_metrics.py b/eval_metrics.py index 7e42f6d..cdbc9b6 100644 --- a/eval_metrics.py +++ b/eval_metrics.py @@ -11,8 +11,6 @@ def evaluate2(y_pred, y_true): eer2 = fnr[np.nanargmin(np.absolute((fnr - fpr)))] eer3 = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.) - fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=1) - thresholds = np.arange(-1, 1, 0.01) best_index = np.argmax([f1_score(y_true, y_pred > t) for t in thresholds]) t = thresholds[best_index] diff --git a/test.py b/test.py index fe75d72..f31d90e 100644 --- a/test.py +++ b/test.py @@ -7,7 +7,7 @@ from audio import Audio from batcher import LazyTripletBatcher from constants import NUM_FBANKS, NUM_FRAMES, BATCH_SIZE -from eval_metrics import evaluate2 +from eval_metrics import evaluate2, evaluate from models import ResCNNModel, select_model_class from utils import score_fusion, embedding_fusion @@ -29,7 +29,7 @@ def batch_cosine_similarity(x1, x2): return s -def eval_models(working_dir: str, models: List[ResCNNModel], eval=evaluate2): +def eval_models(working_dir: str, models: List[ResCNNModel], eval=evaluate): if isinstance(models, list) and len(models) > 1: # multiple models -> fusion of results. y_pred_score_fusion = score_fusion(*[run_speaker_verification_task(working_dir, m) for m in models]) y_pred_emb_fusion = run_speaker_verification_task(working_dir, models) From 3779765bc37883ef4cf576648bc6dd6789d0aa8a Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Thu, 30 Apr 2020 13:41:49 +0900 Subject: [PATCH 6/9] 5 digits --- test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test.py b/test.py index f31d90e..aa5a09a 100644 --- a/test.py +++ b/test.py @@ -7,7 +7,7 @@ from audio import Audio from batcher import LazyTripletBatcher from constants import NUM_FBANKS, NUM_FRAMES, BATCH_SIZE -from eval_metrics import evaluate2, evaluate +from eval_metrics import evaluate from models import ResCNNModel, select_model_class from utils import score_fusion, embedding_fusion @@ -39,9 +39,9 @@ def eval_models(working_dir: str, models: List[ResCNNModel], eval=evaluate): fm_1, tpr_1, acc_1, eer_1 = eval(y_pred_score_fusion, y_true) fm_2, tpr_2, acc_2, eer_2 = eval(y_pred_emb_fusion, y_true) logger.info(f'[score fusion] f-measure = {fm_1:.5f}, true positive rate = {tpr_1:.5f}, ' - f'accuracy = {acc_1:.3f}, equal error rate = {eer_1:.3f}') + f'accuracy = {acc_1:.5f}, equal error rate = {eer_1:.5f}') logger.info(f'[emb fusion] f-measure = {fm_2:.5f}, true positive rate = {tpr_2:.5f}, ' - f'accuracy = {acc_2:.3f}, equal error rate = {eer_2:.3f}') + f'accuracy = {acc_2:.5f}, equal error rate = {eer_2:.5f}') else: y_pred = run_speaker_verification_task(working_dir, models) y_true = np.zeros_like(y_pred) # positive is at index 0. From 7001e93bed0e5a1990468dc0661df6b976f3a048 Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Thu, 30 Apr 2020 14:14:34 +0900 Subject: [PATCH 7/9] adjustment and more testing --- batcher.py | 3 +-- test.py | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/batcher.py b/batcher.py index 9a8ad1c..b8ee77f 100644 --- a/batcher.py +++ b/batcher.py @@ -346,8 +346,7 @@ def get_speaker_verification_data(self, anchor_speaker, num_different_speakers, [sample_from_mfcc_file(u, self.max_length, seed) for u in negative_utterances] ]) - batch_y = np.zeros(shape=(len(batch_x), 1)) # dummy. sparse softmax needs something. - return batch_x, batch_y + return batch_x class TripletBatcher: diff --git a/test.py b/test.py index aa5a09a..dd2846c 100644 --- a/test.py +++ b/test.py @@ -64,6 +64,8 @@ def run_speaker_verification_task(working_dir, model): for i, positive_speaker in tqdm(enumerate(audio.speaker_ids), desc='test', total=num_speakers): # convention id[0] is anchor speaker, id[1] is positive, id[2:] are negative. input_data = batcher.get_speaker_verification_data(positive_speaker, num_negative_speakers, seed=i * seed) + input_data_2 = batcher.get_speaker_verification_data(positive_speaker, num_negative_speakers, seed=i * seed) + np.testing.assert_array_equal(input_data, input_data_2) # batch size is not relevant. just making sure we don't push too much on the GPU. if embeddings_fusion_cond: predictions = embedding_fusion(model[0].m.predict(input_data, batch_size=BATCH_SIZE), From 74723319e0a1dac32b8d29d729501f65318a8de1 Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Thu, 30 Apr 2020 15:03:37 +0900 Subject: [PATCH 8/9] small refactoring --- train.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/train.py b/train.py index 209ec4f..fb181c9 100644 --- a/train.py +++ b/train.py @@ -21,9 +21,12 @@ def fit_model(dsm: DeepSpeakerModel, working_dir: str, max_length: int = NUM_FRA batch_size: int = BATCH_SIZE, initial_epoch: int = 0): batcher = LazyTripletBatcher(working_dir, max_length, dsm) + steps_per_epoch = 2000 # arbitrary. # build small test set. + steps_train_test_ratio = 5 + test_batches = [] - for _ in tqdm(range(200), desc='Build test set'): + for _ in tqdm(range(steps_per_epoch // steps_train_test_ratio), desc='Build test set'): test_batches.append(batcher.get_batch_test(batch_size)) def test_generator(): @@ -33,13 +36,13 @@ def test_generator(): def train_generator(): while True: - yield batcher.get_random_batch(batch_size, is_test=False) + yield batcher.get_batch_train(batch_size) checkpoint_name = dsm.m.name + '_checkpoint' checkpoint_filename = os.path.join(CHECKPOINTS_TRIPLET_DIR, checkpoint_name + '_{epoch}.h5') checkpoint = ModelCheckpoint(monitor='val_loss', filepath=checkpoint_filename, save_best_only=True) reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-4, verbose=1) - dsm.m.fit(x=train_generator(), y=None, steps_per_epoch=2000, shuffle=False, + dsm.m.fit(x=train_generator(), y=None, steps_per_epoch=steps_per_epoch, shuffle=False, epochs=1000, validation_data=test_generator(), validation_steps=len(test_batches), callbacks=[reduce_lr, checkpoint], initial_epoch=initial_epoch) @@ -119,4 +122,3 @@ def start_training(working_dir, model_name, pre_training_phase=True): dsm.m.summary() dsm.m.compile(optimizer=SGD(), loss=deep_speaker_loss) fit_model(dsm, working_dir, max_length=NUM_FRAMES, initial_epoch=initial_epoch) - From 41d21364672b57d87da5d79f1706d0e121c370cf Mon Sep 17 00:00:00 2001 From: Philippe Remy Date: Thu, 30 Apr 2020 15:28:57 +0900 Subject: [PATCH 9/9] big refactoring --- audio.py | 72 +++++++++++++++++++------------------------------ batcher.py | 13 +++++++-- cli.py | 17 +++++------- eval_metrics.py | 1 + example.py | 5 ++-- models.py | 46 ++++++++++++++++--------------- test.py | 3 +-- triplet_loss.py | 6 ++--- utils.py | 43 ----------------------------- 9 files changed, 77 insertions(+), 129 deletions(-) diff --git a/audio.py b/audio.py index e301362..f9d4800 100644 --- a/audio.py +++ b/audio.py @@ -14,23 +14,42 @@ logger = logging.getLogger(__name__) +def pad_mfcc(mfcc: np.array, max_length: int): + # pad MFCC with 0.0. if max_length = 160 (default settings), then less than 1.6s of speech will require padding. + if len(mfcc) < max_length: + mfcc = np.vstack((mfcc, np.tile(np.zeros(mfcc.shape[1]), (max_length - len(mfcc), 1)))) + return mfcc + + +def mfcc_fbank(signal: np.array, sample_rate: int): # 1D signal array. + assert len(signal.shape) == 1 + # Returns MFCC with shape (num_frames, n_filters, 3). + filter_banks, energies = fbank(signal, samplerate=sample_rate, nfilt=NUM_FBANKS) + frames_features = normalize_mfcc_frames(filter_banks) + # delta_1 = delta(filter_banks, N=1) + # delta_2 = delta(delta_1, N=1) + # frames_features = np.transpose(np.stack([filter_banks, delta_1, delta_2]), (1, 2, 0)) + return np.array(frames_features, dtype=np.float32) # Float32 precision is enough here. + + +def normalize_mfcc_frames(m: np.array, epsilon=1e-12): + return [(v - np.mean(v)) / max(np.std(v), epsilon) for v in m] + + def read_mfcc(input_filename, sample_rate): audio = Audio.read(input_filename, sample_rate) + # TODO: could use trim_silence() here or a better VAD. energy = np.abs(audio) silence_threshold = np.percentile(energy, 95) offsets = np.where(energy > silence_threshold)[0] - # left_blank_duration_ms = (1000.0 * offsets[0]) // self.sample_rate # frame_id to duration (ms) - # right_blank_duration_ms = (1000.0 * (len(audio) - offsets[-1])) // self.sample_rate - # TODO: could use trim_silence() here or a better VAD. audio_voice_only = audio[offsets[0]:offsets[-1]] mfcc = mfcc_fbank(audio_voice_only, sample_rate) return mfcc -def extract_speaker_and_utterance_ids(filename: str): # LIBRI. +def extract_speaker_and_utterance_ids(libri_filename: str): # LIBRI. # 'audio/dev-other/116/288045/116-288045-0000.flac' - speaker, _, basename = Path(filename).parts[-3:] - filename.split('-') + speaker, _, basename = Path(libri_filename).parts[-3:] utterance = os.path.splitext(basename.split('-', 1)[-1])[0] assert basename.split('-')[0] == speaker return speaker, utterance @@ -54,23 +73,6 @@ def __init__(self, cache_dir: str, audio_dir: str = None, sample_rate: int = SAM def speaker_ids(self): return sorted(self.speakers_to_utterances) - @staticmethod - def trim_silence(audio, threshold): - """Removes silence at the beginning and end of a sample.""" - energy = librosa.feature.rms(audio) - frames = np.nonzero(np.array(energy > threshold)) - indices = librosa.core.frames_to_samples(frames)[1] - - # Note: indices can be an empty array, if the whole audio was silence. - audio_trim = audio[0:0] - left_blank = audio[0:0] - right_blank = audio[0:0] - if indices.size: - audio_trim = audio[indices[0]:indices[-1]] - left_blank = audio[:indices[0]] # slice before. - right_blank = audio[indices[-1]:] # slice after. - return audio_trim, left_blank, right_blank - @staticmethod def read(filename, sample_rate=SAMPLE_RATE): audio, sr = librosa.load(filename, sr=sample_rate, mono=True, dtype=np.float32) @@ -78,8 +80,8 @@ def read(filename, sample_rate=SAMPLE_RATE): return audio def build_cache(self, audio_dir, sample_rate): - logger.info(f'audio_dir: {audio_dir}.') - logger.info(f'sample_rate: {sample_rate:,} hz.') + logger.info(f'Audio directory : {audio_dir}.') + logger.info(f'Sample rate : {sample_rate:,} hz.') audio_files = find_files(audio_dir, ext=self.ext) audio_files_count = len(audio_files) assert audio_files_count != 0, f'Could not find any {self.ext} files in {audio_dir}.' @@ -98,23 +100,3 @@ def cache_audio_file(self, input_filename, sample_rate): np.save(cache_filename, mfcc) except librosa.util.exceptions.ParameterError as e: logger.error(e) - - -def pad_mfcc(mfcc, max_length): # num_frames, nfilt=64. - if len(mfcc) < max_length: - mfcc = np.vstack((mfcc, np.tile(np.zeros(mfcc.shape[1]), (max_length - len(mfcc), 1)))) - return mfcc - - -def mfcc_fbank(signal: np.array, sample_rate: int): # 1D signal array. - # Returns MFCC with shape (num_frames, n_filters, 3). - filter_banks, energies = fbank(signal, samplerate=sample_rate, nfilt=NUM_FBANKS) - frames_features = normalize_frames(filter_banks) - # delta_1 = delta(filter_banks, N=1) - # delta_2 = delta(delta_1, N=1) - # frames_features = np.transpose(np.stack([filter_banks, delta_1, delta_2]), (1, 2, 0)) - return np.array(frames_features, dtype=np.float32) # Float32 precision is enough here. - - -def normalize_frames(m, epsilon=1e-12): - return [(v - np.mean(v)) / max(np.std(v), epsilon) for v in m] diff --git a/batcher.py b/batcher.py index b8ee77f..3121203 100644 --- a/batcher.py +++ b/batcher.py @@ -10,9 +10,9 @@ from tqdm import tqdm from audio import pad_mfcc, Audio -from constants import NUM_FRAMES, NUM_FBANKS +from constants import NUM_FRAMES, NUM_FBANKS, TRAIN_TEST_RATIO from models import DeepSpeakerModel -from utils import ensures_dir, load_pickle, load_npy, train_test_sp_to_utt +from utils import ensures_dir, load_pickle, load_npy logger = logging.getLogger(__name__) @@ -36,6 +36,15 @@ def sample_from_mfcc_file(utterance_file, max_length, seed=None): return sample_from_mfcc(mfcc, max_length, seed) +def train_test_sp_to_utt(audio, is_test): + sp_to_utt = {} + for speaker_id, utterances in audio.speakers_to_utterances.items(): + utterances_files = sorted(utterances.values()) + train_test_sep = int(len(utterances_files) * TRAIN_TEST_RATIO) + sp_to_utt[speaker_id] = utterances_files[train_test_sep:] if is_test else utterances_files[:train_test_sep] + return sp_to_utt + + class KerasFormatConverter: def __init__(self, working_dir, load_test_only=False): diff --git a/cli.py b/cli.py index ceb7b4c..2f84842 100644 --- a/cli.py +++ b/cli.py @@ -18,14 +18,12 @@ logger = logging.getLogger(__name__) -VERSION = '3.0b' +VERSION = '4.0a' @click.group() def cli(): - logging.basicConfig(format='%(asctime)12s - %(levelname)s - %(message)s', - level=logging.INFO, - stream=sys.stdout) + logging.basicConfig(format='%(asctime)12s - %(levelname)s - %(message)s', level=logging.INFO, stream=sys.stdout) init_pandas() @@ -38,7 +36,7 @@ def version(): @click.option('--working_dir', required=True, type=Ct.output_dir()) @click.option('--audio_dir', default=None) @click.option('--sample_rate', default=SAMPLE_RATE, show_default=True, type=int) -def build_audio_cache(working_dir, audio_dir, sample_rate): +def build_audio_cache(working_dir: str, audio_dir: str, sample_rate: int): ensures_dir(working_dir) if audio_dir is None: audio_dir = os.path.join(working_dir, 'LibriSpeech') @@ -48,7 +46,7 @@ def build_audio_cache(working_dir, audio_dir, sample_rate): @cli.command('build-keras-inputs', short_help='Build inputs to Keras.') @click.option('--working_dir', required=True, type=Ct.input_dir()) @click.option('--counts_per_speaker', default='600,100', show_default=True, type=str) # train,test -def build_keras_inputs(working_dir, counts_per_speaker): +def build_keras_inputs(working_dir: str, counts_per_speaker: str): counts_per_speaker = [int(b) for b in counts_per_speaker.split(',')] kc = KerasFormatConverter(working_dir) kc.generate(max_length=NUM_FRAMES, counts_per_speaker=counts_per_speaker) @@ -59,7 +57,7 @@ def build_keras_inputs(working_dir, counts_per_speaker): @click.option('--working_dir', required=True, type=Ct.input_dir()) @click.option('--model_name', multiple=True, type=click.Choice([RES_CNN_NAME, GRU_NAME])) @click.option('--checkpoint_file', multiple=True, required=True, type=Ct.input_file()) -def test_model(working_dir, model_name, checkpoint_file): +def test_model(working_dir: str, model_name: tuple, checkpoint_file: tuple): # export CUDA_VISIBLE_DEVICES=0; python cli.py test-model # --working_dir /home/philippe/ds-test/triplet-training/ # --checkpoint_file ../ds-test/checkpoints-softmax/ResCNN_checkpoint_102.h5 @@ -77,11 +75,10 @@ def test_model(working_dir, model_name, checkpoint_file): @click.option('--working_dir', required=True, type=Ct.input_dir()) @click.option('--model_name', required=True, type=click.Choice([RES_CNN_NAME, GRU_NAME])) @click.option('--pre_training_phase/--no_pre_training_phase', default=False, show_default=True) -def train_model(working_dir, model_name, pre_training_phase): +def train_model(working_dir: str, model_name: str, pre_training_phase: bool): # PRE TRAINING # LibriSpeech train-clean-data360 (600, 100). 0.991 on test set (enough for pre-training). - - # TRIPLET TRAINING + # TRIPLET TRAINING with ResCNN. # [...] # Epoch 175/1000 # 2000/2000 [==============================] - 919s 459ms/step - loss: 0.0077 - val_loss: 0.0058 diff --git a/eval_metrics.py b/eval_metrics.py index cdbc9b6..6f3df02 100644 --- a/eval_metrics.py +++ b/eval_metrics.py @@ -5,6 +5,7 @@ def evaluate2(y_pred, y_true): + # TODO: still not perfect. fpr, tpr, threshold = roc_curve(y_true, y_pred, pos_label=1) fnr = 1 - tpr eer1 = fpr[np.nanargmin(np.absolute((fnr - fpr)))] diff --git a/example.py b/example.py index 41391db..226dbb4 100644 --- a/example.py +++ b/example.py @@ -8,8 +8,9 @@ from models import ResCNNModel from test import batch_cosine_similarity -np.random.seed(123) -random.seed(123) +seed = 123 +np.random.seed(seed) +random.seed(seed) model = ResCNNModel() model.m.load_weights('/Users/premy/deep-speaker/checkpoints/ResCNN_triplet_training_checkpoint_175.h5', by_name=True) diff --git a/models.py b/models.py index 03103cf..7954516 100644 --- a/models.py +++ b/models.py @@ -1,6 +1,7 @@ import abc import logging +import numpy as np import tensorflow.keras.backend as K from tensorflow.keras import layers from tensorflow.keras import regularizers @@ -30,6 +31,24 @@ def select_model_class(name: str): raise Exception(f'Unknown model name: {name}.') +def embedding_fusion(embeddings_1: np.array, embeddings_2: np.array): + assert len(embeddings_1.shape) == 2 # (batch_size, 512). + assert embeddings_1.shape == embeddings_2.shape + embeddings_sum = embeddings_1 + embeddings_2 + fusion = embeddings_sum / np.linalg.norm(embeddings_sum, ord=2, axis=1, keepdims=True) + assert np.all((-1 <= fusion) & (fusion <= 1)) + assert np.all(abs(np.sum(fusion ** 2, axis=1) - 1) < 1e-6) + return fusion + + +def score_fusion(scores_1: np.array, scores_2: np.array): + def normalize_scores(m, epsilon=1e-12): + return (m - np.mean(m)) / max(np.std(m), epsilon) + + # score has to be between -1 and 1. + return np.tanh(np.sum(normalize_scores(np.stack((scores_1, scores_2), axis=2)), axis=2)) + + class DeepSpeakerModel: def __init__(self, @@ -68,26 +87,11 @@ def graph_with_avg_softmax_and_ln(self, inputs): x = Lambda(lambda y: K.l2_normalize(y, axis=1), name='ln')(x) return x - def keras_model(self): - return self.m - - def get_weights(self): - w = self.m.get_weights() - if self.include_softmax: - w.pop() # last 2 are the W_softmax and b_softmax. - w.pop() - return w - def clipped_relu(self, inputs): relu = Lambda(lambda y: K.minimum(K.maximum(y, 0), 20), name=f'clipped_relu_{self.clipped_relu_count}')(inputs) self.clipped_relu_count += 1 return relu - def set_weights(self, w): - for layer, layer_w in zip(self.m.layers, w): - layer.set_weights(layer_w) - logger.info(f'Setting weights for [{layer.name}]...') - class ResCNNModel(DeepSpeakerModel): @@ -137,7 +141,6 @@ def identity_block(self, input_tensor, kernel_size, filters, stage, block): def conv_and_res_block(self, inp, filters, stage): conv_name = 'conv{}-s'.format(filters) - # TODO: why kernel_regularizer? o = Conv2D(filters, kernel_size=5, strides=2, @@ -163,8 +166,8 @@ def __init__(self, def graph(self, inputs): x = Conv2D(64, kernel_size=5, strides=2, padding='same', kernel_initializer='glorot_uniform', name='conv1', kernel_regularizer=regularizers.l2(l=0.0001))(inputs) - # shape = (BATCH_SIZE , num_frames/2, 64/2, 64) - x = BatchNormalization(name='bn1')(x) # does it work with BN? + # shape = (batch_size , num_frames / 2, 64 / 2 = 32, 64) + x = BatchNormalization(name='bn1')(x) x = self.clipped_relu(x) # 4d -> 3d. @@ -172,13 +175,12 @@ def graph(self, inputs): x = Reshape((frames_dim, fbank_dim * conv_output_dim))(x) x = Reshape((frames_dim, fbank_dim * conv_output_dim))(x) - # shape = (BATCH_SIZE , num_frames/2, 1024) + # shape = (batch_size, num_frames / 2, 1024) x = GRU(1024, name='GRU1', return_sequences=True)(x) - if self.include_softmax: + if self.include_softmax: # to prevent over fitting during pre-training. x = Dropout(0.2)(x) x = GRU(1024, name='GRU2', return_sequences=True)(x) - if self.include_softmax: + if self.include_softmax: # to prevent over fitting during pre-training. x = Dropout(0.2)(x) x = GRU(1024, name='GRU3', return_sequences=True)(x) return x - diff --git a/test.py b/test.py index dd2846c..58ed214 100644 --- a/test.py +++ b/test.py @@ -8,8 +8,7 @@ from batcher import LazyTripletBatcher from constants import NUM_FBANKS, NUM_FRAMES, BATCH_SIZE from eval_metrics import evaluate -from models import ResCNNModel, select_model_class -from utils import score_fusion, embedding_fusion +from models import ResCNNModel, select_model_class, embedding_fusion, score_fusion logger = logging.getLogger(__name__) diff --git a/triplet_loss.py b/triplet_loss.py index 4179560..f2dd35f 100644 --- a/triplet_loss.py +++ b/triplet_loss.py @@ -1,6 +1,6 @@ import keras.backend as K -ALPHA = 0.1 # used in Deep Speaker. +ALPHA = 0.1 def batch_cosine_similarity(x1, x2): @@ -13,8 +13,8 @@ def batch_cosine_similarity(x1, x2): def deep_speaker_loss(y_true, y_pred, alpha=ALPHA): # y_true is not used. we respect this convention: - # y_true.shape = (batch_size, embedding_size) [not used] - # y_pred.shape = (batch_size, embedding_size) + # y_true.shape = (batch_size * 3, embedding_size) [not used] + # y_pred.shape = (batch_size * 3, embedding_size) # EXAMPLE: # _____________________________________________________ # ANCHOR 1 (512,) diff --git a/utils.py b/utils.py index b28c230..7a08323 100644 --- a/utils.py +++ b/utils.py @@ -1,6 +1,5 @@ import logging import os -import random import shutil from glob import glob @@ -10,8 +9,6 @@ import pandas as pd from natsort import natsorted -from constants import TRAIN_TEST_RATIO - logger = logging.getLogger(__name__) @@ -32,10 +29,6 @@ def create_new_empty_dir(directory: str): os.makedirs(directory) -def ensure_dir_for_filename(filename: str): - ensures_dir(os.path.dirname(filename)) - - def ensures_dir(directory: str): if len(directory) > 0 and not os.path.exists(directory): os.makedirs(directory) @@ -90,11 +83,6 @@ def delete_older_checkpoints(checkpoint_dir, max_to_keep=5): os.remove(checkpoint) -def enable_deterministic(seed=123): - np.random.seed(seed) - random.seed(seed) - - def load_pickle(file): if not os.path.exists(file): return None @@ -108,34 +96,3 @@ def load_npy(file): return None logger.info(f'Loading NPY file: {file}.') return np.load(file) - - -def train_test_sp_to_utt(audio, is_test): - sp_to_utt = {} - for speaker_id, utterances in audio.speakers_to_utterances.items(): - utterances_files = sorted(utterances.values()) - train_test_sep = int(len(utterances_files) * TRAIN_TEST_RATIO) - sp_to_utt[speaker_id] = utterances_files[train_test_sep:] if is_test else utterances_files[:train_test_sep] - return sp_to_utt - - -def embedding_fusion(embeddings_1: np.array, embeddings_2: np.array): - assert len(embeddings_1.shape) == 2 # (batch_size, 512). - assert embeddings_1.shape == embeddings_2.shape - embeddings_sum = embeddings_1 + embeddings_2 - fusion = embeddings_sum / np.linalg.norm(embeddings_sum, ord=2, axis=1, keepdims=True) - assert np.all((-1 <= fusion) & (fusion <= 1)) - assert np.all(abs(np.sum(fusion ** 2, axis=1) - 1) < 1e-6) - return fusion - - -def score_fusion(scores_1: np.array, scores_2: np.array): - def normalize_scores(m, epsilon=1e-12): - return (m - np.mean(m)) / max(np.std(m), epsilon) - - # score has to be between -1 and 1. - return np.tanh(np.sum(normalize_scores(np.stack((scores_1, scores_2), axis=2)), axis=2)) - - -if __name__ == '__main__': - score_fusion(np.random.uniform(low=-1, high=1, size=(5, 100)), np.random.uniform(low=-1, high=1, size=(5, 100)))