From a4e14ee710cbd4800b3c098e3d653d9ba8937edd Mon Sep 17 00:00:00 2001 From: spamz23 Date: Sat, 13 Mar 2021 21:36:03 +0000 Subject: [PATCH] :hammer: Refactor Querys -Needs review! --- .gitignore | 244 ++++++++++++++++++++ active_learner.py | 5 +- especificacoes.json | 8 +- main.py | 12 +- query.py | 550 +++++++++++++++++++++----------------------- requirements.txt | 1 + 6 files changed, 522 insertions(+), 298 deletions(-) create mode 100644 .gitignore create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..822d3695 --- /dev/null +++ b/.gitignore @@ -0,0 +1,244 @@ + +# Created by https://www.toptal.com/developers/gitignore/api/python,vscode,database,django +# Edit at https://www.toptal.com/developers/gitignore?templates=python,vscode,database,django + +.vscode +### Database ### +*.accdb +*.db +*.dbf +*.mdb +*.pdb +*.sqlite3 + +### Django ### +*.log +*.pot +*.pyc +__pycache__/ +local_settings.py +db.sqlite3 +db.sqlite3-journal +media + +# If your build process includes running collectstatic, then you probably don't need or want to include staticfiles/ +# in your Git repository. Update and uncomment the following line accordingly. +# /staticfiles/ + +### Django.Python Stack ### +# Byte-compiled / optimized / DLL files +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +pytestdebug.log + +# Translations +*.mo + +# Django stuff: + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ +doc/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +#poetry.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +# .env +.env/ +.venv/ +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pythonenv* + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# operating system-related files +*.DS_Store #file properties cache/storage on macOS +Thumbs.db #thumbnail cache on Windows + +# profiling data +.prof + + +### Python ### +# Byte-compiled / optimized / DLL files + +# C extensions + +# Distribution / packaging + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. + +# Installer logs + +# Unit test / coverage reports + +# Translations + +# Django stuff: + +# Flask stuff: + +# Scrapy stuff: + +# Sphinx documentation + +# PyBuilder + +# Jupyter Notebook + +# IPython + +# pyenv + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. + +# poetry + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow + +# Celery stuff + +# SageMath parsed files + +# Environments +# .env + +# Spyder project settings + +# Rope project settings + +# mkdocs documentation + +# mypy + +# Pyre type checker + +# pytype static type analyzer + +# operating system-related files + +# profiling data + + +### vscode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +*.code-workspace + +# End of https://www.toptal.com/developers/gitignore/api/python,vscode,database,django \ No newline at end of file diff --git a/active_learner.py b/active_learner.py index 9ccbf99c..34d3472d 100644 --- a/active_learner.py +++ b/active_learner.py @@ -3,6 +3,7 @@ import matplotlib.pyplot as plt from modAL.models import ActiveLearner from keras.wrappers.scikit_learn import KerasClassifier + from query import Query @@ -30,8 +31,8 @@ def loop( y_test, X_unlabeled, accuracy_goal, - ) : - + ): + # accuracy of model with initialize images model_accuracy = self.score(X_test, y_test, verbose=0) print("\nAccuracy after query {n}: {acc:0.4f}".format(n=0, acc=model_accuracy)) diff --git a/especificacoes.json b/especificacoes.json index 114d9a0a..5f685ead 100644 --- a/especificacoes.json +++ b/especificacoes.json @@ -1,8 +1,8 @@ { - "training_path":"C:\\Users\\jpmrs\\OneDrive\\Desktop\\UTAD\\Projeto de licenciatura\\Mucosa\\Datasets\\train", - "testing_path":"C:\\Users\\jpmrs\\OneDrive\\Desktop\\UTAD\\Projeto de licenciatura\\Mucosa\\Datasets\\test_set_complete", - "validation_path":"C:\\Users\\jpmrs\\OneDrive\\Desktop\\UTAD\\Projeto de licenciatura\\Mucosa\\Datasets\\validation", - "unlabeled_path":"C:\\Users\\jpmrs\\OneDrive\\Desktop\\UTAD\\Projeto de licenciatura\\Mucosa\\Datasets\\unlabeled", + "training_path":"Datasets/train", + "testing_path":"Datasets/test_set_complete", + "validation_path":"Datasets/validation", + "unlabeled_path":"Datasets/unlabeled", "build_fn":"create_cnn", "query_strategy":"RandomQuery", "accuracy":0.90, diff --git a/main.py b/main.py index 
f9a2dbb7..2261085a 100644
--- a/main.py
+++ b/main.py
@@ -1,4 +1,6 @@
-from query import RandomSampling,UncertaintySampling, ClusterBasedSampling,RepresentativeSampling,Uncertainty_With_Clustering_Sampling,Representative_With_Clustering_Sampling,Highest_Entropy__Clustering_Sampling,Uncertainty_With_Representative_sampling,Highest_Entropy__Uncertainty_Sampling
+from query import (
+    ClusterBasedSampling,
+)
 from keras.preprocessing.image import ImageDataGenerator
 from active_learner import ActiveLearner
 import os
@@ -13,6 +15,7 @@
 
 input_shape = (128, 128, 3)
 
+
 def create_cnn():
     model = models.Sequential()
     model.add(layers.Conv2D(32, (4, 4), activation="relu", input_shape=input_shape))
@@ -31,7 +34,8 @@ def create_cnn():
     )
     return model
 
-os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"
+
+os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"
 
 # read file
 with open("especificacoes.json", "r") as myfile:
@@ -39,7 +43,7 @@ def create_cnn():
 
 # parse file
 obj = json.loads(data)
-datagen = ImageDataGenerator(rescale=1.0 / 255) 
+datagen = ImageDataGenerator(rescale=1.0 / 255)
 train_generator = datagen.flow_from_directory(
     str(obj["training_path"]),
     target_size=(128, 128),
@@ -76,7 +80,7 @@ def create_cnn():
 
 learner = ActiveLearner(
     locals()[obj["build_fn"]],
-    Highest_Entropy__Uncertainty_Sampling(X_unlabeled, int(obj["n_instances"])),
+    ClusterBasedSampling(int(obj["n_instances"])),
     X_initial,
     y_initial,
     verbose=0,
diff --git a/query.py b/query.py
index 86b85246..4fd20928 100644
--- a/query.py
+++ b/query.py
@@ -1,126 +1,125 @@
 import numpy as np
 import random
+from abc import ABC, abstractmethod
+import tensorflow as tf
+from keras import backend as K
+
 from sklearn.cluster import MiniBatchKMeans
 from sklearn.metrics.pairwise import pairwise_distances
-from keras import backend as K
 from modAL.uncertainty import uncertainty_sampling
-from abc import ABC, abstractmethod
-import tensorflow as tf
+
 
 class Query(ABC):
-    def __init__(self, unlabeled_data: np.ndarray, n_instances: int) -> None:
-        self.unlabeled_data = unlabeled_data
+    """
+    Docs here
+    """
+
+    def __init__(
+        self,
+        n_instances: int,
+    ) -> None:
         self.n_instances = n_instances
 
     @abstractmethod
-    def __call__(self):
+    def __call__(self, classifier, X):
         """
         Docs
         """
 
+
 class RandomSampling(Query):
     """
     Docs here
     """
 
-    def __init__(self, unlabeled_data: np.ndarray, n_instances: int) -> None:
-        """
-        Docs here
-        """
-        super().__init__(unlabeled_data, n_instances)
-
-    def __call__(self, *args, **kwargs):
-        """
-        Docs here
-        """
-
-        if self.unlabeled_data is None:
-            raise ValueError("unlabeled_data param is missing")
-        if self.n_instances is None:
-            raise ValueError("n_instances param is missing")
-
+    def __call__(self, classifier, X):
+        # Select random indices
         query_idx = np.random.choice(
-            range(len(self.unlabeled_data)), size=self.n_instances, replace=True
+            range(len(X)),
+            size=self.n_instances,
+            replace=True,
         )
-        return query_idx, self.unlabeled_data[query_idx]
+        # Return query indices and unlabeled data at those
+        # indices
+        return query_idx, X[query_idx]
+
 
 class UncertaintySampling(Query):
     """
     Docs here
     """
 
-    def __init__(self, unlabeled_data: np.ndarray, n_instances: int) -> None:
-        """
-        Docs here
-        """
-
-        super().__init__(unlabeled_data, n_instances)
-
-    def __call__(self,learner,X_pool,*args, **kwargs):
-        """
-        Docs here
-        """
-        self.unlabeled_data = X_pool
-
-        if learner is None:
-            raise ValueError("learner_data param is missing")
-        if self.unlabeled_data is None:
-            raise ValueError("unlabeled_data param is missing")
-        if self.n_instances is None:
-            raise ValueError("n_instances param is missing")
-
-        indices = uncertainty_sampling(
-            learner.estimator.model, self.unlabeled_data, n_instances=self.n_instances
+    def __call__(self, classifier, X):
+        # Select indices based on uncertainty
+        query_idx = uncertainty_sampling(
+            classifier,
+            X,
+            n_instances=self.n_instances,
         )
-        return indices, self.unlabeled_data[indices]
+        # Return query indices and unlabeled data at those
+        # indices
+        return query_idx, X[query_idx]
+
 
 class ClusterBasedSampling(Query):
-
     """
     Docs here
     """
 
-    def __init__(self, unlabeled_data: np.ndarray, n_instances: int) -> None:
-        """
-        Docs here
-        """
-        super().__init__(unlabeled_data, n_instances)
-
-    def __call__(self,learner,X_pool, *args, **kwargs):
+    def __call__(self, classifier, X):
         """
         CLUSTERING BASED SAMPLING
         Groups the images into n clusters and randomly selects one image from each cluster.
         """
-
-        self.unlabeled_data = X_pool
-
-        if self.unlabeled_data is None:
-            raise ValueError("unlabeled_data param is missing")
-        if self.n_instances is None:
-            raise ValueError("n_instances param is missing")
 
-        unlabeled_data = self.unlabeled_data.reshape(len(self.unlabeled_data), -1)
+        # TODO: Check this line
+        unlabeled_data = X.reshape(len(X), -1)
 
+        # Instantiate the KMeans object with the number of clusters
+        # equal to the number of instances
         kmeans = MiniBatchKMeans(n_clusters=self.n_instances, random_state=0)
+
+        # Fit the data
         kmeans.fit(unlabeled_data)
 
-        query_idx = self.GetOneIndexOfEachCluster(kmeans, self.n_instances)
-        unlabeled_data = unlabeled_data.reshape(len(unlabeled_data), 128, 128, 3)
-        return query_idx, unlabeled_data[query_idx]
+        # Get one index from each cluster
+        query_idx = self.get_one_index_from_each_cluster(kmeans)
 
-    def Random(self,list):
-        secure_random = random.SystemRandom()
-        return secure_random.choice(list)
+        return query_idx, X[query_idx]
+
+    def get_one_index_from_each_cluster(self, kmeans):
+        """
+        Selects a random point from each cluster.
+        The number of clusters is determined by `n_instances`.
+        """
+        # For each cluster:
+        #     (1) Find all the points from X that are assigned to the cluster.
+        #     (2) Choose 1 point from these points randomly.
+ # The number of clusters is `self.n_instances` + return [ + np.random.choice(np.where(kmeans.labels_ == i)[0].tolist(), size=1) + for i in range(self.n_instances) + ] - def GetOneIndexOfEachCluster(self,kmeans, n_clusters): - indices = np.zeros(n_clusters) +# class OutlierSampling(Query): +# """ +# Docs here +# """ - for i in range(n_clusters): - lista = np.where(i == kmeans.labels_)[0] # select images from one cluster/label - valor = self.Random(lista) - indices[i] = valor +# def __init__(self, n_instances: int, x_validation: np.ndarray) -> None: +# """ +# Docs here +# """ +# # Save validation data +# self.x_validation = x_validation +# super().__init__(n_instances) - return indices.astype(int) +# def __call__(self, classifier, X): +# # Get per-neuron scores from validation data +# validation_rankings = self.get_validation_rankings( +# classifier, self.X_validation +# ) # class OutlierSampling(Query): @@ -185,7 +184,7 @@ def GetOneIndexOfEachCluster(self,kmeans, n_clusters): # query_idx.append(outlier[0]) # return query_idx, self.unlabeled_data[query_idx] - + # def get_rank(self,value, rankings): # """get the rank of the value in an ordered array as a percentage @@ -220,296 +219,271 @@ def GetOneIndexOfEachCluster(self,kmeans, n_clusters): # return absolute_ranking # def get_validation_rankings(self,model, validation_data): - # validation_rankings = ( - # [] - # ) # 2D array, every neuron by ordered list of output on validation data per neuron - # v = 0 - # for item in validation_data: - - # item = item[np.newaxis, ...] - # # get logit of item - # print(type(model.layers[0].input[0])) - # print(model.layers[0].input[0]) - - # keras_function = K.function([model.layers[0].input[0]],[model.layers[-1].output]) - # #keras_function = K.function([model.get_input_at(0)], [model.layers[-1].output]) - # neuron_outputs = keras_function([item, 1]) - - # # initialize array if we haven't yet - # if len(validation_rankings) == 0: - # for output in neuron_outputs: - # validation_rankings.append([0.0] * len(validation_data)) - - # n = 0 - # for output in neuron_outputs: - # validation_rankings[n][v] = output - # n += 1 - - # v += 1 - - # # Rank-order the validation scores - # v = 0 - # for validation in validation_rankings: - # validation.sort() - # validation_rankings[v] = validation - # v += 1 - - # return validation_rankings +# validation_rankings = ( +# [] +# ) # 2D array, every neuron by ordered list of output on validation data per neuron +# v = 0 +# for item in validation_data: + +# item = item[np.newaxis, ...] 
+#     # get logit of item
+#     print(type(model.layers[0].input[0]))
+#     print(model.layers[0].input[0])
+
+#     keras_function = K.function([model.layers[0].input[0]],[model.layers[-1].output])
+#     #keras_function = K.function([model.get_input_at(0)], [model.layers[-1].output])
+#     neuron_outputs = keras_function([item, 1])
+
+#     # initialize array if we haven't yet
+#     if len(validation_rankings) == 0:
+#         for output in neuron_outputs:
+#             validation_rankings.append([0.0] * len(validation_data))
+
+#     n = 0
+#     for output in neuron_outputs:
+#         validation_rankings[n][v] = output
+#         n += 1
+
+#     v += 1
+
+# # Rank-order the validation scores
+# v = 0
+# for validation in validation_rankings:
+#     validation.sort()
+#     validation_rankings[v] = validation
+#     v += 1
+
+# return validation_rankings
+
 
 class RepresentativeSampling(Query):
     """
     Docs here
     """
 
-    def __init__(self, unlabeled_data: np.ndarray, n_instances: int) -> None:
+    def __call__(self, classifier, X):
         """
         Docs here
         """
-        super().__init__(unlabeled_data, n_instances)
-
-    def __call__(self,learner,X_pool, *args, **kwargs):
-        """
-        REPRESENTATIVE SAMPLING
-        select n images calculating the representativity of each image between unlabeled and train images
-        """
-
-        self.unlabeled_data=X_pool
 
-        if self.unlabeled_data is None:
-            raise ValueError("unlabeled_data param is missing")
-        if self.n_instances is None:
-            raise ValueError("n_instances param is missing")
+        # Get training vector
+        (batch_size, length, height, depth) = classifier.X_training.shape
+        train_vector = classifier.X_training.reshape(
+            (
+                batch_size,
+                length * height * depth,
+            )
+        )
 
-        len, length, height, depth = learner.X_training.shape
-        vetor_train = learner.X_training.reshape((len, length * height * depth))
+        # Get unlabeled vector
+        (batch_size, length, height, depth) = X.shape
+        unlabeled_vector = X.reshape(
+            (
+                batch_size,
+                length * height * depth,
+            )
+        )
 
-        len, length, height, depth = self.unlabeled_data.shape
-        vetor_unlabeled =self.unlabeled_data.reshape((len, length * height * depth))
+        # Calculate similarities
+        train_similarity = pairwise_distances(
+            unlabeled_vector, train_vector, metric="cosine"
+        )
 
-        train_similarity = pairwise_distances(vetor_unlabeled, vetor_train, metric="cosine")
         unlabeled_similarity = pairwise_distances(
-            vetor_unlabeled, vetor_unlabeled, metric="cosine"
+            unlabeled_vector, unlabeled_vector, metric="cosine"
        )
-
-        representativity = {}
-        index = 0
-        for train_sim, unlabeled_sim in zip(train_similarity, unlabeled_similarity):
-            representativity[index] = np.mean(unlabeled_sim) - np.mean(train_sim)
-            index = index + 1
-
+        # Rank each image by its representativity: mean distance to the
+        # unlabeled pool minus mean distance to the training data
         representativity = sorted(
-            representativity.items(), key=lambda x: x[1], reverse=True
+            (
+                (np.mean(unlabeled_sim) - np.mean(train_sim), idx)
+                for idx, (train_sim, unlabeled_sim) in enumerate(
+                    zip(train_similarity, unlabeled_similarity)
+                )
+            ),
+            reverse=True,
         )
 
-        query_idx = []
-        for r in representativity[:self.n_instances:]:
-            query_idx.append(r[0])
+        # Select the indices of the first `n_instances` elements
+        # (the most representative ones)
+        query_idx = [idx for _, idx in representativity[: self.n_instances]]
 
-        return query_idx, self.unlabeled_data[query_idx]
+        return query_idx, X[query_idx]
 
-class Uncertainty_With_Clustering_Sampling(Query):
+
+class UncertaintyWithClusteringSampling(Query):
     """
     Docs here
     """
 
-    def __init__(self, unlabeled_data: np.ndarray, n_instances: int) -> None:
+    def __init__(self, n_instances: int) -> None:
         """
         Docs here
         """
-        self.uncertainty=UncertaintySampling(unlabeled_data, 500)
-        self.clustering=ClusterBasedSampling(unlabeled_data, n_instances)
-        super().__init__(unlabeled_data, n_instances)
-
-    def __call__(self,learner,X_pool, *args, **kwargs):
+        self.uncertainty_sampling = UncertaintySampling(
+            n_instances=500,
+        )
+        self.clustering_based_sampling = ClusterBasedSampling(
+            n_instances=n_instances,
+        )
+        super().__init__(n_instances)
+
+    def __call__(self, classifier, X):
         """
         Least Confidence Sampling with Clustering-based Sampling
         Combining uncertainty and diversity sampling means applying one technique and then the other;
         this allows selecting images at different positions of the decision boundary.
         """
-        self.unlabeled_data = X_pool
 
-        if learner is None:
-            raise ValueError("Learner param is missing")
-        if self.unlabeled_data is None:
-            raise ValueError("unlabeled_data param is missing")
-        if self.n_instances is None:
-            raise ValueError("n_instances param is missing")
-
-        indices, instancias = self.uncertainty.__call__(learner,self.unlabeled_data)
-
-        query_idx, data = self.clustering.__call__(learner,self.unlabeled_data)
-        print(query_idx)
-        return query_idx, self.unlabeled_data[query_idx]
-
-class Representative_With_Clustering_Sampling(Query):
+        # Pre-select the most uncertain images
+        indices, _ = self.uncertainty_sampling.__call__(
+            classifier,
+            X,
+        )
+
+        # Cluster the uncertain images and pick one per cluster
+        query_idx, _ = self.clustering_based_sampling(
+            classifier,
+            X[indices],
+        )
+
+        # Map the selection back to indices into the full pool
+        query_idx = indices[np.asarray(query_idx).ravel()]
+
+        return query_idx, X[query_idx]
+
+
+class RepresentativeWithClusteringSampling(Query):
     """
     Docs here
     """
 
-    def __init__(self, unlabeled_data: np.ndarray, n_instances: int) -> None:
+    def __init__(self, n_instances: int) -> None:
         """
         Docs here
         """
-        self.representative=RepresentativeSampling(unlabeled_data, 1)
-        super().__init__(unlabeled_data, n_instances)
-
-    def __call__(self,learner,X_pool, *args, **kwargs):
+        self.representative_sampling = RepresentativeSampling(
+            n_instances=1,
+        )
+        super().__init__(n_instances)
+
+    def __call__(self, classifier, X):
         """
-        muito lento
+        Docs here
         """
-
-        self.unlabeled_data=X_pool
-
-        if learner is None:
-            raise ValueError("Learner param is missing")
-        if self.unlabeled_data is None:
-            raise ValueError("unlabeled_data param is missing")
-        if self.n_instances is None:
-            raise ValueError("n_instances param is missing")
-
-        self.unlabeled_data = self.unlabeled_data.reshape(len(self.unlabeled_data), -1)
-
+        # Flatten the images so that each sample is a single feature vector
+        unlabeled_data = X.reshape(len(X), -1)
+
+        # Instantiate clustering object
         kmeans = MiniBatchKMeans(n_clusters=self.n_instances, random_state=0)
-        kmeans.fit(self.unlabeled_data)
 
-        unlabeled_data = self.unlabeled_data.reshape(len(self.unlabeled_data), 128, 128, 3)
+        # Fit the flattened data
+        kmeans.fit(unlabeled_data)
 
+        indices = []
+        # Iterate over each cluster
         for i in range(self.n_instances):
-            lista = np.where(i == kmeans.labels_)[0] # select images from one cluster/label
-            query_idx, unlabeled_data[query_idx] = self.representative.__call__(learner, unlabeled_data[lista])
-            indices.append(int(lista[query_idx]))
-        return indices, unlabeled_data[indices]
+            # Get cluster indices
+            cluster_indices = np.where(kmeans.labels_ == i)[0].tolist()
+
+            # Get the most representative image from each cluster
+            query_idx, _ = self.representative_sampling.__call__(
+                classifier,
+                X[cluster_indices],
+            )
+
+            indices.append(int(cluster_indices[query_idx[0]]))
 
-class Highest_Entropy__Clustering_Sampling(Query):
+        return indices, X[indices]
+
+
+class HighestEntropyClusteringSampling(Query):
     """
     Docs here
     """
 
-    def __init__(self, unlabeled_data: np.ndarray, n_instances: int) -> None:
-        """
-        Docs here
-        """
-        self.representative=RepresentativeSampling(unlabeled_data, 1)
-        self.n_clusters=10
-        super().__init__(unlabeled_data, n_instances)
-
-    def __call__(self,learner,X_pool, *args, **kwargs):
+    def __init__(self, n_instances: int) -> None:
+        self.representative = RepresentativeSampling(n_instances=1)
+        self.n_clusters = 10
+        super().__init__(n_instances)
+
+    def __call__(self, classifier, X):
         """
         Sampling from the Highest Entropy Cluster
         Select n images from the cluster whose images have the highest entropy (highest average uncertainty).
         """
-        self.unlabeled_data=X_pool
-
-        if learner is None:
-            raise ValueError("Learner param is missing")
-        if self.unlabeled_data is None:
-            raise ValueError("unlabled_data param is missing")
-        if self.n_instances is None:
-            raise ValueError("n_instances param is missing")
-
-        highest_average_Uncertainty = 1
-        self.unlabeled_data = self.unlabeled_data.reshape(len(self.unlabeled_data), -1)
-
+        # Flatten the images so that each sample is a single feature vector
+        unlabeled_data = X.reshape(len(X), -1)
+
+        # Instantiate clustering object
         kmeans = MiniBatchKMeans(n_clusters=self.n_clusters, random_state=0)
-        kmeans.fit(self.unlabeled_data)
 
-        self.unlabeled_data = self.unlabeled_data.reshape(len(self.unlabeled_data), 128, 128, 3)
+        # Fit the flattened data
+        kmeans.fit(unlabeled_data)
+
+        clusters_average_uncertainty = []
+        # Iterate over each cluster
         for i in range(self.n_clusters):
-            lista = np.where(i == kmeans.labels_)[0] # select images from cluster i
-            probabilidades = learner.predict_proba(self.unlabeled_data[lista])
-            incertezas = [abs(i[0] - i[1]) for i in probabilidades]
-            average_Uncertainty = np.mean(incertezas)
+            # Get cluster indices
+            cluster_indices = np.where(kmeans.labels_ == i)[0].tolist()
+            # Use the indices to compute probabilities
+            probabilities = classifier.predict_proba(X[cluster_indices])
+
+            # Compute uncertainties (a small gap between the two class
+            # probabilities means a highly uncertain image)
+            uncertainties = [abs(i[0] - i[1]) for i in probabilities]
+            clusters_average_uncertainty.append(np.mean(uncertainties))
+
+        # Get the index of the most uncertain cluster
+        # (the one with the smallest average probability gap)
+        most_uncertain_cluster = np.argmin(clusters_average_uncertainty)
+        # Get indices from the most uncertain cluster
+        cluster_indices = np.where(kmeans.labels_ == most_uncertain_cluster)[0].tolist()
 
-            if average_Uncertainty < highest_average_Uncertainty:
-                highest_average_Uncertainty = average_Uncertainty
-                most_uncertain_cluster = i
+        # Select random elements from the cluster
+        query_idx = np.random.choice(cluster_indices, self.n_instances, replace=True)
 
-        lista = np.where(most_uncertain_cluster == kmeans.labels_)[0] # select images from one cluster/label
-
-        print(lista.shape)
-        indices = np.random.choice(lista, self.n_instances, replace=True)
+        return query_idx, X[query_idx]
 
-        return indices, self.unlabeled_data[indices]
 
-class Uncertainty_With_Representative_sampling(Query):
+
+class UncertaintyWithRepresentativeSampling(Query):
     """
     Docs here
     """
 
-    def __init__(self, unlabeled_data: np.ndarray, n_instances: int) -> None:
-        """
-        Docs here
-        """
-        self.uncertainty=UncertaintySampling(unlabeled_data,500)
-        self.representative=RepresentativeSampling(unlabeled_data, n_instances)
-
-        super().__init__(unlabeled_data, n_instances)
-
-    def __call__(self,learner,X_pool, *args, **kwargs):
-
-        self.unlabeled_data=X_pool
-
-        if learner is None:
-            raise ValueError("Learner param is missing")
-        if self.unlabeled_data is None:
-            raise ValueError("unlabeled_data param is missing")
-        if self.n_instances is None:
-            raise ValueError("n_instances param is missing")
-
-        indices, instancias = self.uncertainty.__call__(
-            learner,
-            self.unlabeled_data,
-            self.uncertainty.n_instances,
- ) - - query_idx, data = self.representative.__call__( - learner, instancias, self.representative.n_instances + def __init__(self, n_instances: int) -> None: + self.uncertainty_sampling = UncertaintySampling(n_instances=500) + self.representative_sampling = RepresentativeSampling(n_instances=n_instances) + super().__init__(n_instances) + + def __call__(self, classifier, X): + + # Get query idx from Uncertainty Sampling + _, instances = self.uncertainty_sampling.__call__(classifier, X) + + # Use these previous instances in Representative Sampling + query_idx, instances = self.representative_sampling.__call__( + classifier, instances ) - return indices[query_idx], data -class Highest_Entropy__Uncertainty_Sampling(Query): + return query_idx, instances + + +class HighestEntropyUncertaintySampling(Query): """ Docs here """ - def __init__(self, unlabeled_data: np.ndarray, n_instances: int) -> None: - """ - Docs here - """ - - self.highest_entropy_clustering=Highest_Entropy__Clustering_Sampling(unlabeled_data,100) - self.uncertainty=UncertaintySampling(unlabeled_data,n_instances) - - super().__init__(unlabeled_data, n_instances) - - def __call__(self,learner,X_pool, *args, **kwargs): - - - if learner is None: - raise ValueError("Learner param is missing") - if self.unlabeled_data is None: - raise ValueError("unlabeled_data param is missing") - if self.n_instances is None: - raise ValueError("n_instances param is missing") - - indices, instancias = self.highest_entropy_clustering.__call__( - learner,self.unlabeled_data, + def __init__(self, n_instances: int) -> None: + self.highest_entropy_clustering_sampling = HighestEntropyClusteringSampling( + n_instances=100 ) - - print(indices.shape) - query_idx, data = self.uncertainty.__call__( - learner, instancias, self.n_instances - ) - - print(query_idx.shape) - return indices[query_idx], data - + self.uncertainty_sampling = UncertaintySampling(n_instances) + super().__init__(n_instances) + def __call__(self, classifier, X): + # Use highest entropy clustering first + ( + entropy_clustering_indices, + instances, + ) = self.highest_entropy_clustering_sampling.__call__(classifier, X) + # Get the most uncertain ones + query_idx, instances = self.uncertainty_sampling.__call__(classifier, instances) + return (entropy_clustering_indices[query_idx], instances) # def Uncertainty_With_ModelOutliers_sampling(**kwargs): diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..8374642d --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +modAL==0.4.1 \ No newline at end of file