This repository was archived by the owner on Aug 29, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcontroller.py
More file actions
executable file
·242 lines (179 loc) · 14.1 KB
/
controller.py
File metadata and controls
executable file
·242 lines (179 loc) · 14.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
# coding: utf-8
'''
Codded By :
█ █░ ██▓ ██▓ ▓█████▄ ▒█████ ███▄ █ ██▓ ▒█████ ███▄ █
▓█░ █ ░█░▓██▒▓██▒ ▒██▀ ██▌▒██▒ ██▒ ██ ▀█ █ ▓██▒▒██▒ ██▒ ██ ▀█ █
▒█░ █ ░█ ▒██▒▒██░ ░██ █▌▒██░ ██▒▓██ ▀█ ██▒▒██▒▒██░ ██▒▓██ ▀█ ██▒
░█░ █ ░█ ░██░▒██░ ░▓█▄ ▌▒██ ██░▓██▒ ▐▌██▒░██░▒██ ██░▓██▒ ▐▌██▒
░░██▒██▓ ░██░░██████▒░▒████▓ ░ ████▓▒░▒██░ ▓██░░██░░ ████▓▒░▒██░ ▓██
'''
from pathlib import Path
import typer
import numpy as np
import pandas as pd
import os
import time
import sys
from piper.loader import ClusteringDatasetLoader, ClassificationDatasetLoader
from infra.position_clustering.model import trainer as position_clustering_trainer
from infra.position_clustering.cluster import labels
from infra.position_classification.model import trainer as position_classification_trainer
from infra.position_classification.classifier import predictor
# Root CLI application; sub-commands are registered via @app.command() below.
app = typer.Typer(help="【 uniXerr CLI controller 】")
# TODO : send a csv file for input data prediction (save it in server/dataset folder) and the type of labeled data for loading/training classifier model from uPC telegram bot
# Which labeled-dataset flavor to train/classify against; used to build file names below.
data_type = "raw"
# Default path of the labeled pc_features dataset used by `classify-positions --csv-path`.
labeled_csv_path = os.path.dirname(os.path.abspath(__file__)) + f'/server/dataset/pc_features_labeled-{data_type}.csv'
# Default path of the raw input csv to classify with the pre-trained model.
csv_input_data_for_classification = os.path.dirname(os.path.abspath(__file__))+'/server/dataset/input_data.csv'
@app.command()
def cluster_positions(
        generate_fake_samples: bool = typer.Option(False, "--generate-fake-samples", help="Generating fake samples for training."),
        epoch: int = typer.Option(3, help="Number of epoch for training VAE.", min=3, max=40),
        batch_size: int = typer.Option(8, help="Number of batch size for training VAE.", min=4),
        device: str = typer.Option('cpu', help="Training device. cpu or cuda"),
        num_workers: int = typer.Option(4, help="Number of workers for pytorch dataloader object.", min=4),
        latent_dim: int = typer.Option(2, help="Dimension of VAE latent space.", min=2, max=10),
        ddo: bool = typer.Option(False, "--ddo", help="Force deletion with confirmation for dataloader object."),
        dpm: bool = typer.Option(False, "--dpm", help="Force deletion with confirmation for pre-trained VAE model."),
        cluster_on_latent: bool = typer.Option(True, "--cluster-on-raw-data", help="Clustering on pc_features dataset, default is set to VAE latent space"),
        cluster_method: str = typer.Option('kmeans', help="Clustering method. kmeans or hdbscan; hdbscan is not suitable for latent space of VAE and has some drawbacks for new dataset."),
        plot_method: str = typer.Option('pca', help="Plotting method for data. pca or tsne; if you want plot data before clustering on different methods just remove the pc_dataloader.pth with --ddo option.")
):
    """Train (or load) the VAE, report its internals, then cluster positions.

    Pipeline: build/load the clustering dataloader, train/load the VAE,
    print model/optimizer/checkpoint info, sanity-check reconstruction on
    sample 0, then run the chosen clustering method either on the VAE
    latent space (default) or on the raw pc_features dataset.
    Exits with status 1 when ``device`` is neither 'cpu' nor 'cuda'.
    """
    if device not in ('cpu', 'cuda'):
        typer.secho("Please specify a correct device.", fg=typer.colors.RED, bold=True)
        sys.exit(1)
    base_dir = os.path.dirname(os.path.abspath(__file__))  # hoisted: used for every artifact path
    if ddo:
        if typer.confirm("Are you sure you want to delete dataloader object?"):
            typer.secho("\t➢ Deleting dataloader object!\n", fg=typer.colors.YELLOW, bold=True)
            try:
                os.remove(base_dir + '/server/dataset/pc_dataloader.pth')
            except OSError:
                # narrowed from a bare except: only filesystem errors are expected here
                typer.secho("\t➢ Error while deleting the file\n", fg=typer.colors.RED, bold=True)
    if dpm:
        if typer.confirm("Are you sure you want to delete pre-trained VAE model?"):
            typer.secho("\t➢ Deleting pre-trained VAE model!\n", fg=typer.colors.YELLOW, bold=True)
            try:
                os.remove(base_dir + '/core/position_clustering/utils/pc_model_vae.pth')
            except OSError:
                typer.secho("\t➢ Error while deleting the file\n", fg=typer.colors.RED, bold=True)
    # BUG FIX: original used `device is 'cuda'` — string identity is implementation
    # dependent and may be False for equal strings; equality is the correct test.
    dataloader_kwargs = {'num_workers': num_workers, 'pin_memory': True} if device == 'cuda' else {}
    dataloader = ClusteringDatasetLoader(
        batch_size=batch_size,
        generate_fake_samples=generate_fake_samples,
        plotting_kwargs=plot_method,
        dataloader_kwargs=dataloader_kwargs
    )  # build a dataloader object if there is no one, otherwise it'll load the saved object
    # train vae model if there is no pre-trained one, otherwise it'll load the saved model
    pc_model = position_clustering_trainer(data=dataloader(), device=device, latent_dim=latent_dim, epoch=epoch)
    latent = pc_model(data=dataloader().dataset.data)  # get the latent space of dataset
    typer.secho("\n________VAE model state dict________\n", fg=typer.colors.MAGENTA, bold=True)
    for param_tensor in pc_model.vae_model.state_dict():
        print("\t➢ ", param_tensor, "\t\t", pc_model.vae_model.state_dict()[param_tensor].size())
    typer.secho(f"\n________the model________\n", fg=typer.colors.MAGENTA, bold=True)
    typer.secho(f"{pc_model.vae_model}", fg=typer.colors.RESET, bold=True)
    typer.secho(f"\n________the optimizer________\n", fg=typer.colors.MAGENTA, bold=True)
    typer.secho(f"{pc_model.optimizer}", fg=typer.colors.RESET, bold=True)
    typer.secho("\n________VAE model last epoch saved________\n", fg=typer.colors.MAGENTA, bold=True)
    typer.secho(f"\t➢ current check point epoch : {pc_model.epoch}", fg=typer.colors.RESET, bold=True)
    typer.secho("\n________VAE model last training loss saved________\n", fg=typer.colors.MAGENTA, bold=True)
    typer.secho("\t➢ current check point loss : {:.6f}".format(pc_model.loss), fg=typer.colors.RESET, bold=True)
    pc_model.plot_loss()  # plot training loss
    # Round-trip sample 0 through the VAE as a quick reconstruction sanity check.
    typer.secho("\n________testing VAE model________\n", fg=typer.colors.MAGENTA, bold=True)
    sample_zero = dataloader().dataset.data[0]
    sample_zero_latent = pc_model(sample_zero)
    sample_zero_recons_decode_m = pc_model.decode(sample_zero_latent).data.numpy()
    sample_zero_recons_recons_m, mu, log_variance = pc_model.recons(sample_zero)
    typer.secho(f"\t➢ sample 0 of dataset : {sample_zero}", fg=typer.colors.RESET, bold=True)
    typer.secho(f"\t➢ getting the latent space of sample 0 : {sample_zero_latent}", fg=typer.colors.RESET, bold=True)
    typer.secho(f"\t➢ reconstructing the sample 0 from latent space using decode method : {sample_zero_recons_decode_m}", fg=typer.colors.RESET, bold=True)
    typer.secho(f"\t➢ reconstructing the sample 0 from latent space using recons method : {sample_zero_recons_recons_m.data.numpy()}", fg=typer.colors.RESET, bold=True)
    typer.secho(f"\t➢ mu : {mu.data.numpy()}", fg=typer.colors.RESET, bold=True)  # mu is equals to the latent space cause we are not in training mode, in this case reparam method return mu
    typer.secho(f"\t➢ log variance : {log_variance.data.numpy()}", fg=typer.colors.RESET, bold=True)
    # The flag cannot change between the two original `if`s, so they collapse to if/else.
    if cluster_on_latent:
        typer.secho("\n________Clustering on latent space of VAE model________\n", fg=typer.colors.MAGENTA, bold=True)
        cluster_ = labels(data=latent, data_type='latent', cluster_method=cluster_method)
        typer.secho("\n________latent space of VAE information________\n", fg=typer.colors.MAGENTA, bold=True)
    else:
        typer.secho("\n________Clustering on pc_features raw dataset________\n", fg=typer.colors.MAGENTA, bold=True)
        cluster_ = labels(data=dataloader().dataset.get_raw(), data_type='raw', cluster_method=cluster_method)
        typer.secho("\n________pc_features raw data information during clustering________\n", fg=typer.colors.MAGENTA, bold=True)
    typer.secho(f"{cluster_.dataset_info()}\n", fg=typer.colors.RESET, bold=True)  # dataset information during clustering
    cluster_.set()  # export a csv of dataset with their labels
    cluster_.plot(method=plot_method)  # plot the clustered data
    cluster_sample_label = cluster_[0]  # get the cluster number for 0th sample of the dataset
    typer.secho("\n________credit information________\n", fg=typer.colors.MAGENTA, bold=True)
    typer.secho(f"\t➢ position for 0th sample of dataset is : {cluster_.get_position(cluster=cluster_sample_label)}\n", fg=typer.colors.RESET, bold=True)
@app.command()
def classify_positions(csv_path: Path = typer.Option(labeled_csv_path, help="Path to labeled pc_features csv dataset.",
                       exists=True, file_okay=True, dir_okay=False, writable=False, readable=True, resolve_path=True),
                       input_data_csv_path: Path = typer.Option(csv_input_data_for_classification, help="Path to input data csv for classification.",
                       exists=True, file_okay=True, dir_okay=False, writable=False, readable=True, resolve_path=True),
                       ddo: bool = typer.Option(False, "--ddo", help="Force deletion with confirmation for dataloader objects."),
                       dpm: bool = typer.Option(False, "--dpm", help="Force deletion with confirmation for pre-trained classifier model."),
                       epoch: int = typer.Option(200, help="Number of epoch for training classifier.", min=100, max=300),
                       batch_size: int = typer.Option(64, help="Number of batch size for training classifier.", min=16, max=256),
                       device: str = typer.Option('cpu', help="Training device. cpu or cuda"),
                       num_workers: int = typer.Option(4, help="Number of workers for pytorch dataloader object.", min=4),
):
    """Train (or load) the position classifier and classify an input csv.

    Pipeline: build/load training+testing dataloaders from the labeled csv,
    train/load the classifier, print model/optimizer/checkpoint info, then
    run the pre-trained predictor over ``input_data_csv_path``.
    Exits with status 1 when ``device`` is neither 'cpu' nor 'cuda'.
    """
    if device not in ('cpu', 'cuda'):
        typer.secho("Please specify a correct device.", fg=typer.colors.RED, bold=True)
        sys.exit(1)
    base_dir = os.path.dirname(os.path.abspath(__file__))  # hoisted: used for every artifact path
    if ddo:
        if typer.confirm("Are you sure you want to delete dataloader objects?"):
            typer.secho("\t➢ Deleting dataloader objects!\n", fg=typer.colors.YELLOW, bold=True)
            try:
                os.remove(base_dir + f'/server/dataset/pc_features_labeled_testing_tensors-{data_type}-DATALOADER.pth')
                os.remove(base_dir + f'/server/dataset/pc_features_labeled_training_tensors-{data_type}-DATALOADER.pth')
            except OSError:
                # narrowed from a bare except: only filesystem errors are expected here
                typer.secho("\t➢ Error while deleting the file\n", fg=typer.colors.RED, bold=True)
    if dpm:
        if typer.confirm("Are you sure you want to delete pre-trained classifier model?"):
            typer.secho("\t➢ Deleting pre-trained classifier model!\n", fg=typer.colors.YELLOW, bold=True)
            try:
                os.remove(base_dir + f'/core/position_classification/utils/pc_model_classifier-{data_type}.pth')
                os.remove(base_dir + f'/core/position_classification/utils/classifier.obj')
            except OSError:
                typer.secho("\t➢ Error while deleting the file\n", fg=typer.colors.RED, bold=True)
    # BUG FIX: original used `device is 'cuda'` — string identity is implementation
    # dependent and may be False for equal strings; equality is the correct test.
    dataloader_kwargs = {'num_workers': num_workers, 'pin_memory': True} if device == 'cuda' else {}
    dataloader = ClassificationDatasetLoader(csv_path=csv_path, batch_size=batch_size, dataloader_kwargs=dataloader_kwargs)  # build a dataloader objects for training and testing data if there is no one, otherwise it'll load the saved objects
    # BUG FIX: `iter(...).next()` is Python 2 syntax and raises AttributeError on
    # Python 3; the builtin next() is the correct call. Also renamed the local
    # `labels` -> `targets` so it no longer shadows the imported clustering `labels`.
    inputs, targets = next(iter(dataloader()[0]))
    # Map each one-hot label index to a letter position: 0 -> 'A', 1 -> 'B', ...
    positions = {label_index: chr(label_index + 65) for label_index in range(targets.size(1))}
    pc_model = position_classification_trainer(device=device, epoch=epoch, data_type=data_type)  # train and test classifier model if there is no pre-trained one
    pc_model(data=dataloader())  # dataloader()[0] is training pipeline and dataloader()[1] is testing pipeline
    pc_model_conf = {
        "features": {"in": inputs.size(1), "out": targets.size(1)},
        "positions": positions,
        "path": base_dir + f'/core/position_classification/utils/pc_model_classifier-{data_type}.pth',
        "device": device,
        "data_type": data_type
    }
    classifier_ = predictor(**pc_model_conf)  # it'll load the saved model and classify input data using the pre-trained one
    typer.secho("\n________classifier model state dict________\n", fg=typer.colors.MAGENTA, bold=True)
    for param_tensor in classifier_.model.state_dict():
        print("\t➢ ", param_tensor, "\t\t", classifier_.model.state_dict()[param_tensor].size())
    typer.secho(f"\n________the model________\n", fg=typer.colors.MAGENTA, bold=True)
    typer.secho(f"{classifier_.model}", fg=typer.colors.RESET, bold=True)
    typer.secho(f"\n________the optimizer________\n", fg=typer.colors.MAGENTA, bold=True)
    typer.secho(f"{classifier_.optimizer}", fg=typer.colors.RESET, bold=True)
    typer.secho("\n________classifier model last epoch saved________\n", fg=typer.colors.MAGENTA, bold=True)
    typer.secho(f"\t➢ current check point epoch : {classifier_.epoch}", fg=typer.colors.RESET, bold=True)
    typer.secho("\n________classifier model last training loss saved________\n", fg=typer.colors.MAGENTA, bold=True)
    typer.secho("\t➢ current check point training loss : {:.6f}".format(classifier_.training_loss), fg=typer.colors.RESET, bold=True)
    typer.secho("\n________classifier model best training accuracy saved________\n", fg=typer.colors.MAGENTA, bold=True)
    typer.secho("\t➢ current check point training best accuracy : {:.6f}".format(classifier_.training_best_accuracy), fg=typer.colors.RESET, bold=True)
    # classify the input data using pre-trained classifier model
    # input data can be either a valid csv_path or a numpyndarray containing students' features
    # NOTE : classification of numpyndarray is only done using the api server through /user/classify/position route
    # NOTE : when we're doing classification on csv file we have to call both /users/add/info and /users/add/positions routes
    # TODO : call this function from uPC (rust script) app to classify your csv data in server/dataset folder sent from telegram app
    # TODO : send classified csv file(s) of input data csv according to the type of data using uPC app to telegram
    classifier_(input_data_csv_path)