# -*- coding: utf-8 -*-
"""
Created on Thu Oct 12 20:17:17 2017
@author: naveen.nathan
"""
# Models in this module:
# 1) word2vec embeddings trained on an input text file
# 2) Phrase detection: combine unigrams into meaningful bigrams, then trigrams
# 3) LDA topic model
# 4) Logistic regression classifier
# 5) Affinity propagation and k-means clustering (with silhouette-based selection of k)
# Purpose: To run word2vec model
# Input: Text file path
# Output: word2vec model object
def run_word2vec_model(text_file):
    from gensim.models.word2vec import LineSentence, Word2Vec
    sentences = LineSentence(text_file)
    # Passing sentences to the constructor builds the vocabulary and trains once
    model = Word2Vec(sentences, sg=1, workers=5, size=100, min_count=2, window=5)
    # model.build_vocab(sentences)
    # Additional training pass over the same corpus
    model.train(sentences, total_examples = model.corpus_count, epochs = 5)
    # model.save("sample_model.w2v.bin")
    return model
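# Illustrative usage: the corpus path "corpus.txt" and the query word "market"
# are hypothetical; LineSentence expects one whitespace-tokenized sentence per line.
# w2v = run_word2vec_model("corpus.txt")
# print(w2v.wv.most_similar("market", topn=5))   # nearest neighbours in embedding space
# print(w2v.wv["market"].shape)                  # (100,) vector for a vocabulary word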
# Purpose: Creates meaningful bigrams and trigrams from tokens
# Input: Tokenized text (Phrases learns phrase statistics from the whole corpus,
#        so a list of tokenized sentences works best)
# Output: List of unigrams, bigrams (unigram_unigram) and trigrams (unigram_unigram_unigram)
def apply_bigram_trigram_model(unigrams):
    from gensim.models.phrases import Phrases, Phraser
    # Learn bigram statistics, then freeze them into a lightweight Phraser
    phrases = Phrases(unigrams)
    bigram = Phraser(phrases)
    # Learn trigrams on top of the detected bigrams
    trigram = Phrases(bigram[unigrams])
    trigram = Phraser(trigram)
    return list(trigram[bigram[unigrams]])
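# Illustrative usage: the sentences below are hypothetical, and Phrases only
# promotes a pair once it occurs often enough (default min_count/threshold),
# so a real corpus is needed before bigrams like "new_york" actually appear.
# sentences = [["new", "york", "is", "big"], ["new", "york", "times"], ["old", "york"]]
# print(apply_bigram_trigram_model(sentences))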
# Purpose: Fits an LDA topic model on the input text (one document per line)
# Input: Input text file path, optional number of topics
# Output: Tuple of (LDA model object, bag-of-words corpus, dictionary)
def run_lda_topic_model(text_file, num_topics = None):
    from gensim.models.ldamodel import LdaModel
    import gensim.corpora.dictionary as dic
    from tokenization import tokenize_treetagger
    from util import remove_stopwords, remove_punctuations, identify_num_lda_topics_with_hdp
    from pandas import Series
    # Read one document per line, then tokenize and clean each document
    with open(text_file, 'r') as f:
        text = f.read()
    text = Series(text.split("\n"))
    text = text.apply(tokenize_treetagger)
    text = text.apply(remove_stopwords)
    text = text.apply(remove_punctuations)
    # Build the dictionary and bag-of-words corpus expected by LdaModel
    dictionary = dic.Dictionary(text)
    corpus = [dictionary.doc2bow(sent) for sent in text]
    if num_topics is None:
        num_topics = identify_num_lda_topics_with_hdp(corpus, dictionary)
    lda = LdaModel(corpus = corpus, id2word = dictionary, passes=5, random_state = 1, num_topics = num_topics)
    return (lda, corpus, dictionary)
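# Illustrative usage: "docs.txt" is a hypothetical file with one document per line.
# lda, bow_corpus, dictionary = run_lda_topic_model("docs.txt", num_topics = 10)
# for topic_id, words in lda.show_topics(num_topics = 10, num_words = 5, formatted = False):
#     print(topic_id, [word for word, _ in words])
# print(lda[bow_corpus[0]])   # topic distribution of the first document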
# Purpose: Creates Logistic Regression classification model
# Input: DataFrame of input columns and output column
# Output: Logistic Regression model object
def build_logistic_regression(df, outcome):
    from sklearn.linear_model import LogisticRegression
    # The L1 penalty needs a solver that supports it (liblinear or saga in recent scikit-learn)
    model = LogisticRegression(penalty = 'l1', solver = 'liblinear')
    model.fit(X = df.drop(outcome, axis=1), y = df[outcome])
    return model
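# Illustrative usage: train_df/test_df and the "label" column are hypothetical;
# test_df must contain the same feature columns (minus the outcome) as training.
# clf = build_logistic_regression(train_df, outcome = "label")
# predictions = clf.predict(test_df)
# positive_probs = clf.predict_proba(test_df)[:, 1]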
# Purpose: Runs affinity propagation for clustering based on a given m x m precomputed matrix
# Input: m x m square matrix (scikit-learn's 'precomputed' affinity expects similarities,
#        so pass e.g. negated distances)
# Output: Affinity propagation model object. aff.labels_ gives cluster labels
def run_aff_prop_with_distances(distances):
    from sklearn.cluster import AffinityPropagation
    aff = AffinityPropagation(max_iter = 1000, affinity = 'precomputed')
    aff.fit(distances)
    return aff
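# Illustrative usage: X is a hypothetical feature matrix. Because the 'precomputed'
# affinity in scikit-learn expects similarities, distances are negated before use.
# from sklearn.metrics import pairwise_distances
# similarities = -pairwise_distances(X, metric = "euclidean")
# aff = run_aff_prop_with_distances(similarities)
# print(aff.labels_)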
# Purpose: Runs kmeans clustering and pickles the fitted model to <outfile_prefix>kmeans_model.pkl
# Input: DataFrame with required variables
# Output: DataFrame with cluster ID in 'cluster' column
def run_kmeans(data, outfile_prefix = ""):
from sklearn.cluster import KMeans
from pickle import dump
from pandas import DataFrame
n_cluster = optimal_k_silhouette(data, [i+2 for i in range(9)])
kmeans = KMeans(init = 'k-means++', n_clusters = n_cluster, n_init = 10)
dump(kmeans, open(outfile_prefix + "kmeans_model.pkl", "wb"))
cluster_labels = kmeans.fit_predict(data)
data = DataFrame(data)
data['cluster'] = cluster_labels
return data
# Purpose: Silhouette criteria for automatic selection of number of clusters
# Input: DataFrame and range of number of clusters
# Output: Int (Optimal number of clusters)
def optimal_k_silhouette(data, range_n_clusters):
    from sklearn.metrics import silhouette_score
    from sklearn.cluster import KMeans
    silhouette_max = -1
    optimum_cluster = range_n_clusters[0]
    for n_clusters in range_n_clusters:
        kmeans = KMeans(n_clusters = n_clusters, random_state = 10, n_init = 10)
        cluster_labels = kmeans.fit_predict(data)
        # Keep the cluster count with the highest average silhouette score
        silhouette_avg = silhouette_score(data, cluster_labels)
        if silhouette_max < silhouette_avg:
            silhouette_max = silhouette_avg
            optimum_cluster = n_clusters
    return optimum_cluster
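# Minimal end-to-end sketch: cluster a small synthetic table with run_kmeans.
# The column names and the "demo_" prefix are arbitrary placeholders.
if __name__ == "__main__":
    from numpy.random import RandomState
    from pandas import DataFrame
    rng = RandomState(1)
    demo = DataFrame(rng.rand(60, 3), columns = ["x1", "x2", "x3"])
    clustered = run_kmeans(demo, outfile_prefix = "demo_")
    print(clustered["cluster"].value_counts())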