-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSentAnalysis.java
More file actions
198 lines (165 loc) · 6.24 KB
/
SentAnalysis.java
File metadata and controls
198 lines (165 loc) · 6.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
/* Basic Sentiment Analysis
* SentAnalysis.java
* Emma Neary and Peter Mehler
* All group members were present and contributing during all work on this project.
* We did not give or receive unauthorized aid on this assignment.
*/
import java.io.*;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.regex.Pattern;
/**
 * Bag-of-words sentiment classifier.
 *
 * <p>Training counts how often each word occurs in negative and in positive
 * reviews; classification compares smoothed log-scores built from those counts.
 * A review's label is the character that follows the first '-' in its file name:
 * '1' is negative, anything else is treated as positive during training.
 *
 * <p>All state is held in static fields, so every call to {@code train} adds to
 * the same counts. The class is not designed for concurrent use.
 */
public class SentAnalysis {

    final static String TRAINFOLDERNAME = "train";
    final static File TRAINFOLDER = new File(TRAINFOLDERNAME);

    /**
     * Characters that separate words. Shared by training and classification so
     * both tokenize identically. Whitespace ({@code \s}) is included so that a
     * line break or tab never ends up glued to a word.
     */
    private static final Pattern WORD_DELIMITER = Pattern.compile("[\\s)('\"/:;@,!?.-]+");

    /** Rating character that marks a negative review. */
    private static final char NEGATIVE_RATING = '1';
    /** Rating character that marks a positive review in evaluation mode. */
    private static final char POSITIVE_RATING = '5';
    /** Returned by {@link #ratingOf} when a file name carries no rating. */
    private static final char NO_RATING = '\0';

    // Per-class word occurrence counts gathered by train().
    private static final Map<String, Integer> negativeCount = new HashMap<>();
    private static final Map<String, Integer> positiveCount = new HashMap<>();

    // Number of training reviews seen per class (used for the class priors).
    private static int numPositiveReviews = 0;
    private static int numNegativeReviews = 0;

    /**
     * Trains on the files in {@link #TRAINFOLDER}, then either runs evaluation
     * mode (single argument {@code "evaluate"}) or classifies one line read
     * from standard input.
     *
     * @param args command-line arguments
     * @throws IOException if a training or evaluation file cannot be opened
     */
    public static void main(String[] args) throws IOException {
        ArrayList<String> files = readFiles(TRAINFOLDER);
        train(files);

        if (args.length == 1 && args[0].equals("evaluate")) {
            evaluate();
            return;
        }

        // Interactive mode. The scanner wraps System.in, so it is deliberately not closed.
        @SuppressWarnings("resource")
        Scanner scan = new Scanner(System.in);
        System.out.print("Text to classify>> ");
        if (!scan.hasNextLine()) {
            return; // stdin was closed before any text was entered
        }
        String textToClassify = scan.nextLine();
        System.out.println("Result: " + classify(textToClassify));
    }

    /**
     * Lists the names of the regular files directly inside a folder.
     *
     * @param folder the folder to list
     * @return the file names (not paths); empty when {@code folder} is null,
     *         does not exist, is not a directory, or cannot be read
     */
    public static ArrayList<String> readFiles(File folder) {
        System.out.println("Populating list of files");
        ArrayList<String> filelist = new ArrayList<>();

        // listFiles() returns null (not an empty array) for a missing or unreadable folder.
        File[] entries = (folder == null) ? null : folder.listFiles();
        if (entries == null) {
            return filelist;
        }
        for (File entry : entries) {
            // Sub-directories cannot be opened as reviews, so leave them out.
            if (entry.isFile()) {
                filelist.add(entry.getName());
            }
        }
        return filelist;
    }

    /**
     * Trains on the named files inside {@link #TRAINFOLDER}.
     *
     * @param files names of the training files, relative to the training folder
     * @throws FileNotFoundException if one of the files cannot be opened
     */
    public static void train(ArrayList<String> files) throws FileNotFoundException {
        train(TRAINFOLDER, files);
    }

    /**
     * Trains on the named files inside an arbitrary folder by adding each
     * file's lower-cased words to the count map of the file's class.
     * Files whose name carries no rating are skipped.
     *
     * @param folder folder that contains the files
     * @param files  names of the training files, relative to {@code folder}
     * @throws FileNotFoundException if one of the files cannot be opened
     */
    static void train(File folder, List<String> files) throws FileNotFoundException {
        for (String filename : files) {
            char rating = ratingOf(filename);
            if (rating == NO_RATING) {
                continue; // unlabeled file (e.g. a stray hidden file): nothing to learn from
            }

            Map<String, Integer> counts;
            if (rating == NEGATIVE_RATING) {
                numNegativeReviews++;
                counts = negativeCount;
            } else {
                numPositiveReviews++;
                counts = positiveCount;
            }

            try (Scanner scan = new Scanner(new File(folder, filename))) {
                scan.useDelimiter(WORD_DELIMITER);
                while (scan.hasNext()) {
                    String word = scan.next().toLowerCase(Locale.ROOT);
                    counts.merge(word, 1, Integer::sum);
                }
            }
        }
    }

    /**
     * Classifies a text as positive or negative.
     *
     * @param text the text to classify; {@code null} is treated as empty
     * @return {@code "negative"} when the negative score is strictly higher,
     *         otherwise {@code "positive"} (ties and the untrained state
     *         therefore yield {@code "positive"})
     */
    public static String classify(String text) {
        final double smoothingCoef = 0.07;

        List<String> words = tokenize(text);
        double uniquePosWords = positiveCount.size();
        double uniqueNegWords = negativeCount.size();

        // NOTE(review): the denominators use the number of DISTINCT words per class
        // plus smoothingCoef * (tokens in this text). Textbook add-alpha smoothing
        // would use total token count per class plus alpha * vocabulary size. The
        // original formula is kept because the 0.07 coefficient was presumably
        // tuned against it; confirm before changing.
        double smoothingMass = smoothingCoef * words.size();

        // Sum log-probabilities instead of multiplying probabilities to avoid underflow.
        double posSum = 0;
        double negSum = 0;
        for (String word : words) {
            double probWordPos =
                    (positiveCount.getOrDefault(word, 0) + smoothingCoef) / (uniquePosWords + smoothingMass);
            double probWordNeg =
                    (negativeCount.getOrDefault(word, 0) + smoothingCoef) / (uniqueNegWords + smoothingMass);
            posSum += Math.log(probWordPos);
            negSum += Math.log(probWordNeg);
        }

        // Add the class priors. With no training data they are undefined (0/0), so skip them.
        double totalReviews = (double) numPositiveReviews + numNegativeReviews;
        if (totalReviews > 0) {
            posSum += Math.log(numPositiveReviews / totalReviews);
            negSum += Math.log(numNegativeReviews / totalReviews);
        }

        return (negSum > posSum) ? "negative" : "positive";
    }

    /**
     * Prompts for a folder, classifies every line of every labeled file in it,
     * and prints accuracy plus per-class precision as percentages truncated to
     * two decimals. A metric with a zero denominator is printed as {@code n/a}.
     *
     * @throws FileNotFoundException if one of the files cannot be opened
     */
    public static void evaluate() throws FileNotFoundException {
        // Wraps System.in, so it is deliberately not closed.
        @SuppressWarnings("resource")
        Scanner console = new Scanner(System.in);
        System.out.print("Enter folder name of files to classify: ");
        if (!console.hasNextLine()) {
            return; // stdin was closed before a folder name was entered
        }
        String foldername = console.nextLine();
        File folder = new File(foldername);
        ArrayList<String> filesToClassify = readFiles(folder);

        int totalClassifiedPositive = 0; // reviews classified positive
        int numCorrectPositive = 0;      // positive reviews classified positive
        int totalClassifiedNegative = 0; // reviews classified negative
        int numCorrectNegative = 0;      // negative reviews classified negative

        for (String filename : filesToClassify) {
            char realRating = ratingOf(filename);
            if (realRating == NO_RATING) {
                continue; // no ground truth in the name, so the file cannot be scored
            }

            try (Scanner reviews = new Scanner(new File(folder, filename))) {
                // Each line of a file is classified as its own review.
                while (reviews.hasNext()) {
                    String classified = classify(reviews.nextLine());
                    if ("negative".equals(classified)) {
                        totalClassifiedNegative++;
                        if (realRating == NEGATIVE_RATING) {
                            numCorrectNegative++;
                        }
                    } else if ("positive".equals(classified)) {
                        totalClassifiedPositive++;
                        if (realRating == POSITIVE_RATING) {
                            numCorrectPositive++;
                        }
                    }
                }
            }
        }

        int totalCorrect = numCorrectPositive + numCorrectNegative;
        int totalClassified = totalClassifiedPositive + totalClassifiedNegative;

        System.out.println("Accuracy: " + formatPercent(totalCorrect, totalClassified));
        System.out.println("Precision (Positive): " + formatPercent(numCorrectPositive, totalClassifiedPositive));
        System.out.println("Precision (Negative): " + formatPercent(numCorrectNegative, totalClassifiedNegative));
    }

    /**
     * Formats {@code numerator / denominator} as a percentage truncated (not
     * rounded) to two decimals, e.g. {@code "33.33%"}.
     *
     * @return the formatted percentage, or {@code "n/a"} when the denominator is 0
     */
    static String formatPercent(int numerator, int denominator) {
        if (denominator == 0) {
            return "n/a";
        }
        double percent = ((double) numerator / (double) denominator) * 100;
        return Math.floor(percent * 100) / 100 + "%";
    }

    /**
     * Splits text into lower-cased words using {@link #WORD_DELIMITER}.
     * Empty pieces (produced when the text starts with a delimiter, or is
     * empty) are dropped so the result matches what the training Scanner sees.
     */
    private static List<String> tokenize(String text) {
        List<String> tokens = new ArrayList<>();
        if (text == null) {
            return tokens;
        }
        for (String piece : WORD_DELIMITER.split(text.toLowerCase(Locale.ROOT))) {
            if (!piece.isEmpty()) {
                tokens.add(piece);
            }
        }
        return tokens;
    }

    /**
     * Extracts the rating character that follows the first '-' in a file name.
     *
     * @return the rating character, or {@link #NO_RATING} when the name has no
     *         '-' or nothing after it
     */
    private static char ratingOf(String filename) {
        int dash = filename.indexOf('-');
        if (dash < 0 || dash + 1 >= filename.length()) {
            return NO_RATING;
        }
        return filename.charAt(dash + 1);
    }
}