diff --git a/README.md b/README.md new file mode 100644 index 0000000..660615a --- /dev/null +++ b/README.md @@ -0,0 +1,6 @@ +# Twitter-L-LDA +A set of tools for performing Labeled Latent Dirichlet Allocation on textual datasets, with an emphasis on Twitter profiles. Contains tools for analysing the results of model training and inference. + +## My Use/Scope +- I used this code as a guide to perform stop-word removal and stemming in a TF-IDF algorithm for a web blog application +- Created some pull requests for bugs found in the original repository diff --git a/util/Stopwords.java b/util/Stopwords.java index 4c21e22..2061837 100644 --- a/util/Stopwords.java +++ b/util/Stopwords.java @@ -16,18 +16,18 @@ public class Stopwords { public static boolean isStopword(String word) { if(word.length() < 2) return true; if(word.charAt(0) >= '0' && word.charAt(0) <= '9') return true; //remove numbers, "25th", etc - if(stopWordSet.contains(word)) return true; + if(stopWordSet.contains(word.toLowerCase())) return true; else return false; } public static boolean isStemmedStopword(String word) { if(word.length() < 2) return true; if(word.charAt(0) >= '0' && word.charAt(0) <= '9') return true; //remove numbers, "25th", etc - String stemmed = stemString(word); - if(stopWordSet.contains(stemmed)) return true; - if(stemmedStopWordSet.contains(stemmed)) return true; - if(stopWordSet.contains(word)) return true; - if(stemmedStopWordSet.contains(word)) return true; + String stemmed = stemString(word.toLowerCase()); + if(stopWordSet.contains(stemmed.toLowerCase())) return true; + if(stemmedStopWordSet.contains(stemmed.toLowerCase())) return true; + if(stopWordSet.contains(word.toLowerCase())) return true; + if(stemmedStopWordSet.contains(word.toLowerCase())) return true; else return false; } @@ -36,7 +36,7 @@ public static String removeStopWords(String string) { String[] words = string.split("\\s+"); for(String word : words) { if(word.isEmpty()) continue; - if(isStopword(string)) continue; //remove 
stopwords + if(isStopword(word)) continue; //remove stopwords result += (word+" "); } return result;