From 1187b35ac7beaa9a73bde978fa096bd6ea417be3 Mon Sep 17 00:00:00 2001 From: Dillon Date: Mon, 26 Feb 2018 20:53:06 -0500 Subject: [PATCH 1/5] change string to lowercase Do this because all stop words are lower case and this will then also eliminate capitalized words. --- util/Stopwords.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/Stopwords.java b/util/Stopwords.java index 4c21e22..639cf77 100644 --- a/util/Stopwords.java +++ b/util/Stopwords.java @@ -16,7 +16,7 @@ public class Stopwords { public static boolean isStopword(String word) { if(word.length() < 2) return true; if(word.charAt(0) >= '0' && word.charAt(0) <= '9') return true; //remove numbers, "25th", etc - if(stopWordSet.contains(word)) return true; + if(stopWordSet.contains(word.toLowerCase())) return true; else return false; } From b5c407e3eb99b96dbe2153db4460ff8a963d4714 Mon Sep 17 00:00:00 2001 From: Dillon Date: Tue, 27 Feb 2018 09:39:40 -0500 Subject: [PATCH 2/5] update readme to show scope --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..660615a --- /dev/null +++ b/README.md @@ -0,0 +1,6 @@ +# Twitter-L-LDA +A set of tools for performing Labeled Latent Dirichlet Allocation on textual datasets, with an emphasis on Twitter profiles. Contains tools for analysing the results of model training and inference. 
+ +## My Use/Scope +- I used this code as a guide to perform stop-word removal and stemming in a TF-IDF algorithm for web blog software +- Created some pull requests for bugs found in the original repository From 74029179b5d2b60cefa1c80ba735e147b8443920 Mon Sep 17 00:00:00 2001 From: Dillon Date: Tue, 27 Feb 2018 11:12:41 -0500 Subject: [PATCH 3/5] fix code to handle capitalization --- util/Stopwords.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/util/Stopwords.java b/util/Stopwords.java index 639cf77..2061837 100644 --- a/util/Stopwords.java +++ b/util/Stopwords.java @@ -23,11 +23,11 @@ public static boolean isStopword(String word) { public static boolean isStemmedStopword(String word) { if(word.length() < 2) return true; if(word.charAt(0) >= '0' && word.charAt(0) <= '9') return true; //remove numbers, "25th", etc - String stemmed = stemString(word); - if(stopWordSet.contains(stemmed)) return true; - if(stemmedStopWordSet.contains(stemmed)) return true; - if(stopWordSet.contains(word)) return true; - if(stemmedStopWordSet.contains(word)) return true; + String stemmed = stemString(word.toLowerCase()); + if(stopWordSet.contains(stemmed.toLowerCase())) return true; + if(stemmedStopWordSet.contains(stemmed.toLowerCase())) return true; + if(stopWordSet.contains(word.toLowerCase())) return true; + if(stemmedStopWordSet.contains(word.toLowerCase())) return true; else return false; } @@ -36,7 +36,7 @@ public static String removeStopWords(String string) { String[] words = string.split("\\s+"); for(String word : words) { if(word.isEmpty()) continue; - if(isStopword(string)) continue; //remove stopwords + if(isStopword(word)) continue; //remove stopwords result += (word+" "); } return result; From 9f2d4f0be5abb3bdf257d2afa0e33efad7fe956a Mon Sep 17 00:00:00 2001 From: Dillon Date: Wed, 28 Feb 2018 09:33:41 -0500 Subject: [PATCH 4/5] Update Stopwords.java --- util/Stopwords.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/util/Stopwords.java b/util/Stopwords.java index 2061837..1ccbc1c 100644 --- a/util/Stopwords.java +++ b/util/Stopwords.java @@ -47,7 +47,7 @@ public static String removeStemmedStopWords(String string) { String[] words = string.split("\\s+"); for(String word : words) { if(word.isEmpty()) continue; - if(isStemmedStopword(word)) continue; + if(isStemmedStopword(word.toLowerCase())) continue; if(word.charAt(0) >= '0' && word.charAt(0) <= '9') continue; //remove numbers, "25th", etc result += (word+" "); } From fbd108695a7421fe6da3702ea683eb18db45cc20 Mon Sep 17 00:00:00 2001 From: Dillon Date: Wed, 28 Feb 2018 09:34:29 -0500 Subject: [PATCH 5/5] remove change --- util/Stopwords.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/Stopwords.java b/util/Stopwords.java index 1ccbc1c..2061837 100644 --- a/util/Stopwords.java +++ b/util/Stopwords.java @@ -47,7 +47,7 @@ public static String removeStemmedStopWords(String string) { String[] words = string.split("\\s+"); for(String word : words) { if(word.isEmpty()) continue; - if(isStemmedStopword(word.toLowerCase())) continue; + if(isStemmedStopword(word)) continue; if(word.charAt(0) >= '0' && word.charAt(0) <= '9') continue; //remove numbers, "25th", etc result += (word+" "); }