-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutilities.R
More file actions
73 lines (56 loc) · 1.84 KB
/
utilities.R
File metadata and controls
73 lines (56 loc) · 1.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
'
Script : util
Created : February, 2015
Author(s) : iHub Research
Version : v2.0
License : Apache License, Version 2.0
'
# Build a term-document matrix from a collection of texts.
#
# Args:
#   text.data: texts to index; passed unchanged to VectorSource().
#   stopwords: value forwarded as the `stopwords` control option
#     (default 'english').
#   removePunctuation: value forwarded as the `removePunctuation`
#     control option (default TRUE).
#   tolower: value forwarded as the `tolower` control option (default TRUE).
#
# Returns:
#   Whatever TermDocumentMatrix() produces for the corpus built from
#   `text.data` with the control options above.
CreateTdm <- function(text.data, stopwords='english',
                      removePunctuation=TRUE, tolower=TRUE) {
  # wrap the raw texts in a corpus
  docs <- Corpus(VectorSource(text.data))
  # gather the processing options into a single control list
  tdm.options <- list(removePunctuation=removePunctuation,
                      stopwords=stopwords, tolower=tolower)
  # index the corpus; the matrix is the function's value
  TermDocumentMatrix(docs, control=tdm.options)
}
# Remove exact and near-duplicate texts.
#
# Exact duplicates are dropped first. Afterwards a text is dropped when its
# Levenshtein distance to ANY earlier text is non-zero and at most
# `max.length` minus its own length once RemoveLink() has stripped the link.
# The earliest text of every group of similar texts is the one that survives.
# Missing values (NA) cannot be compared, so they are never treated as
# near-duplicates and are kept.
#
# Args:
#   text.data: texts to filter; coerced with as.vector().
#   max.length: length budget the per-text distance threshold is derived
#     from (default 140, the value that used to be hard-coded).
#
# Returns:
#   The remaining texts, in their original relative order.
FilterDuplicates <- function(text.data, max.length=140) {
  # filter exact duplicates
  text.data <- unique(as.vector(text.data))
  n.texts <- length(text.data)
  # nothing to compare against with fewer than two texts
  if (n.texts < 2) {
    return(text.data)
  }

  # per-text distance threshold, computed once instead of once per pair
  max.dist <- max.length - nchar(RemoveLink(text.data))

  # flag every text that is too close to one of its predecessors
  is.near.duplicate <- logical(n.texts)
  for (i in seq_len(n.texts)[-1]) {
    # levenshtein distance from text i to all earlier texts in one call
    lv.dist <- stringdist(text.data[i], text.data[seq_len(i - 1)],
                          method='lv')
    # na.rm = TRUE: an NA distance or threshold means "not comparable",
    # which must not abort the whole run
    is.near.duplicate[i] <- any(lv.dist <= max.dist[i] & lv.dist != 0,
                                na.rm=TRUE)
  }

  text.data[!is.near.duplicate]
}
# Derive a lower correlation limit from the vocabulary richness of a text.
#
# The text is split with MC_tokenizer(); the limit is the share of distinct
# tokens among all tokens, scaled by `factor`.
#
# Args:
#   text.data: text handed to MC_tokenizer().
#   factor: multiplier applied to the distinct/total token ratio
#     (default 1.0).
#
# Returns:
#   length(unique(tokens)) / length(tokens) * factor. When the tokenizer
#   yields no tokens this is 0 / 0, i.e. NaN.
GetLowerCorrLimit <- function(text.data, factor=1.0) {
  words <- MC_tokenizer(text.data)
  # distinct tokens relative to all tokens
  distinct.share <- length(unique(words)) / length(words)
  distinct.share * factor
}
# Cut a link, and everything after it, out of each text.
#
# Args:
#   text: character vector to clean.
#
# Returns:
#   `text` with everything from the first occurrence of "http" to the end
#   of each element removed; elements without "http" come back unchanged.
RemoveLink <- function(text) {
  gsub(pattern="http.*", replacement="", x=text)
}