-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathData607FinalProject.Rmd
More file actions
266 lines (211 loc) · 8.45 KB
/
Data607FinalProject.Rmd
File metadata and controls
266 lines (211 loc) · 8.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
---
title: "Data 607 Final Project"
author: "Ann Liu-Ferrara"
date: "May 14, 2017"
output: html_document
---
```{r setup, include=FALSE}
# Default all later chunks to echoing their code in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
```
## Yelp Doctor Online Review Sentiment Analysis
Getting 1,000 doctor listings in the NYC area using the Yelp Fusion API, web scraping their reviews from multiple web pages, calculating doctor sentiment scores with the formula sum(positive reviews) - sum(negative reviews), and exploring the relationship between reviews and ratings.
```{r echo = TRUE}
# --- Packages ----------------------------------------------------------
# NOTE(review): the original loaded httr twice; the duplicate call was
# removed (reloading is a harmless no-op, but it clutters the setup).
library(httr)
library(dplyr)
library(rvest)
library(devtools)

# Workaround for SSL certificate-verification failures against Yelp at
# the time of writing.  Disabling peer verification weakens transport
# security; re-enable once the certificate chain validates.
httr::set_config(config(ssl_verifypeer = 0L))

# devtools::install_github("jennybc/ryelp", force = TRUE)
library(yelpr)
library(jsonlite)
library(purrr)
library(plyr)  # loaded AFTER dplyr on purpose: plyr masks some dplyr verbs

# Credential.R is expected to set the YELP_ID / YELP_SECRET environment
# variables read below; keep it out of version control.
source('Credential.R')

# --- Yelp Fusion OAuth 2.0 (client-credentials flow) -------------------
# NOTE(review): this targets the 2017-era Fusion token endpoint; Yelp has
# since moved to plain API keys, so this flow may need updating.
yelp <- "https://api.yelp.com"
surl <- "https://api.yelp.com/oauth2/token"
yelp_app <- oauth_app("yelp",
                      key = Sys.getenv("YELP_ID"),
                      secret = Sys.getenv("YELP_SECRET"))
yelp_endpoint <- oauth_endpoint(NULL,
                                authorize = surl,
                                access = surl)
token <- oauth2.0_token(
  yelp_endpoint,
  yelp_app,
  user_params = list(grant_type = "client_credentials"),
  use_oob = TRUE
)
# --- Search query: up to 1000 "Doctor" businesses in New York ----------
term <- "Doctor"
location <- "New York"
limit <- 50                # Yelp's maximum page size
offset <- seq(0, 950, 50)  # 20 pages x 50 results = 1000 listings

# Collect one data frame per page and bind once at the end; growing a
# data frame with rbind() inside the loop is O(n^2) in copies.
pages <- vector("list", length(offset))
for (i in seq_along(offset)) {
  url <- modify_url(
    yelp,
    path = c("v3", "businesses", "search"),
    query = list(
      term = term,
      location = location,
      limit = limit,
      offset = offset[i]
    )
  )
  locationdata <- GET(url, config(token = token))
  listMembersContent <- content(locationdata)
  # Round-trip through JSON to simplify the nested response structure.
  listMembers <- jsonlite::fromJSON(toJSON(listMembersContent))
  # Some pages come back malformed; skip them rather than abort the run.
  yelpResults <- tryCatch(
    data.frame(listMembers),
    error = function(e) NULL
  )
  if (!is.null(yelpResults)) {
    # NOTE(review): field is spelled "coordinate" here (not "coordinates");
    # presumably this matches the flattened column names — verify against
    # an actual API response.
    pages[[i]] <- data.frame(
      'id' = unlist(yelpResults$businesses.id),
      'name' = unlist(yelpResults$businesses.name),
      'city' = unlist(yelpResults$businesses.location$city),
      'state' = unlist(yelpResults$businesses.location$state),
      'zip_code' = unlist(yelpResults$businesses.location$zip_code),
      'country' = unlist(yelpResults$businesses.location$country),
      'rating' = unlist(yelpResults$businesses.rating),
      'latitude' = unlist(yelpResults$businesses.coordinate$latitude),
      'longitude' = unlist(yelpResults$businesses.coordinate$longitude),
      'url' = unlist(yelpResults$businesses.url),
      'review_count' = unlist(yelpResults$businesses.review_count),
      'phone' = unlist(yelpResults$businesses.phone)
    )
  }
}
set1 <- do.call(rbind, pages)
# Preserve the original behaviour of an empty data frame when every page
# failed (do.call(rbind, ...) over all-NULL yields NULL).
if (is.null(set1)) set1 <- data.frame()
# Keep one row per doctor name (the API can return duplicates across pages).
set1 <- set1 %>% distinct(name, .keep_all = TRUE)
save(set1, file = "data.Rda")
# NOTE(review): the original called attach("data.Rda"); attach() puts the
# saved objects on the search path and is a well-known footgun.  load()
# restores set1 into the global environment directly.  It is redundant
# immediately after save(), but lets the rest of the script be re-run
# from the cached file without repeating the API calls.
load("data.Rda")
# Opinion lexicons pulled from Hu and Liu-University of Illinois @ Chicago. See References
pos <- scan('positive-words.txt', what='character', comment.char=';')
neg <- scan('negative-words.txt', what='character', comment.char=';')
library(plyr)    # already attached above; kept so this section is self-contained
library(stringr)

# score.sentiment(): net sentiment for a vector of review texts.
#
#   sentences  - character vector of review texts
#   good_text  - character vector of positive opinion words
#   bad_text   - character vector of negative opinion words
#   .progress  - plyr progress-bar style ('none', 'text', ...)
#   id         - business id attached to the result.  The default lazily
#                reproduces the ORIGINAL behaviour, which reached into the
#                global `set1` and the caller's loop index `i` — a fragile
#                hidden dependency.  Callers should pass id explicitly.
#
# Returns a one-row data frame:
#   score = sum(positive-word hits) - sum(negative-word hits) over ALL
#   sentences combined, plus the id.
score.sentiment = function(sentences, good_text, bad_text, .progress = 'none',
                           id = unique(set1$id[i]))
{
  # laply maps over the vector of sentences and returns a numeric vector
  # of per-sentence scores.
  scores = laply(sentences, function(sentence, good_text, bad_text) {
    # Strip punctuation, control characters and digits, then lowercase,
    # so tokens align with the (lowercase, unpunctuated) lexicons.
    sentence = gsub('[[:punct:]]', '', sentence)
    sentence = gsub('[[:cntrl:]]', '', sentence)
    sentence = gsub('\\d+', '', sentence)
    # to remove emojis:
    # sentence <- iconv(sentence, 'UTF-8', 'ASCII')
    sentence = tolower(sentence)
    # Tokenise on whitespace; unlist collapses str_split's list result.
    words = unlist(str_split(sentence, '\\s+'))
    # match() gives the lexicon position or NA; collapse to TRUE/FALSE so
    # sum() counts hits (each word occurrence counts separately).
    pos.matches = !is.na(match(words, good_text))
    neg.matches = !is.na(match(words, bad_text))
    sum(pos.matches) - sum(neg.matches)
  }, good_text, bad_text, .progress = .progress)
  # Collapse to a single total score for this business.
  scores.df = data.frame(score = sum(scores), id = id)
  return(scores.df)
}
# The scraping below takes hours, so it was run overnight once and the
# result cached in scores.Rda; the loop is kept (commented) for
# reproducibility.
# scores.df <- data.frame()
# for(i in 1:nrow(set1)){
#   n <- set1$review_count[i]
#   # one review-page URL per 20 reviews
#   surl <- sapply(seq(0, n, 20), function(x) paste0('https://www.yelp.com/biz/', set1$id[i], '?start=', x))
#   for(each in surl) {
#     # print(paste0("surl: ", each))
#     htmlF <- read_html(each, simplifyVector = TRUE)
#     reviews <- htmlF %>%
#       html_nodes('.review-content p') %>%
#       html_text()
#
#     # One row (score, id) appended per page of reviews; pages for the
#     # same id are summed later with ddply.
#     scores.df <- rbind(scores.df, score.sentiment(reviews, pos, neg, .progress='text'))
#   }
# }
#
# save(scores.df, file = "scores.Rda")

# NOTE(review): the original used attach("scores.Rda"); load() restores
# scores.df into the global environment directly and avoids search-path
# surprises.
load("scores.Rda")
# Total sentiment per business: the scraper produced one (score, id) row
# per review page, so collapse the partial scores into one score per id.
df <- aggregate(score ~ id, data = scores.df, FUN = sum)
# Attach the business metadata (name, rating, review_count, ...) by the
# shared id column.
df <- merge(df, set1)
save(df, file = "df.Rda")
library(ggplot2)

# Helper: density-overlaid histogram for one numeric vector.  The three
# histograms below were copy-pasted in the original; factoring them out
# keeps the bin/colour choices consistent.
plot_density_hist <- function(x, title, xlab) {
  ggplot(data.frame(x = x), aes(x = x)) +
    geom_histogram(aes(y = ..density..), col = "red", alpha = .2) +
    geom_density(col = 1) +
    labs(title = title, x = xlab, y = "")
}

# histogram for review counts
plot_density_hist(df$review_count, "Histogram for Review Count", "Review Count")
# histogram for score
plot_density_hist(df$score, "Histogram for Score", "Score")
# histogram for rating
plot_density_hist(df$rating, "Histogram for Rating", "rating")

# rating freq
table(df$rating)
# top 6 freq of review counts
head(table(df$review_count, df$rating))

# Helper: rating (as a factor) vs. a numeric outcome, boxplot with
# jittered points; shared by the two rating comparisons below.
# The outcome is copied into the plotting data frame so aes() resolves it
# regardless of ggplot2 version.
plot_rating_box <- function(y, title, ylab) {
  dd <- df
  dd$.y <- y
  ggplot(dd, aes(x = factor(rating), y = .y)) +
    geom_boxplot(outlier.shape = NA) +
    geom_jitter(position = position_jitter(height = 0, width = 0.25),
                shape = 1, alpha = 0.4, color = "blue") +
    labs(title = title, x = "Rating", y = ylab) +
    geom_hline(yintercept = 0, size = 1, color = "darkgreen") +
    # NOTE(review): BDbasics is not on CRAN; this errors unless the
    # package is installed locally — confirm or replace with theme_bw().
    BDbasics::theme_bd()
}

# rating vs review counts
plot_rating_box(df$review_count, "Doctor Rating vs. Review Counts", "Review Counts")
# rating vs score
plot_rating_box(df$score, "Doctor Rating vs. Score", "Score")
```
# Findings:
1. Score has a long tail; the most frequent scores are around 0.
2. The majority of doctors have fewer than 20 reviews.
3. Rating vs. review count: 1-star and 5-star ratings are associated with the fewest reviews.
4. Rating and score are nearly perfectly correlated for ratings below 5 stars.
# Recommendation:
To use Yelp doctor reviews wisely, do not look only at how many stars a doctor has; also check the number of reviews and the review details.
# Challenges:
1. The Yelp Fusion API's OAuth 2.0 has little documentation and support, and using the API v2 documentation caused confusion. Postman was downloaded to obtain API keys.
2. Multiple review pages per doctor were retrieved via web scraping; the reviews were summed across pages for each doctor.
3. Using the Shiny ggvis package to create an interactive app: it is more challenging to combine it with ggplot to create reactive charts.
# Reference:
https://github.com/jennybc/yelpr
https://github.com/Yelp/yelp-fusion/issues/59
http://amunategui.github.io/yelp-cross-country-trip/
http://journal.r-project.org/archive/2013-1/kahle-wickham.pdf
https://www.r-bloggers.com/how-to-use-r-to-scrape-tweets-super-tuesday-2016/
A list 6800 of English positive and negative opinion words - (Hu and Liu, KDD-2004), University of Illinois @ Chicago
https://shiny.rstudio.com/gallery/movie-explorer.html
https://gist.github.com/mylesmharrison/8886272