# MPPSamples/Coding_Week2.R at main · MadeByUche/MPPSamples · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#################################################
## PPHA 311  Statistics for Data Analysis II Winter 2024
## Professors Yukiko Asai & Dmitri Koustas & Austin Wright
## Coding TA session (Week 2)
## TA: Margot Bond and Jade Jiang
#################################################

# Print numbers in fixed notation: a large `scipen` penalty makes R avoid
# scientific notation in its output.
options(scipen = 999)

# PS2 Q1: Determinant of Wage ----

# Start from a clean slate by removing every object in the current
# environment.
rm(list = ls())

# Point R at the folder that holds the data. Relative paths resolve against
# this folder, so the data read below comes from here, and any tables or
# figures saved during the session land here by default as well.
setwd("/Users/uchennaofforjebe/Documents/runitinR")

# Read the Q1 data (the wage, educ and IQ columns are used below).
data1 <- read.csv(file = "data/code2.csv")
######
# Q1_1: Fit the bivariate model below by OLS, then interpret the size and
# significance of both estimates (Beta0_hat and Beta1_hat).
#   wage = Beta0 + Beta1 educ + u

# Regress wage on educ and show the coefficient table.
model1 <- lm(formula = wage ~ educ, data = data1)
summary(object = model1)


# Q1_2: Fit the bivariate model below by OLS and interpret the Gamma1_hat
# coefficient. Using the result, discuss whether the model estimated in
# question (1) is likely to be an unbiased estimate of the return to
# education, and why or why not.
#   educ = Gamma0 + Gamma1 IQ + v
# Note: pasting the Greek gamma-hat symbol from the problem set can cause
# trouble when printing or knitting (it shows up as an unknown character),
# so spell it out as "Gamma". The same can happen with other special
# characters.

# Regress educ on IQ and show the coefficient table.
model2 <- lm(formula = educ ~ IQ, data = data1)
summary(object = model2)


# Q1_3: Fit the multivariate model below by OLS. Has the estimate of
# Beta1_hat changed from the one found in question (1)?
#   wage = Beta0 + Beta1 educ + Beta2 IQ + u
# The outcome (wage) goes on the left of `~`; regressors are joined with `+`,
# not separated by commas.
model3 <- lm(formula = wage ~ educ + IQ, data = data1)
summary(object = model3)

# Q1_4: Create a new variable holding the natural log of wage, named lwage.
# Plot the conditional expectation function (CEF) of hourly wages given years
# of education, and compare it with a plot of the CEF of log hourly wages
# given years of education. Re-estimate the regression from part (1) using
# lwage as given below and interpret the Beta1 coefficient. Which model do
# you think better fits the data?
#   lwage = Beta0 + Beta1 educ + u

data1$lwage <- log(data1$wage)

# Years of education at which the CEF is evaluated.
x <- 8:18 # same as c(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)

# Conditional mean of wage at each value in x. Missing wages are dropped; a
# year with no observations gives NaN.
cm_levels <- vapply(
  x,
  function(years) mean(data1$wage[data1$educ == years], na.rm = TRUE),
  numeric(1)
)

# Conditional mean of log wage at each value in x. Every element must use
# lwage (not wage) so that all points are on the log scale.
cm_ln <- vapply(
  x,
  function(years) mean(data1$lwage[data1$educ == years], na.rm = TRUE),
  numeric(1)
)

# plot(x, y) draws points by default; other plot types can be requested, and
# abline() adds a trend line. Titles and axis labels are set with
# plot(x, y, main = "title", xlab = "", ylab = "").
plot(
  x, cm_levels,
  main = "Wage CEF", xlab = "Years of Education", ylab = "Mean Wage"
)
plot(
  x, cm_ln,
  main = "Log Wage CEF", xlab = "Years of Education", ylab = "Mean Log Wage"
)

# Re-estimate the part (1) regression with log wage as the outcome.
model4 <- lm(lwage ~ educ, data = data1)
summary(model4)

######
#PS2 Q2 Capital punishment
#####
#remove object from my environment
##rm(list = ls())
##Set our working directory to point R to the folder that we saved the
#data that we are going to be working with. if we save any tables or figures
# during our R session, this folder is where all those will be saved by
# default too.
##setwd("~/Desktop/stat2") # if Mac
##setwd("C:/user/Desktop/stat2") # if Windows

# Read data
murder <- read.csv("data/murder.csv") # FIPS is just an ID

# Q2_1: Create (a) a new variable with the murder rate per 10,000 population
# (murderrate) and (b) the execution rate per 10,000 population (execrate).
# What is the mean murder rate per 10,000 across all states? Which state has
# the highest murder rate, and which state has the lowest? How many states
# had no executions in 1996? What is the largest number of executions?

murder$murderrate <- murder$murder / (murder$population / 10000)
# Named execrate, as the assignment asks; the regressions below use this name.
murder$execrate <- murder$execution / (murder$population / 10000)

mean(murder$murderrate)
max(murder$murderrate)
min(murder$murderrate)

# Full rows with the highest and the lowest murder rate (each row is
# presumably one state).
murder[which.max(murder$murderrate), ]
murder[which.min(murder$murderrate), ]

# Number of rows with zero executions, and the largest execution count.
sum(murder$execution == 0)
max(murder$execution)


# Q2_2: Estimate the effect of execrate on murderrate using OLS. Interpret the
# size and significance of your estimates. Does the estimated equation
# suggest a deterrent effect of capital punishment?

model5 <- lm(murderrate ~ execrate, data = murder)
summary(model5)

# Q2_3: Re-estimate the model restricting to states that had at least one
# execution in 1996. How does this affect your estimates?
# Pattern: subset(df, condition). The result gets its own name so that it
# does not mask the subset() function.
murder_exec <- subset(murder, execution != 0)

model6 <- lm(murderrate ~ execrate, data = murder_exec)
summary(model6)

# Q2_4: One more model. Create a 0/1 indicator variable for whether a state
# has any executions within the year. This can be read as a proxy for whether
# the state uses the death penalty. Regress the murder rate on the new
# indicator. Does having capital punishment in the state appear to be a
# deterrent for murders?

# Pattern: ifelse(condition, value if true, value if false)
murder$dummy <- ifelse(test = murder$execution != 0, yes = 1, no = 0)
head(x = murder)

model7 <- lm(formula = murderrate ~ dummy, data = murder)
summary(object = model7)

# It appears not to be, because the slope is positive. Look at R-squared to
# judge how well this fits and whether there is a lot of error.