TheDataLabScotland · rmnppt · Oct 1, 2018 · Oct 1, 2018 · Oct 1, 2018 · Oct 1, 2018
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -3,3 +3,4 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
 ^data-raw$
+.travis.yml
diff --git a/.Rprofile b/.Rprofile
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
 packrat/lib*/
+.Rhistory
 .Rproj.user
 .DS_Store
diff --git a/.travis.yml b/.travis.yml
@@ -3,3 +3,4 @@
 language: R
 sudo: false
 cache: packages
+r_build_args: "--resave-data"
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -3,12 +3,12 @@ Type: Package
 Title: Calculating the Scottish Index of Multiple Deprivation
 Version: 0.1.0
 Author: Roman Popat
-Maintainer: The package maintainer <roman@legibledata.org>
+Maintainer: The package maintainer <roman@datakitchen.co.uk>
 Description: Data and functions to calculate the Scottish Index of Multiple Deprivation from 7 different indicator domains.
-License: MIT
+License: MIT + file LICENSE
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 6.0.1
+RoxygenNote: 6.1.0
 Imports: 
     dplyr,
     psych,

diff --git a/NAMESPACE b/NAMESPACE
@@ -6,3 +6,6 @@ export(getFAWeights)
 export(normalScores)
 export(reassignRank)
 export(replaceMissing)
+import(dplyr)
+importFrom(stats,na.omit)
+importFrom(stats,qnorm)
diff --git a/R/getFAWeights.R b/R/getFAWeights.R
@@ -14,12 +14,14 @@
 #'
 #' @keywords SIMD, openSIMD, simdr
 #'
+#' @import dplyr
+#'
 #' @export
 getFAWeights <- function(dat, ...) {
 
   fact <- psych::fa(dat, nfactors = 1, fm = "ml", rotate = "none", ...)
 
-  f1_scores <- as.data.frame(fact$weights) %>% select(ML1)
+  f1_scores <- as.data.frame(fact$weights) %>% select("ML1")
 
   f1_weights <- f1_scores / sum(f1_scores)
 

diff --git a/R/normalScores.R b/R/normalScores.R
@@ -29,6 +29,8 @@
 #' # (not run)
 #' # PUT SOME CODE HERE
 #'
+#' @importFrom stats na.omit qnorm
+#'
 #' @export
 
 normalScores <- function(

diff --git a/R/sas_weights.R b/R/sas_weights.R
@@ -1,14 +1,59 @@
 #' Factor analysis weights derived from SAS
 #'
-#' [describe it here]
+#' We include the Factor Analysis weights calculated in SAS from the 2016 SIMD calculations.
+#' This is mostly for quality control, i.e. evaluating the correspondence between the SAS and R methodologies.
+#' For our quality control analysis, see the [quality assurance section of the openSIMD documentation](https://thedatalabscotland.github.io/openSIMD_site/QA.html).
 #'
 #' Please see the technical notes \url{http://www.gov.scot/Resource/0050/00504822.pdf} for full information on all of the data in SIMD.
 #'
-#' @format A data frame with ___ rows and ___ variables:
+#' @format A data frame with 1 row and 43 variables:
 #' \describe{
-#'   \item{variable}{description}
-#'   \item{}{Count of individuals with no qualifications}
-#'   \item{}{Normalised count, derived from SIMD proceedure in SAS}
+#'
+#'   \item{n_neet}{}
+#'   \item{n_noquals}{}
+#'   \item{n_attain}{}
+#'   \item{n_attend}{}
+#'   \item{n_hesa}{}
+#'   \item{n_cif}{}
+#'   \item{n_smr}{}
+#'   \item{n_lbwt}{}
+#'   \item{n_drug}{}
+#'   \item{n_alc}{}
+#'   \item{n_depress}{}
+#'   \item{n_emerg}{}
+#'   \item{n_dr_gp}{}
+#'   \item{n_dr_post}{}
+#'   \item{n_dr_prim}{}
+#'   \item{n_dr_sec}{}
+#'   \item{n_dr_retail}{}
+#'   \item{n_dr_petrol}{}
+#'   \item{n_pt_gp}{}
+#'   \item{n_pt_post}{}
+#'   \item{n_pt_retail}{}
+#'
+#'   \item{wt_neet}{}
+#'   \item{wt_noquals}{}
+#'   \item{wt_attain}{}
+#'   \item{wt_attend}{}
+#'   \item{wt_hesa}{}
+#'   \item{wt_cif}{}
+#'   \item{wt_smr}{}
+#'   \item{wt_lbwt}{}
+#'   \item{wt_drug}{}
+#'   \item{wt_alc}{}
+#'   \item{wt_depress}{}
+#'   \item{wt_emerg}{}
+#'   \item{wt_dr_gp}{}
+#'   \item{wt_dr_post}{}
+#'   \item{wt_dr_prim}{}
+#'   \item{wt_dr_sec}{}
+#'   \item{wt_dr_retail}{}
+#'   \item{wt_dr_petrol}{}
+#'   \item{wt_pt_gp}{}
+#'   \item{wt_pt_post}{}
+#'   \item{wt_pt_retail}{}
+#'
+#'   \item{n}{}
 #'   ...
 #' }
 #' @source \url{http://www.gov.scot/Topics/Statistics/SIMD}

diff --git a/R/sim16_domains.R b/R/sim16_domains.R
@@ -1,14 +1,25 @@
 #' Domain ranks from published SIMD 2016
 #'
-#' [describe it here]
+#' We include the domain ranks from the 2016 SIMD for purposes of evaluating the correspondence between the SAS and R methodologies.
+#' For our quality control analysis, see the [quality assurance section of the openSIMD documentation](https://thedatalabscotland.github.io/openSIMD_site/QA.html).
 #'
 #' Please see the technical notes \url{http://www.gov.scot/Resource/0050/00504822.pdf} for full information on all of the data in SIMD.
 #'
-#' @format A data frame with ___ rows and ___ variables:
+#' @format A data frame with 6976 rows and 13 variables:
 #' \describe{
-#'   \item{variable}{description}
-#'   \item{}{Count of individuals with no qualifications}
-#'   \item{}{Normalised count, derived from SIMD proceedure in SAS}
+#'   \item{Data_Zone}{}
+#'   \item{Intermediate_Zone}{}
+#'   \item{Council_area}{}
+#'   \item{Total_population}{}
+#'   \item{Working_age_population_revised}{}
+#'   \item{Overall_SIMD16_rank}{}
+#'   \item{Income_domain_2016_rank}{}
+#'   \item{Employment_domain_2016_rank}{}
+#'   \item{Health_domain_2016_rank}{}
+#'   \item{Education_domain_2016_rank}{}
+#'   \item{Housing_domain_2016_rank}{}
+#'   \item{Access_domain_2016_rank}{}
+#'   \item{Crime_domain_2016_rank}{}
 #'   ...
 #' }
 #' @source \url{http://www.gov.scot/Topics/Statistics/SIMD}

diff --git a/R/simd16_indicators.R b/R/simd16_indicators.R
@@ -1,14 +1,48 @@
 #' Indicator data from published SIMD 2016
 #'
-#' [describe it here]
+#' We include the indicator data from the 2016 SIMD for purposes of evaluating the correspondence between the SAS and R methodologies.
+#' For our quality control analysis, see the [quality assurance section of the openSIMD documentation](https://thedatalabscotland.github.io/openSIMD_site/QA.html).
 #'
 #' Please see the technical notes \url{http://www.gov.scot/Resource/0050/00504822.pdf} for full information on all of the data in SIMD.
 #'
-#' @format A data frame with ___ rows and ___ variables:
+#' @format A data frame with 6976 rows and 36 variables:
 #' \describe{
-#'   \item{variable}{description}
-#'   \item{}{Count of individuals with no qualifications}
-#'   \item{}{Normalised count, derived from SIMD proceedure in SAS}
+#'   \item{Data_Zone}{description}
+#'   \item{Intermediate_Zone}{description}
+#'   \item{Council_area}{description}
+#'   \item{Total_population}{description}
+#'   \item{Working_age_population_revised}{description}
+#'   \item{Income_rate}{description}
+#'   \item{Income_count}{description}
+#'   \item{Employment_rate}{description}
+#'   \item{Employment_count}{description}
+#'   \item{CIF}{description}
+#'   \item{ALCOHOL}{description}
+#'   \item{DRUG}{description}
+#'   \item{SMR}{description}
+#'   \item{DEPRESS}{description}
+#'   \item{LBWT}{description}
+#'   \item{EMERG}{description}
+#'   \item{Attendance}{description}
+#'   \item{Attainment}{description}
+#'   \item{Noquals}{description}
+#'   \item{NEET}{description}
+#'   \item{HESA}{description}
+#'   \item{drive_petrol}{description}
+#'   \item{drive_GP}{description}
+#'   \item{drive_PO}{description}
+#'   \item{drive_primary}{description}
+#'   \item{drive_retail}{description}
+#'   \item{drive_secondary}{description}
+#'   \item{PT_GP}{description}
+#'   \item{PT_Post}{description}
+#'   \item{PT_retail}{description}
+#'   \item{crime_count}{description}
+#'   \item{crime_rate}{description}
+#'   \item{overcrowded_count}{description}
+#'   \item{nocentralheat_count}{description}
+#'   \item{overcrowded_rate}{description}
+#'   \item{nocentralheat_rate}{description}
 #'   ...
 #' }
 #' @source \url{http://www.gov.scot/Topics/Statistics/SIMD}

diff --git a/README.md b/README.md
@@ -1,10 +1,73 @@
-[![Build Status](https://travis-ci.org/TheDataLabScotland/simdr.svg?branch=master)](https://travis-ci.org/TheDataLabScotland/simdr)
+[![Build Status](https://travis-ci.org/rmnppt/simdr.svg?branch=master)](https://travis-ci.org/rmnppt/simdr)
 
-An R package to perform the calculation of the Scottish Index of Multiple Deprivation
+# The Scottish Index of Multiple Deprivation in R
+
+## Installation
 
 You can install this package with
 
-    devtools::install_github("TheDataLabScotland/simdr")
+```R
+devtools::install_github("rmnppt/simdr")
+```
+
+## Usage
+
+You can load the 2016 SIMD domain data as follows:
+
+```R
+data("simd16_domains")
+```
+
+Or you might want to view the more granular indicator data:
+
+```R
+data("simd16_indicators")
+```
+
+If you want to analyse the data in the way that the SIMD team does you can start by:
+
+ 1. Selecting the indicator variables belonging to a domain
+ 2. Transforming them to be normally distributed
+ 3. Replacing any missing values
+
+Here is an example using education:
+
+```R
+library(dplyr)
+
+normalised_education <- simd16_indicators %>% # start with the raw data
+        select(Attendance, Attainment, Noquals, NEET, HESA) %>% # select relevant columns
+        mutate(Attendance = normalScores(Attendance, forwards = FALSE)) %>% # replace each column
+        mutate(Attainment = normalScores(Attainment, forwards = FALSE)) %>%
+        mutate(Noquals    = normalScores(Noquals, forwards = TRUE)) %>%
+        mutate(NEET       = normalScores(NEET, forwards = TRUE)) %>%
+        mutate(HESA       = normalScores(HESA, forwards = FALSE)) %>%
+        mutate_all(funs(replaceMissing)) # replace missing values
+```
+
+You will notice that the above gives a warning: there is some missing data. You may want to fill in the missing values, so we include a utility (`replaceMissing`) to replace missing and infinite values with 0, the center of the new normal distribution.
+
+Notice that when we call `normalScores` we can decide whether a high value indicates deprivation or not, see `?normalScores` for more detail.
+
+When combining the indicators to give a domain score, we need to apply a different weight to each. The weights are derived through factor analysis of the normalised indicator scores, and the proportional loadings on factor 1 serve as the weightings. We extract the loadings using the `getFAWeights` function as follows:
+
+```R
+education_weights <- getFAWeights(normalised_education)
+```
+
+Now that we have the normalised indicator scores and weights, we can combine them with the utility function `combineWeightsAndNorms`. Each normalised indicator variable is multiplied by its weight derived from factor analysis, as follows:
+
+```R
+education_score <- combineWeightsAndNorms(education_weights, normalised_education)
+```
+
+Finally, we rank these weighted scores to generate the domain rank (1 = most deprived).
+
+```R
+education_rank <- rank(-education_score)
+```
+
+## Further Reading
 
 Find more information about the openSIMD project here:
 

diff --git a/data/no_qualifications.RData b/data/no_qualifications.RData
diff --git a/data/sas_simd_domains.RData b/data/sas_simd_domains.RData
diff --git a/data/sas_weights.RData b/data/sas_weights.RData
diff --git a/data/simd16_domains.RData b/data/simd16_domains.RData
diff --git a/data/simd16_indicators.RData b/data/simd16_indicators.RData
diff --git a/man/sas_weights.Rd b/man/sas_weights.Rd
diff --git a/man/simd16_domains.Rd b/man/simd16_domains.Rd