###########################################################################
# PUBLG100: Introduction to Quantitative Methods
#
# Week 3 Seminar: T-test for Difference in Means and Hypothesis Testing
#
#

# Set your working directory
#
# CAUTION: Make sure the directory you specify here matches the working directory on your computer.
#          We're using N:/PUBLG100 for illustration purposes only; it will only work if you're
#          using a UCL desktop. If you're using your own laptop, replace N:/PUBLG100 with the
#          appropriate directory (or folder).
# Point R at the seminar folder (edit the path so it matches your machine)
setwd("N:/PUBLG100")


# Print the current working directory to confirm the change took effect
getwd()


## ------------------------------------------------------------------------
# Start from a clean slate: remove every object from the workspace
rm(list = ls())

## ------------------------------------------------------------------------
# The foreign package provides readers for other statistical packages' files
library(foreign)

# Read the Stata (.dta) file into a data frame; read.dta() comes from foreign,
# so the library() call above has to run first
world.data <- read.dta(file = "QoG2012.dta")

# Number of rows (observations) and columns (variables)
dim(world.data)

# Names of all variables in the dataset
names(world.data)

# Preview the first few observations
head(world.data)

## ------------------------------------------------------------------------
# Install dplyr only when it is missing, so that re-running the script does
# not download the package again (or fail when there is no internet access)
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}

## ----message=FALSE-------------------------------------------------------
# load dplyr
library(dplyr)

## ------------------------------------------------------------------------
# General pattern: dataset.name <- rename(dataset.name, new.name = old.name)
# h_j = 1 means there is an independent judiciary,
# so we give h_j the more descriptive name judiciary

# Rename the variable and assign the result back to world.data to keep it
world.data <- world.data %>% rename(judiciary = h_j)

# Confirm that the column is now called judiciary
names(world.data)

## ------------------------------------------------------------------------
# frequency table of the binary independent variable; the raw codes listed
# here are the values that `levels` below has to match
table(world.data$judiciary)

## ------------------------------------------------------------------------
# creating a factor variable
# `levels` lists the raw codes and `labels` names them in the same order:
# 1 = independent judiciary, 0 = no independent judiciary ("controlled").
# Any value that is not listed in `levels` is turned into NA, so the second
# code has to be 0 (the other value of the binary indicator); otherwise the
# "controlled" group would be empty and the group comparisons would break.
world.data$judiciary <- factor(world.data$judiciary,
                               levels = c(1, 0),
                               labels = c("independent", "controlled"))

# checking the result
head(world.data)

# a frequency table of judiciary
table(world.data$judiciary)

## ------------------------------------------------------------------------
# Descriptive statistics for the variable wdi_gdpc
summary(world.data$wdi_gdpc)

## ------------------------------------------------------------------------
# Split the data into two data frames, one per judiciary status
free.legal <- world.data %>% filter(judiciary == "independent")
controlled.legal <- world.data %>% filter(judiciary == "controlled")

## ------------------------------------------------------------------------
# Average income in each group; na.rm = TRUE drops missing values first
mean(free.legal$wdi_gdpc, na.rm = TRUE)
mean(controlled.legal$wdi_gdpc, na.rm = TRUE)

## ------------------------------------------------------------------------
# t.test: difference in means between the two judiciary groups
# Interval DV (GDP per capita)
# Binary IV (independent judiciary)
# H0: the difference in means equals mu = 0; two-sided alternative, 95% CI.
# Argument names are spelled out in full (alternative, conf.level) so the
# call does not depend on partial argument matching, and the variables in
# the formula are looked up in world.data via data =.
t.test(wdi_gdpc ~ judiciary,
       data = world.data,
       mu = 0,
       alternative = "two.sided",
       conf.level = 0.95)

## ------------------------------------------------------------------------
# Give two more variables descriptive names in a single rename() call:
#   undp_hdi -> hdi
#   wbgi_cce -> corruption.control
world.data <- rename(world.data,
                     hdi = undp_hdi,
                     corruption.control = wbgi_cce)

## ------------------------------------------------------------------------
# scatterplot of human development (y) against control of corruption (x)
plot(x = world.data$corruption.control,
     y = world.data$hdi,
     xlim = c(-2, 3),       # x-axis range: c(minimum, maximum)
     ylim = c(0, 1),        # y-axis range: c(minimum, maximum)
     frame.plot = FALSE,    # do not draw a box around the plot region
     xlab = "World Bank Control of Corruption Index",
     ylab = "UNDP Human Development Index",
     main = "Relationship b/w Quality of Institutions and Quality of Life")

## ------------------------------------------------------------------------
# Pearson's r including test statistic and a 99% confidence interval.
# cor.test() drops incomplete pairs of observations itself, so no `use`
# argument is needed (`use` is an argument of cor(), not of cor.test()).
cor.test(world.data$corruption.control,
         world.data$hdi,
         method = "pearson",
         conf.level = 0.99)