## ----include=FALSE------------------------------------------------------------
knitr::opts_chunk$set(fig.width = 6, fig.height = 4.5)
options(digits = 4)

## ----load-packages, echo = FALSE, message = FALSE, warning = FALSE------------
library(dplyr)
library(infer)

## ----load-gss, warning = FALSE, message = FALSE-------------------------------
# load in the dataset
data(gss)

# take a look at its structure
dplyr::glimpse(gss)

## ----specify-example, warning = FALSE, message = FALSE------------------------
# specify age as the response variable
gss %>%
  specify(response = age)

## ----specify-one, warning = FALSE, message = FALSE----------------------------
# the output is still a data frame, now with infer-specific metadata
gss %>%
  specify(response = age) %>%
  class()

## ----specify-two, warning = FALSE, message = FALSE----------------------------
# as a formula
gss %>%
  specify(age ~ partyid)

# with the named arguments
gss %>%
  specify(response = age, explanatory = partyid)

## ----specify-success, warning = FALSE, message = FALSE------------------------
# specifying for inference on proportions
gss %>%
  specify(response = college, success = "degree")

## ----hypothesize-independence, warning = FALSE, message = FALSE---------------
# declare a null hypothesis of independence between college and partyid
gss %>%
  specify(college ~ partyid, success = "degree") %>%
  hypothesize(null = "independence")

## ----hypothesize-40-hr-week, warning = FALSE, message = FALSE-----------------
# declare a point null: the true mean number of hours worked per week is 40
gss %>%
  specify(response = hours) %>%
  hypothesize(null = "point", mu = 40)

## ----generate-point, warning = FALSE, message = FALSE-------------------------
set.seed(1)

# simulate the null distribution with 1000 bootstrap resamples
gss %>%
  specify(response = hours) %>%
  hypothesize(null = "point", mu = 40) %>%
  generate(reps = 1000, type = "bootstrap")

## ----generate-permute, warning = FALSE, message = FALSE-----------------------
# permute partyid and age to simulate independence
gss %>%
  specify(partyid ~ age) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute")

## ----calculate-point, warning = FALSE, message = FALSE------------------------
# compute the sample mean of each simulated dataset
gss %>%
  specify(response = hours) %>%
  hypothesize(null = "point", mu = 40) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "mean")

## ----specify-diff-in-means, warning = FALSE, message = FALSE------------------
# calculate the difference in mean age, degree minus no degree
gss %>%
  specify(age ~ college) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate("diff in means", order = c("degree", "no degree"))

## ----utilities-examples--------------------------------------------------------
# find the point estimate
obs_mean <- gss %>%
  specify(response = hours) %>%
  calculate(stat = "mean")

# generate a null distribution
null_dist <- gss %>%
  specify(response = hours) %>%
  hypothesize(null = "point", mu = 40) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "mean")

## ----visualize, warning = FALSE, message = FALSE------------------------------
null_dist %>%
  visualize()

## ----visualize2, warning = FALSE, message = FALSE-----------------------------
null_dist %>%
  visualize() +
  shade_p_value(obs_stat = obs_mean, direction = "two-sided")

## ----get_p_value, warning = FALSE, message = FALSE----------------------------
# get a two-tailed p-value
p_value <- null_dist %>%
  get_p_value(obs_stat = obs_mean, direction = "two-sided")

p_value
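## ----get_p_value-greater, warning = FALSE, message = FALSE--------------------
# added illustration (not from the original vignette code): a minimal sketch of
# the same helper with a one-sided alternative; direction = "greater" counts
# only simulated means at least as large as the observed mean
null_dist %>%
  get_p_value(obs_stat = obs_mean, direction = "greater")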
## ----get_conf, message = FALSE, warning = FALSE-------------------------------
# generate a distribution like the null distribution,
# though exclude the null hypothesis from the pipeline
boot_dist <- gss %>%
  specify(response = hours) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "mean")

# start with the bootstrap distribution
ci <- boot_dist %>%
  # calculate the confidence interval around the point estimate
  get_confidence_interval(point_estimate = obs_mean,
                          # at the 95% confidence level
                          level = .95,
                          # using the standard error
                          type = "se")

ci

## ----visualize-ci, warning = FALSE, message = FALSE---------------------------
boot_dist %>%
  visualize() +
  shade_confidence_interval(endpoints = ci)

## ----message = FALSE, warning = FALSE-----------------------------------------
# calculate an observed t statistic
obs_t <- gss %>%
  specify(response = hours) %>%
  hypothesize(null = "point", mu = 40) %>%
  calculate(stat = "t")

## ----message = FALSE, warning = FALSE-----------------------------------------
# switch out calculate with assume to define a distribution
t_dist <- gss %>%
  specify(response = hours) %>%
  assume(distribution = "t")

## ----message = FALSE, warning = FALSE-----------------------------------------
# visualize the theoretical null distribution
visualize(t_dist) +
  shade_p_value(obs_stat = obs_t, direction = "greater")

# more exactly, calculate the p-value
get_p_value(t_dist, obs_t, "greater")

## ----message = FALSE, warning = FALSE-----------------------------------------
# find the theory-based confidence interval
theor_ci <-
  get_confidence_interval(
    x = t_dist,
    level = .95,
    point_estimate = obs_mean
  )

theor_ci

## -----------------------------------------------------------------------------
# visualize the theoretical sampling distribution
visualize(t_dist) +
  shade_confidence_interval(theor_ci)

## -----------------------------------------------------------------------------
# fit a linear model predicting hours worked with age and college completion
observed_fit <- gss %>%
  specify(hours ~ age + college) %>%
  fit()

## -----------------------------------------------------------------------------
# fit the same model to 1000 permuted datasets to form null distributions
# of the coefficient estimates
null_fits <- gss %>%
  specify(hours ~ age + college) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  fit()

null_fits

## -----------------------------------------------------------------------------
# compute a confidence interval for each model term
get_confidence_interval(
  null_fits,
  point_estimate = observed_fit,
  level = .95
)

## -----------------------------------------------------------------------------
# visualize the null distributions and the observed estimate for each term
visualize(null_fits) +
  shade_p_value(observed_fit, direction = "both")
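## ----get_p_value-fits, message = FALSE, warning = FALSE-----------------------
# added illustration (not from the original vignette code): a sketch assuming
# that get_p_value(), like get_confidence_interval() above, accepts fit()
# output and returns one simulation-based p-value per model term
get_p_value(null_fits, obs_stat = observed_fit, direction = "both")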