import Pkg
Pkg.activate(".")
using CSV
using DataFrames
using Statistics
using HypothesisTests
Activating project at `~/sandbox/dataalltheway/posts/010-01-parametric-hypothesis-tests-julia`
2022-11-17 First draft
This article is an extension of Rohit Farmer. 2022. “Parametric Hypothesis Tests with Examples in R.” November 10, 2022. Please check out the parent article for the theoretical background.
Some cleaning is necessary since the data is not of the correct types.
begin
data = CSV.read(download("https://raw.githubusercontent.com/opencasestudies/ocs-bp-rural-and-urban-obesity/master/data/wrangled/BMI_long.csv"), DataFrame) # download and load
allowmissing!(data, :BMI) # Allow BMI col to have missing values
replace!(data.BMI, "NA" => missing) # Convert "NA" to missing
data[!, :BMI] .= passmissing(parse).(Float64, (data[!, :BMI])) # Typecast into Float64?
end;
Row | Country | Sex | Region | Year | BMI |
---|---|---|---|---|---|
String | String7 | String15 | Int64 | Float64? | |
1 | Afghanistan | Men | National | 1985 | 20.2 |
2 | Afghanistan | Men | Rural | 1985 | 19.7 |
3 | Afghanistan | Men | Urban | 1985 | 22.4 |
4 | Afghanistan | Men | National | 2017 | 22.8 |
5 | Afghanistan | Men | Rural | 2017 | 22.5 |
6 | Afghanistan | Men | Urban | 2017 | 23.6 |
7 | Afghanistan | Women | National | 1985 | 20.6 |
8 | Afghanistan | Women | Rural | 1985 | 20.1 |
9 | Afghanistan | Women | Urban | 1985 | 23.2 |
10 | Afghanistan | Women | National | 2017 | 24.4 |
11 | Afghanistan | Women | Rural | 2017 | 23.6 |
12 | Afghanistan | Women | Urban | 2017 | 26.3 |
13 | Albania | Men | National | 1985 | 25.2 |
14 | Albania | Men | Rural | 1985 | 25.0 |
15 | Albania | Men | Urban | 1985 | 25.4 |
16 | Albania | Men | National | 2017 | 27.0 |
17 | Albania | Men | Rural | 2017 | 26.9 |
18 | Albania | Men | Urban | 2017 | 27.0 |
19 | Albania | Women | National | 1985 | 26.0 |
20 | Albania | Women | Rural | 1985 | 26.1 |
uneqvarztest = let
# Fetch a random sample of BMI data for women in the year 1985 and 2017
x1 = filter([:Sex, :Year] => (s, y) -> s=="Women" && y==1985 , data) |>
x -> x[!, :BMI] |> skipmissing |> collect |> x->rand(x, 300)
x2 = filter([:Sex, :Year] => (s, y) -> s=="Women" && y==2017 , data) |>
x -> x[!, :BMI] |> skipmissing |> collect |> x->rand(x, 300)
UnequalVarianceZTest(x1, x2)
end
Two sample z-test (unequal variance)
------------------------------------
Population details:
parameter of interest: Mean difference
value under h_0: 0
point estimate: -2.26
95% confidence interval: (-2.679, -1.841)
Test summary:
outcome with 95% confidence: reject h_0
two-sided p-value: <1e-25
Details:
number of observations: [300,300]
z-statistic: -10.560590588866509
population standard error: 0.21400318296427412
eqvarztest = let
# Fetch a random sample of BMI data for women in the year 1985 and 2017
x1 = filter([:Sex, :Year] => (s, y) -> s=="Women" && y==1985 , data) |>
x -> x[!, :BMI] |> skipmissing |> collect |> x->rand(x, 300)
x2 = filter([:Sex, :Year] => (s, y) -> s=="Women" && y==2017 , data) |>
x -> x[!, :BMI] |> skipmissing |> collect |> x->rand(x, 300)
EqualVarianceZTest(x1, x2)
end
Two sample z-test (equal variance)
----------------------------------
Population details:
parameter of interest: Mean difference
value under h_0: 0
point estimate: -2.173
95% confidence interval: (-2.611, -1.735)
Test summary:
outcome with 95% confidence: reject h_0
two-sided p-value: <1e-21
Details:
number of observations: [300,300]
z-statistic: -9.724414586039652
population standard error: 0.22345818154642977
onesamplettest = let
x1 = filter(
[:Sex, :Region, :Year] =>
(s, r, y) -> s=="Men" && r=="Rural" && y == 2017,
data
) |>
x -> x[!, :BMI] |> skipmissing |> collect |> x->rand(x, 300)
OneSampleTTest(x1, 24.5)
end
One sample t-test
-----------------
Population details:
parameter of interest: Mean
value under h_0: 24.5
point estimate: 25.466
95% confidence interval: (25.16, 25.77)
Test summary:
outcome with 95% confidence: reject h_0
two-sided p-value: <1e-08
Details:
number of observations: 300
t-statistic: 6.280721563263261
degrees of freedom: 299
empirical standard error: 0.15380398418714467
unpairedtwosamplettest = let
x1 = filter([:Sex, :Region, :Year] =>
(s, r, y) -> s=="Women" && r=="Rural" && y == 1985,
data) |>
x -> x[!, :BMI] |> skipmissing |> collect |> x->rand(x, 300)
x2 = filter([:Sex, :Region, :Year] =>
(s, r, y) -> s=="Women" && r=="Urban" && y == 1985,
data) |>
x -> x[!, :BMI] |> skipmissing |> collect |> x->rand(x, 300)
x -> x[!, :BMI] |> skipmissing |> collect |> x->rand(x, 300)
UnequalVarianceTTest(x1, x2)
end
Two sample t-test (unequal variance)
------------------------------------
Population details:
parameter of interest: Mean difference
value under h_0: 0
point estimate: -1.05867
95% confidence interval: (-1.512, -0.6054)
Test summary:
outcome with 95% confidence: reject h_0
two-sided p-value: <1e-05
Details:
number of observations: [300,300]
t-statistic: -4.587387795167387
degrees of freedom: 575.968012373301
empirical standard error: 0.2307776699807073
This test uses the Welch correction, and there is no way to turn it off in HypothesisTests.jl
.
unpairedtwosamplettest = let
x1 = filter([:Sex, :Region, :Year] =>
(s, r, y) -> s=="Women" && r=="Rural" && y == 1985,
data) |>
x -> x[!, :BMI] |> skipmissing |> collect |> x->rand(x, 300)
x2 = filter([:Sex, :Region, :Year] =>
(s, r, y) -> s=="Women" && r=="Urban" && y == 1985,
data) |>
x -> x[!, :BMI] |> skipmissing |> collect |> x->rand(x, 300)
x -> x[!, :BMI] |> skipmissing |> collect |> x->rand(x, 300)
UnequalVarianceTTest(x1, x2)
end
pvalue(unpairedtwosamplettest, tail=:right)
0.9999999445762
pairedtwosamplettest = let
x1 = filter([:Sex, :Region, :Year] =>
(s, r, y) -> s=="Women" && r=="Rural" && y == 1985,
data) |>
x -> x[!, :BMI] |> skipmissing |> collect |> x->rand(x, 300)
x2 = filter([:Sex, :Region, :Year] =>
(s, r, y) -> s=="Women" && r=="Urban" && y == 1985,
data) |>
x -> x[!, :BMI] |> skipmissing |> collect |> x->rand(x, 300)
x -> x[!, :BMI] |> skipmissing |> collect |> x->rand(x, 300)
EqualVarianceTTest(x1, x2)
end
Two sample t-test (equal variance)
----------------------------------
Population details:
parameter of interest: Mean difference
value under h_0: 0
point estimate: -1.01167
95% confidence interval: (-1.44, -0.5838)
Test summary:
outcome with 95% confidence: reject h_0
two-sided p-value: <1e-05
Details:
number of observations: [300,300]
t-statistic: -4.64337449574737
degrees of freedom: 598
empirical standard error: 0.2178731583233696
Ftest = let
x1 = filter([:Sex, :Region, :Year] =>
(s, r, y) -> s=="Women" && r=="Rural" && y == 1985,
data) |>
x -> x[!, :BMI] |> skipmissing |> collect |> x->rand(x, 300)
x2 = filter([:Sex, :Region, :Year] =>
(s, r, y) -> s=="Women" && r=="Urban" && y == 1985,
data) |>
x -> x[!, :BMI] |> skipmissing |> collect |> x->rand(x, 300)
x -> x[!, :BMI] |> skipmissing |> collect |> x->rand(x, 300)
VarianceFTest(x1, x2)
end
Variance F-test
---------------
Population details:
parameter of interest: variance ratio
value under h_0: 1.0
point estimate: 1.32245
Test summary:
outcome with 95% confidence: reject h_0
two-sided p-value: 0.0159
Details:
number of observations: [300, 300]
F statistic: 1.322447205183649
degrees of freedom: [299, 299]
Atest = let
x = filter([:Sex, :Year] => (s,y) -> (s=="Men" && y==2017), data)
groups = groupby(x, :Region)
bmis = map(keys(groups)) do key # for each group,
collect(skipmissing(groups[key][!, :BMI])) # collect BMI, skipping missing values
end
res = OneWayANOVATest(bmis...)
end
One-way analysis of variance (ANOVA) test
-----------------------------------------
Population details:
parameter of interest: Means
value under h_0: "all equal"
point estimate: NaN
Test summary:
outcome with 95% confidence: reject h_0
p-value: 0.0333
Details:
number of observations: [200, 196, 199]
F statistic: 3.42167
degrees of freedom: (2, 592)
Currently, there is no implementation of this test in HypothesisTests.jl
@online{sambrani2022,
author = {Sambrani, Dhruva},
title = {Parametric Hypothesis Tests with Examples in {Julia}},
date = {2022-11-17},
url = {https://dataalltheway.com/posts/010-01-parametric-hypothesis-tests-julia},
langid = {en}
}