Mixed ANOVA (incomplete)
2 x 2 ANOVA
Let’s create data to allow us to compare between 2 years, and between Europe and Americas
library(gapminder)
# create a new data frame that only focuses on data from 2007
<- subset(
gapminder_2_by_2 # the data set
gapminder, == 2002 & continent == "Africa" |
year == 2007 & continent == "Africa" |
year == 2002 & continent == "Europe" |
year == 2007 & continent == "Europe"
year
)
summary(lm(lifeExp ~ factor(year) * factor(continent), gapminder_2_by_2))
Call:
lm(formula = lifeExp ~ factor(year) * factor(continent), data = gapminder_2_by_2)
Residuals:
Min 1Q Median 3Q Max
-15.1930 -4.7759 -0.1898 3.1180 22.4188
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 53.3252 1.0921 48.830 <2e-16
factor(year)2007 1.4808 1.5444 0.959 0.339
factor(continent)Europe 23.3754 1.8055 12.947 <2e-16
factor(year)2007:factor(continent)Europe -0.5328 2.5533 -0.209 0.835
(Intercept) ***
factor(year)2007
factor(continent)Europe ***
factor(year)2007:factor(continent)Europe
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 7.875 on 160 degrees of freedom
Multiple R-squared: 0.6727, Adjusted R-squared: 0.6665
F-statistic: 109.6 on 3 and 160 DF, p-value: < 2.2e-16
summary(aov(lifeExp ~ factor(year) * factor(continent), gapminder_2_by_2))
Df Sum Sq Mean Sq F value Pr(>F)
factor(year) 1 68 68 1.093 0.297
factor(continent) 1 20319 20319 327.645 <2e-16 ***
factor(year):factor(continent) 1 3 3 0.044 0.835
Residuals 160 9922 62
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# create a new data frame that only focuses on data from 2007
= gapminder.loc[(gapminder['year'] == 2002) & (gapminder['continent'] == "Africa") | (gapminder['year'] == 2007) & (gapminder['continent'] == "Africa") | (gapminder['year'] == 2002) & (gapminder['continent'] == "Europe") | (gapminder['year'] == 2007) & (gapminder['continent'] == "Europe") ]
gapminder_2_by_2
from statsmodels.formula.api import ols
= ols('lifeExp ~ C(year) + C(continent)', data=gapminder_2_by_2).fit()
fit
fit.summary()
= sm.stats.anova_lm(fit, typ=2)
lm_4_continents_aov_table lm_4_continents_aov_table
manual calculation of f-value for 2 x 2
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.3 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.4 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.0
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
= mean(gapminder_2_by_2$lifeExp)
overallMeanLifeExp = sum((gapminder_2_by_2$lifeExp - mean(gapminder_2_by_2$lifeExp))^2)
totalVar %>%
gapminder_2_by_2 group_by(continent, year) %>%
summarise(
mean_lifeExp = mean(lifeExp),
countries = length(lifeExp),
betweenSS = countries * ((overallMeanLifeExp - mean_lifeExp)^2)
-> year_continent_means )
`summarise()` has grouped output by 'continent'. You can override using the
`.groups` argument.
year_continent_means
# A tibble: 4 × 5
# Groups: continent [2]
continent year mean_lifeExp countries betweenSS
<fct> <int> <dbl> <int> <dbl>
1 Africa 2002 53.3 52 4396.
2 Africa 2007 54.8 52 3094.
3 Europe 2002 76.7 30 6033.
4 Europe 2007 77.6 30 6866.
sum(year_continent_means$betweenSS)
[1] 20389.47
totalVar
[1] 30311.89
sum(year_continent_means$betweenSS)/totalVar
[1] 0.6726556
sum(year_continent_means$betweenSS))/totalVar (
[1] 0.6726556
<- length(gapminder_2_by_2$country) - 1
df_total <- length(gapminder_2_by_2$country) -
df_res 1 - #data points
3 # predictors
= totalVar - sum(year_continent_means$betweenSS)
ss_res
1 - (ss_res/df_res)/(totalVar/df_total)
[1] 0.6665179
##
# break down of types of variance
##
<- gapminder_2_by_2 %>%
continent_df group_by(continent) %>%
summarise(
mean_lifeExp = mean(lifeExp),
countries = length(lifeExp),
betweenSS = countries * ((overallMeanLifeExp - mean_lifeExp)^2)
)
sum(continent_df$betweenSS)
[1] 20318.97
<- gapminder_2_by_2 %>%
year_df group_by(year) %>%
summarise(
mean_lifeExp = mean(lifeExp),
countries = length(lifeExp),
betweenSS = countries * ((overallMeanLifeExp - mean_lifeExp)^2)
)
sum(year_df$betweenSS)
[1] 67.79278
##
# interaction
##
sum(year_continent_means$betweenSS) - sum(continent_df$betweenSS) - sum(year_df$betweenSS)
[1] 2.70036
gapminder
# A tibble: 1,704 × 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779.
2 Afghanistan Asia 1957 30.3 9240934 821.
3 Afghanistan Asia 1962 32.0 10267083 853.
4 Afghanistan Asia 1967 34.0 11537966 836.
5 Afghanistan Asia 1972 36.1 13079460 740.
6 Afghanistan Asia 1977 38.4 14880372 786.
7 Afghanistan Asia 1982 39.9 12881816 978.
8 Afghanistan Asia 1987 40.8 13867957 852.
9 Afghanistan Asia 1992 41.7 16317921 649.
10 Afghanistan Asia 1997 41.8 22227415 635.
# ℹ 1,694 more rows
20319 +
68 +
3
[1] 20390
from siuba import group_by, summarize, _
= gapminder_2_by_2['lifeExp'].mean()
overallMeanLifeExp
= sum((gapminder_2_by_2['lifeExp'] - gapminder_2_by_2['lifeExp'].mean())**2)
totalVar
=(gapminder_2_by_2
year_continent_means>> group_by(_.continent, _.year)
>> summarize(mean_lifeExp = _.lifeExp.mean(),
= _.lifeExp.count())
countries
)'betweenSS'] = year_continent_means['countries'] * ((overallMeanLifeExp - year_continent_means['mean_lifeExp'])**2)
year_continent_means[
print(tabulate(year_continent_means, headers=year_continent_means.head(), tablefmt="fancy_grid",showindex=False))
sum(year_continent_means['betweenSS'])
totalVar
sum(year_continent_means['betweenSS'])/totalVar
= len(gapminder_2_by_2['country']) - 1
df_total = len(gapminder_2_by_2['country']) - 1 - 3
df_res = totalVar - sum(year_continent_means['betweenSS'])
ss_res 1 - (ss_res/df_res)/(totalVar/df_total)
=(gapminder_2_by_2
continent_df>> group_by(_.continent)
>> summarize(mean_lifeExp = _.lifeExp.mean(),
= _.lifeExp.count())
countries
)'betweenSS'] = continent_df['countries'] * ((overallMeanLifeExp - continent_df['mean_lifeExp'])**2)
continent_df[sum(continent_df['betweenSS'])
=(gapminder_2_by_2
year_df>> group_by(_.year)
>> summarize(mean_lifeExp = _.lifeExp.mean(),
= _.lifeExp.count())
countries
)'betweenSS'] = year_df['countries'] * ((overallMeanLifeExp - year_df['mean_lifeExp'])**2)
year_df[sum(year_df['betweenSS'])
sum(year_continent_means['betweenSS']) - sum(continent_df['betweenSS']) - sum(year_df['betweenSS'])
print(tabulate(gapminder[:10], headers=gapminder.head(), tablefmt="fancy_grid",showindex=False))
20319 +68 +3
3 way ANOVA
$pop_split = "high"
gapminder_2_by_2$pop_split[gapminder_2_by_2$pop < median(gapminder_2_by_2$pop)] = "low"
gapminder_2_by_2
%>%
gapminder_2_by_2 group_by(continent, year, pop_split) %>%
summarise(
mean_lifeExp = mean(lifeExp),
countries = length(lifeExp),
betweenSS = countries * ((overallMeanLifeExp - mean_lifeExp)^2)
-> three_way_summary )
`summarise()` has grouped output by 'continent', 'year'. You can override using
the `.groups` argument.
sum(three_way_summary$betweenSS)
[1] 20403.63
summary(lm(lifeExp ~ factor(year) * factor(continent), gapminder_2_by_2))
Call:
lm(formula = lifeExp ~ factor(year) * factor(continent), data = gapminder_2_by_2)
Residuals:
Min 1Q Median 3Q Max
-15.1930 -4.7759 -0.1898 3.1180 22.4188
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 53.3252 1.0921 48.830 <2e-16
factor(year)2007 1.4808 1.5444 0.959 0.339
factor(continent)Europe 23.3754 1.8055 12.947 <2e-16
factor(year)2007:factor(continent)Europe -0.5328 2.5533 -0.209 0.835
(Intercept) ***
factor(year)2007
factor(continent)Europe ***
factor(year)2007:factor(continent)Europe
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 7.875 on 160 degrees of freedom
Multiple R-squared: 0.6727, Adjusted R-squared: 0.6665
F-statistic: 109.6 on 3 and 160 DF, p-value: < 2.2e-16
summary(aov(lifeExp ~ factor(year) * factor(continent), gapminder_2_by_2))
Df Sum Sq Mean Sq F value Pr(>F)
factor(year) 1 68 68 1.093 0.297
factor(continent) 1 20319 20319 327.645 <2e-16 ***
factor(year):factor(continent) 1 3 3 0.044 0.835
Residuals 160 9922 62
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(lifeExp ~ factor(year) * factor(continent) * factor(pop_split), gapminder_2_by_2))
Df Sum Sq Mean Sq F value
factor(year) 1 68 68 1.067
factor(continent) 1 20319 20319 319.911
factor(pop_split) 1 13 13 0.212
factor(year):factor(continent) 1 3 3 0.046
factor(year):factor(pop_split) 1 0 0 0.000
factor(continent):factor(pop_split) 1 0 0 0.007
factor(year):factor(continent):factor(pop_split) 1 0 0 0.000
Residuals 156 9908 64
Pr(>F)
factor(year) 0.303
factor(continent) <2e-16 ***
factor(pop_split) 0.646
factor(year):factor(continent) 0.830
factor(year):factor(pop_split) 0.989
factor(continent):factor(pop_split) 0.931
factor(year):factor(continent):factor(pop_split) 0.988
Residuals
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lm(lifeExp ~ factor(year) * factor(continent) * factor(pop_split), gapminder_2_by_2))
Call:
lm(formula = lifeExp ~ factor(year) * factor(continent) * factor(pop_split),
data = gapminder_2_by_2)
Residuals:
Min 1Q Median 3Q Max
-15.5189 -4.8868 -0.3127 3.1208 22.0864
Coefficients:
Estimate
(Intercept) 52.96628
factor(year)2007 1.53805
factor(continent)Europe 23.52019
factor(pop_split)low 0.69131
factor(year)2007:factor(continent)Europe -0.59779
factor(year)2007:factor(pop_split)low -0.06377
factor(continent)Europe:factor(pop_split)low -0.26305
factor(year)2007:factor(continent)Europe:factor(pop_split)low 0.07923
Std. Error
(Intercept) 1.59392
factor(year)2007 2.21201
factor(continent)Europe 2.60286
factor(pop_split)low 2.21201
factor(year)2007:factor(continent)Europe 3.65535
factor(year)2007:factor(pop_split)low 3.12825
factor(continent)Europe:factor(pop_split)low 3.65535
factor(year)2007:factor(continent)Europe:factor(pop_split)low 5.16944
t value Pr(>|t|)
(Intercept) 33.230 < 2e-16
factor(year)2007 0.695 0.488
factor(continent)Europe 9.036 5.91e-16
factor(pop_split)low 0.313 0.755
factor(year)2007:factor(continent)Europe -0.164 0.870
factor(year)2007:factor(pop_split)low -0.020 0.984
factor(continent)Europe:factor(pop_split)low -0.072 0.943
factor(year)2007:factor(continent)Europe:factor(pop_split)low 0.015 0.988
(Intercept) ***
factor(year)2007
factor(continent)Europe ***
factor(pop_split)low
factor(year)2007:factor(continent)Europe
factor(year)2007:factor(pop_split)low
factor(continent)Europe:factor(pop_split)low
factor(year)2007:factor(continent)Europe:factor(pop_split)low
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 7.97 on 156 degrees of freedom
Multiple R-squared: 0.6731, Adjusted R-squared: 0.6585
F-statistic: 45.89 on 7 and 156 DF, p-value: < 2.2e-16
20319 +
68 +
3 +
13
[1] 20403
20403.63
[1] 20403.63
'pop_split']=0
gapminder_2_by_2[
'pop_split'].loc[gapminder_2_by_2['pop']<np.median(gapminder_2_by_2['pop'])]="low"
gapminder_2_by_2['pop_split'].loc[gapminder_2_by_2['pop']>=np.median(gapminder_2_by_2['pop'])]="high"
gapminder_2_by_2[
= (gapminder_2_by_2
three_way_summary >> group_by(_.continent, _.year, _.pop_split)
>> summarize(mean_lifeExp = _.lifeExp.mean(),
= _.lifeExp.count())
countries
)'betweenSS'] = three_way_summary['countries'] * ((overallMeanLifeExp - three_way_summary['mean_lifeExp'])**2)
three_way_summary[sum(three_way_summary['betweenSS'])
= ols('lifeExp ~ C(year) + C(continent) + C(pop_split)', data=gapminder_2_by_2).fit()
fit2
fit2.summary()