In [13]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
In [28]:
# Toy data set: ten days of sales, built as three parallel lists.
# Sales are 100 on every rainy day and 10000 on every sunny day, so the
# weather alone determines the outcome. Row 5 (a sunny Saturday) is the only
# observation that breaks the sat<->rain / sun<->sun pairing.
weekday = ['sat'] * 6 + ['sun'] * 4
weather = ['rain'] * 5 + ['sun'] * 5
sales = [100] * 5 + [10000] * 5
In [33]:
# Combine the three parallel lists into one frame, one row per day.
# Column order (weekday, weather, sales) follows the keyword order below.
data = pd.DataFrame(
    dict(weekday=weekday, weather=weather, sales=sales)
)
data
Out[33]:
weekday weather sales
0 sat rain 100
1 sat rain 100
2 sat rain 100
3 sat rain 100
4 sat rain 100
5 sat sun 10000
6 sun sun 10000
7 sun sun 10000
8 sun sun 10000
9 sun sun 10000

In this example:

  • weather is clearly a better predictor of sales than weekday
  • there is clearly no interaction effect: sales depends only on the weather, whatever the weekday

Let's see what the different types of sums of squares give:
In [34]:
# Type I (sequential) sums of squares credit the terms in formula order.
# weekday is listed first, so it absorbs the variation it shares with weather
# and ends up with the larger sum of squares -- Type I therefore tells us that
# weekday is more important.
# The interaction effect is not significant (p ~ 0.24); its sum of squares and
# the residual are both at floating-point-noise level.
lm = ols(
    formula='sales ~ C(weekday)*C(weather)',
    data=data,
).fit()
table = sm.stats.anova_lm(lm, typ=1)  # Type 1 ANOVA table, returned as a DataFrame
print(table)
                        df        sum_sq       mean_sq             F  \
C(weekday)             1.0  1.633500e+08  1.633500e+08  3.821192e+31   
C(weather)             1.0  8.167500e+07  8.167500e+07  1.910596e+31   
C(weekday):C(weather)  1.0  7.126759e-24  7.126759e-24  1.667139e+00   
Residual               7.0  2.992390e-23  4.274844e-24           NaN   

                              PR(>F)  
C(weekday)             7.657359e-109  
C(weather)             8.663313e-108  
C(weekday):C(weather)   2.376431e-01  
Residual                         NaN  
In [31]:
# Type II tells us that weather is more important: weather carries all of the
# explained variation, while weekday's sum of squares is numerically zero and
# not significant (p ~ 0.23).
# The model is additive (no interaction term in the formula), so no
# interaction row is reported.
lm = ols(
    formula='sales ~ C(weekday) + C(weather)',
    data=data,
).fit()
table = sm.stats.anova_lm(lm, typ=2)  # Type 2 ANOVA table, returned as a DataFrame
print(table)
                  sum_sq   df             F         PR(>F)
C(weekday)  5.972244e-23  1.0  1.735279e+00   2.292173e-01
C(weather)  8.167500e+07  1.0  2.373127e+30  1.282780e-104
Residual    2.409164e-22  7.0           NaN            NaN
In [35]:
# Type III tells us that weather is more important: its sum of squares is
# ~8.17e7, whereas weekday's is ~1e-22, i.e. numerically zero.
# weekday's small p-value (~0.002) is a ratio of two floating-point-noise
# quantities (the residual is ~3e-23 as well) and should not be read as a real
# effect. The interaction effect is not significant (p ~ 0.46).
# NOTE(review): Type III tests are usually run with sum-to-zero contrasts
# (e.g. C(weekday, Sum)); this cell uses the default coding -- confirm that is
# intended.
lm = ols(
    formula='sales ~ C(weekday)*C(weather)',
    data=data,
).fit()
table = sm.stats.anova_lm(lm, typ=3)  # Type 3 ANOVA table, returned as a DataFrame
print(table)
                             sum_sq   df             F         PR(>F)
Intercept              5.000000e+04  1.0  1.169633e+28   1.526163e-96
C(weekday)             9.799326e-23  1.0  2.292324e+01   1.994151e-03
C(weather)             8.167500e+07  1.0  1.910596e+31  8.663313e-108
C(weekday):C(weather)  2.646978e-24  1.0  6.191988e-01   4.571545e-01
Residual               2.992390e-23  7.0           NaN            NaN