Stanford STATS191 in Python, Lecture 7 : Interactions and qualitative variables (Part 2)
Stanford Stats 191¶
Introduction¶
This is a re-creation of the Stanford Stats 191 course (see https://web.stanford.edu/class/stats191/), using Python eco-system tools, instead of R. This is lecture "Interactions and qualitative variables", Part 2
Initial Notebook Setup¶
watermark
documents the current Python and package environment, black
is my preferred Python formatter
%load_ext watermark
%load_ext lab_black
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.formula.api import rlm
import statsmodels.api as sm
import warnings
Load and Explore Example Dataset¶
Variable | Description |
---|---|
TEST | Job aptitude test score |
MINORITY | 1 if applicant could be considered minority, 0 otherwise |
JPERF | Job performance evaluation |
# Load the tab-separated job-test dataset and order rows by test score,
# so later line plots of predictions draw cleanly left-to-right.
data = pd.read_csv('../data/jobtest.txt', sep='\t').sort_values(by='TEST')
data.head()
# Confirm MINORITY is a binary indicator (expected values: 0 and 1)
data['MINORITY'].unique()
We change the color and symbol used to distinguish observations with different Minority status (we only need two different colors and two different marker symbols)
# Scatter plot of job performance vs test score, distinguishing the two
# MINORITY groups by both colour (hue) and marker shape (style).
# NOTE: seaborn 0.12+ removed positional x/y arguments, so they are
# passed as keywords here; the plot produced is unchanged.
fig, ax = plt.subplots(figsize=(8, 8))
sn.scatterplot(
    x='TEST',
    y='JPERF',
    data=data,
    hue='MINORITY',
    ax=ax,
    alpha=0.99,
    s=100,
    markers=['o', 'v'],  # one marker per MINORITY level
    palette=['purple', 'green'],  # one colour per MINORITY level
    style='MINORITY',
)
Model 1, TEST Only¶
First, we create a model where Job Performance depends only on the Test Scores (equivalent to beta-2, beta-3 = 0 in the equation above)
# Model 1: job performance explained by test score alone
model1_formula = 'JPERF ~ TEST'
res1 = ols(model1_formula, data=data).fit()
res1.summary()
Display the line of best fit (not looking as if this model is wonderful!)
# Scatter of observations plus Model 1's single fitted regression line.
# NOTE: seaborn 0.12+ removed positional x/y arguments, so they are
# passed as keywords here; the plot produced is unchanged.
fig, ax = plt.subplots(figsize=(8, 8))
sn.scatterplot(
    x='TEST',
    y='JPERF',
    data=data,
    hue='MINORITY',
    ax=ax,
    alpha=0.99,
    s=200,
    markers=['o', 'v'],
    palette=['purple', 'green'],
    style='MINORITY',
)
# data was sorted by TEST at load time, so plotting predictions in row
# order draws one continuous left-to-right line
y1 = res1.predict()
ax.plot(data['TEST'], y1, 'r-', label='Predicted')
ax.grid()
ax.legend(loc='best')
Display the residuals. At least they appear to have constant variance
# Model 1 residuals against TEST score, with a red zero line for reference
model1_resid = res1.resid
plt.plot(data['TEST'], model1_resid, 'o')
plt.axhline(0, color='r')
Model 2, TEST and Minority Status¶
We now build a model where Job Performance depends upon Test Scores and Minority Status. This model says we have the same slope of the JPERF
- TEST
line for each Minority Status (but each Minority Status line has a different intercept)
This is equivalent to setting beta-3 = 0 in our general equation above
# Model 2 (additive): common TEST slope, separate intercept per MINORITY group
model2_formula = 'JPERF ~ TEST + MINORITY'
res2 = ols(model2_formula, data=data).fit()
res2.summary()
Plotting the two lines our model gives us look OK, but the residuals plot does not look to be significantly better
# Scatter of observations plus Model 2's two parallel fitted lines
# (same slope, different intercept per MINORITY group).
# NOTE: seaborn 0.12+ removed positional x/y arguments, so they are
# passed as keywords here; the plot produced is unchanged.
fig, ax = plt.subplots(figsize=(8, 8))
sn.scatterplot(
    x='TEST',
    y='JPERF',
    data=data,
    hue='MINORITY',
    ax=ax,
    alpha=0.99,
    s=200,
    markers=['o', 'v'],
    palette=['purple', 'green'],
    style='MINORITY',
)
data['PREDICT'] = res2.predict()
# Split the (TEST-sorted) predictions by group so each group draws
# its own line, colour-matched to the scatter palette above
mask0 = data['MINORITY'] == 0
mask1 = data['MINORITY'] == 1
y1 = data[mask0]['PREDICT']
y2 = data[mask1]['PREDICT']
ax.plot(
    data[mask0]['TEST'],
    y1,
    color='purple',
    label='Predicted (M=0)',
)
ax.plot(
    data[mask1]['TEST'], y2, 'g-', label='Predicted (M=1)'
)
ax.grid()
ax.legend(loc='best')
# Model 2 residuals against TEST score, with a red zero line for reference
model2_resid = res2.resid
plt.plot(data['TEST'], model2_resid, 'o')
plt.axhline(0, color='r')
Model 3, TEST with MINORITY Interaction¶
In our third model, we allow each Minority Status group to have a different slope but the same intercept in the TEST
- JPERF
line, equivalent to beta-2 = 0
# Model 3 (interaction only): common intercept, per-group TEST slope
res3 = ols(
    'JPERF ~ TEST + TEST:MINORITY', data=data
).fit()
res3.summary()
Subjectively, this is a better model (especially for Minority Status = 0 observations). However, the beauty of statistics is that we don't have to rely on subjective assessment!
The code below adds the TEST
= 0 point to each line, to show they have the same intercept
# Scatter of observations plus Model 3's two fitted lines (shared
# intercept, different slopes). Each line is extended back to TEST = 0
# to make the shared intercept visible.
# NOTE: seaborn 0.12+ removed positional x/y arguments, so they are
# passed as keywords here; the plot produced is unchanged.
fig, ax = plt.subplots(figsize=(8, 8))
sn.scatterplot(
    x='TEST',
    y='JPERF',
    data=data,
    hue='MINORITY',
    ax=ax,
    alpha=0.99,
    s=200,
    markers=['o', 'v'],
    palette=['purple', 'green'],
    style='MINORITY',
)
data['PREDICT'] = res3.predict()
mask0 = data['MINORITY'] == 0
mask1 = data['MINORITY'] == 1
y1 = data[mask0]['PREDICT']
y2 = data[mask1]['PREDICT']
# Prepend the (0, intercept) point so both lines visibly meet at TEST = 0
ax.plot(
    [0] + list(data[mask0]['TEST']),
    [res3.params['Intercept']] + list(y1),
    'b-',
    label='Predicted (M=0)',
)
ax.plot(
    [0] + list(data[mask1]['TEST']),
    [res3.params['Intercept']] + list(y2),
    'y-',
    label='Predicted (M=1)',
)
ax.grid()
ax.legend(loc='best')
We plot the residuals
# Model 3 residuals against TEST score, with a red zero line for reference
model3_resid = res3.resid
plt.plot(data['TEST'], model3_resid, 'o')
plt.axhline(0, color='r')
Model 4, All Possible Contributions to Linear Model¶
Our last model has no constraints on the beta values
# Model 4 (full): in an ols formula, `a * b` expands to a + b + a:b,
# so this fits both main effects plus the interaction term.
full_formula = 'JPERF ~ TEST * MINORITY'
res4 = ols(full_formula, data=data).fit()
res4.summary()
# Scatter of observations plus Model 4's two fitted lines (each group
# has its own slope and its own intercept).
# NOTE: seaborn 0.12+ removed positional x/y arguments, so they are
# passed as keywords here; the plot produced is unchanged.
fig, ax = plt.subplots(figsize=(8, 8))
sn.scatterplot(
    x='TEST',
    y='JPERF',
    data=data,
    hue='MINORITY',
    ax=ax,
    alpha=0.99,
    s=200,
    markers=['o', 'v'],
    palette=['purple', 'green'],
    style='MINORITY',
)
data['PREDICT'] = res4.predict()
mask0 = data['MINORITY'] == 0
mask1 = data['MINORITY'] == 1
y1 = data[mask0]['PREDICT']
y2 = data[mask1]['PREDICT']
ax.plot(
    list(data[mask0]['TEST']),
    list(y1),
    'b-',
    label='Predicted (M=0)',
)
ax.plot(
    list(data[mask1]['TEST']),
    list(y2),
    'y-',
    label='Predicted (M=1)',
)
ax.grid()
ax.legend(loc='best')
# Model 4 residuals against TEST score. Add the red zero reference line
# for consistency with the residual plots of Models 1-3 above.
plt.plot(data['TEST'], res4.resid, 'o')
plt.axhline(0, color='r')
It is interesting to see the Mean Square Error in the Residuals for each model
# Compare the four models by mean squared residual error.
# NOTE: seaborn 0.12+ removed positional x/y arguments in barplot,
# so they are passed as keywords here; the plot produced is unchanged.
res = [res1, res2, res3, res4]
fig, ax = plt.subplots(figsize=(8, 8))
sn.barplot(
    x=['T', 'T+M', 'T+T:M', 'T*M'],  # shorthand model-formula labels
    y=[a.mse_resid for a in res],
    ax=ax,
)
ax.set_xlabel('Model Formula')
ax.set_ylabel('Mean Square Residuals')
Analysis of Variance¶
We now compare each model against another, to see if there is a significantly better explanation for the variance
TEST vs TEST * MINORITY¶
Not significant at the 5% level
# F-test of TEST-only against the full TEST * MINORITY model, with the
# warning that anova_lm emits for this comparison suppressed.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    anova_table = sm.stats.anova_lm(res1, res4)
# end with
anova_table
TEST vs TEST + MINORITY¶
Not significant
# F-test: does adding the MINORITY intercept term improve on TEST alone?
sm.stats.anova_lm(res1, res2)
TEST + TEST:MINORITY vs TEST * MINORITY¶
Not significant
# F-test: does the full model improve on the interaction-only model?
sm.stats.anova_lm(res3, res4)
TEST vs TEST + TEST:MINORITY¶
Significant at the 5% level
# F-test: does adding the TEST:MINORITY interaction improve on TEST alone?
sm.stats.anova_lm(res1, res3)
TEST + MINORITY vs TEST * MINORITY¶
Not significant at the 5% level
# F-test: does the full model (TEST * MINORITY) improve on TEST + MINORITY?
sm.stats.anova_lm(res2, res4)
Finally we plot the residuals of all the models together
# Overlay the residuals of all four models against observation index,
# one marker style and legend label per model.
obs_index = range(len(data))
fitted_models = [
    (res1, 'ro', 'T'),
    (res2, 'g+', 'T+M'),
    (res3, 'kd', 'T+T:M'),
    (res4, 'b*', 'T*M'),
]
for fit, fmt, lbl in fitted_models:
    _ = plt.plot(obs_index, fit.resid, fmt, label=lbl)
_ = plt.legend(loc='best')
Environment¶
%watermark -h -iv
%watermark