Stanford STATS191 in Python, Lecture 9 : Transformations and Weighted Least Squares
Stanford Stats 191¶
Introduction¶
This is a re-creation of the Stanford Stats 191 course (see https://web.stanford.edu/class/stats191/), using Python eco-system tools, instead of R. This is lecture "Transformations and Weighted Least Squares"
Initial Notebook Setup¶
watermark
documents the current Python and package environment, black
is my preferred Python formatter
%load_ext watermark
%load_ext lab_black
%matplotlib inline
All imports go here (not all are used in this Notebook)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.formula.api import rlm
from statsmodels.formula.api import mixedlm
from statsmodels.formula.api import wls
import statsmodels.api as sm
from patsy import dmatrices
from patsy.contrasts import Treatment
from patsy.contrasts import Sum
from patsy.contrasts import Diff
import warnings
Nonlinear Models¶
We look at a dataset where a linear model is not appropriate
Load and Explore Dataset¶
# Load the bacteria dataset (tab separated) and take a first look
data = pd.read_csv('../data/bacteria.txt', sep='\t')
data.head()

# Scatter the raw counts against time with diamond markers
fig, ax = plt.subplots(figsize=(8, 8))
times = data['t']
counts = data['N_t']
ax.plot(times, counts, 'd')
ax.set_xlabel('Time')
ax.set_ylabel('Count')
Fit a Linear Model¶
# Ordinary least squares: regress count on time
linear_model = ols('N_t ~ t', data=data)
res1 = linear_model.fit()
res1.summary()
Visualize the fit of our linear model to the data
# Overlay the fitted straight line on the observed data points
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(data['t'], data['N_t'], 'd', label='Data')
fitted = res1.predict()
ax.plot(data['t'], fitted, 'r-', label='Linear Prediction')
ax.set_xlabel('Time')
ax.set_ylabel('Count')
ax.legend(loc='best')
Regression Diagnostics¶
The STATS191 lecture notes have a quartet of plots that can be used to assess linear model quality. We define a function to show these, given the RegressionResults object from an ols
model fit
def show_regression_diagnostic_plots(res1):
    '''
    Display the standard quartet of linear-regression diagnostic plots.

    The four panels follow the STATS191 lecture notes
    (https://web.stanford.edu/class/stats191/notebooks/Transformations_WLS.html):
    residuals vs fitted, normal QQ plot, scale-location, and
    standardized residuals vs leverage.

    Parameters
    ----------
    res1 : statsmodels RegressionResults
        The fitted model to diagnose.

    Returns
    -------
    matplotlib Figure object containing the four plots (one per Axes).

    Notes
    -----
    In a Jupyter Notebook a bare call displays the figure twice (once as
    a side effect, once as the displayed return value); assign the
    return value to a variable to see it only once.
    '''
    fig, ax = plt.subplots(
        nrows=2,
        ncols=2,
        figsize=(10, 10),
        constrained_layout=True,
    )
    # Hoist values shared by several panels
    fitted_values = res1.predict()
    influence = res1.get_influence()

    # Top right: QQ plot of raw residuals against a normal distribution
    qq_ax = ax[0, 1]
    sm.qqplot(res1.resid, line='s', ax=qq_ax)
    qq_ax.set_title('Normal QQ Plot')

    # Top left: residuals vs fitted, to spot correlation between
    # residuals and fitted values (departure from model assumptions)
    rvf_ax = ax[0, 0]
    rvf_ax.plot(fitted_values, res1.resid, 'rd')
    rvf_ax.set_xlabel('Fitted Values')
    rvf_ax.set_ylabel('Residuals')
    rvf_ax.axhline(0, color='k')
    rvf_ax.set_title('Residuals vs Fitted')

    # Bottom right: internally studentized residuals vs leverage
    lev_ax = ax[1, 1]
    lev_ax.plot(
        influence.hat_matrix_diag,
        influence.resid_studentized_internal,
        'd',
    )
    lev_ax.set_xlabel('Leverage')
    lev_ax.set_ylabel('Standardised Residuals')
    lev_ax.set_title('Residuals vs Leverage')

    # Bottom left: scale-location plot, externally studentized
    # residuals on a sqrt(abs(...)) scale
    sl_ax = ax[1, 0]
    spread = np.sqrt(
        np.abs(influence.resid_studentized_external)
    )
    sl_ax.plot(fitted_values, spread, 'rd', label='Data')
    sl_ax.set_xlabel('Fitted Values')
    sl_ax.set_ylabel(
        'sqrt(abs(Externally Studentized Residual))'
    )
    sl_ax.set_title('Scale-Location')
    return fig


# end
# Display the diagnostic quartet for the linear model; assigning the
# return value avoids the figure being rendered twice in the notebook
fig = show_regression_diagnostic_plots(res1)
We see that residuals are clearly not distributed uniformly (Residuals vs Fitted graph).
Further, if we look at the statsmodels
influence and leverage graphs, we see a clear outlier
# statsmodels influence plot: studentized residuals vs leverage,
# with point size reflecting each observation's influence
fig, ax = plt.subplots(figsize=(6, 6))
_ = sm.graphics.influence_plot(res1, ax=ax)

# Leverage vs normalized squared residuals
fig, ax = plt.subplots(figsize=(6, 6))
_ = sm.graphics.plot_leverage_resid2(res1, ax=ax)
Fit Exponential¶
At this stage we decide that a better model might be that of exponential decay, where the log of N_t
might be linear in time.
Note that we can use numpy
functions inside the ols
formula
# Log-transform the response inside the formula: numpy functions are
# legal in patsy formulas, so this fits log(N_t) ~ t directly
log_model = ols('np.log(N_t) ~ t', data=data)
res2 = log_model.fit()
res2.summary()
Assess Exponential Model¶
Plotting the model predictions against actuals in two different ways, we can see we have a much better fit (also the R^2 value is much closer to 1.0)
# Compare both models against the raw data on the original scale;
# the exponential fit must be back-transformed with exp()
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(data['t'], data['N_t'], 'o', label='Data')
exp_fit = np.exp(res2.predict())
ax.plot(data['t'], exp_fit, 'g-', label='Exp. Fit')
ax.plot(data['t'], res1.predict(), 'r-', label='Linear Fit')
ax.legend(loc='best')
ax.set_xlabel('Time')
ax.set_ylabel('Count')
# Same comparison on the log scale: the exponential model is a straight
# line here, while the linear fit must be log-transformed
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(data['t'], np.log(data['N_t']), 'o', label='Data')
ax.plot(data['t'], res2.predict(), 'g-', label='Exp. Fit')
log_linear_fit = np.log(res1.predict())
ax.plot(data['t'], log_linear_fit, 'r-', label='Linear Fit')
ax.legend(loc='best')
ax.set_xlabel('Time')
ax.set_ylabel('Log Count (ln)')
The Diagnostic Graphs look better too (especially Residuals vs Fitted)
# Diagnostic quartet plus influence/leverage plots for the log model
fig = show_regression_diagnostic_plots(res2)

fig, ax = plt.subplots(figsize=(6, 6))
_ = sm.graphics.influence_plot(res2, ax=ax)

fig, ax = plt.subplots(figsize=(6, 6))
_ = sm.graphics.plot_leverage_resid2(res2, ax=ax)
Non-Constant Variance¶
We now look at a dataset, with an error term that is not constant from observation to observation (see below for description of the dataset)
| Variable | Description |
|---|---|
| Y | Per capita education expenditure by state |
| X1 | Per capita income in 1973 by state |
| X2 | Proportion of population under 18 |
| X3 | Proportion in urban areas |
| Region | Which region of the country are the states located in |
Read and Explore Dataset¶
# Load the 1975 state education expenditure dataset (tab separated)
education_file = '../data/education1975.txt'
data2 = pd.read_csv(education_file, sep='\t')
data2.head()
Annoyingly, we have trailing spaces in the column names, so let's fix those
# Inspect the raw column names, strip the trailing whitespace from each,
# then confirm the cleaned names and summarize the data
data2.columns
data2.columns = [name.strip() for name in data2.columns]
data2.head(1)
data2.describe()
Fit Linear Model¶
# Multiple regression of expenditure on income, youth and urban shares
expenditure_model = ols('Y ~ X1 + X2 + X3', data=data2)
res3 = expenditure_model.fit()
res3.summary()
Assess Model¶
The Residuals vs Fitted graph shows a clear increase in residual, as the fitted value increases (i.e. variance of the error term is not constant)
# Diagnostic quartet for the education-expenditure linear model
fig = show_regression_diagnostic_plots(res3)
Further, we appear to have an outlier observation (the State of Alaska, code AK)
fig, ax = plt.subplots(figsize=(6, 6))
_ = sm.graphics.influence_plot(res3, ax=ax)

# Inspect row 48, the outlier flagged in the influence plot
data2.iloc[48]

fig, ax = plt.subplots(figsize=(6, 6))
_ = sm.graphics.plot_leverage_resid2(res3, ax=ax)
We now visualize the residuals (standardized) by region, and it appears that each region has a different variance of the error term
# Boxplot of internally studentized residuals, grouped by Region.
# Pass x / y by keyword: seaborn deprecated positional data arguments
# in 0.12 and removed them in 0.13, so the positional form now raises
fig, ax = plt.subplots(figsize=(6, 6))
influence = res3.get_influence()
standardized_residuals_int = (
    influence.resid_studentized_internal
)
sns.boxplot(
    x='Region',
    y=standardized_residuals_int,
    data=data2,
    ax=ax,
)
ax.set_ylabel('Standardized Residuals')
ax.set_title('Standardized Residuals by Region')
Annoyingly, we find the state codes also have a trailing space, so fix these
# Bracket the raw state code with '*' sentinels to expose the trailing
# space, strip all codes in place, then show the cleaned value
f"*{data2['STATE'].iloc[48]}*"
data2['STATE'] = data2['STATE'].str.strip()
f"*{data2['STATE'].iloc[48]}*"
We now clean the dataset, by excluding the state of Alaska, and making a copy of the DataFrame.
We then fit a linear model again
# Drop Alaska (the outlier) and refit on a copy of the remaining rows
keep_mask = data2['STATE'] != 'AK'
data2_clean = data2[keep_mask].copy()
print(f'Clean data items {len(data2_clean)}')
res4 = ols('Y ~ X1 + X2 + X3', data=data2_clean).fit()
res4.summary()
Assess Model¶
Cleaning the data didn't fix the non-constant variance of our residuals
# Underscore assignment suppresses the duplicate figure display
_ = show_regression_diagnostic_plots(res4)
Residuals by Region still show inter-Region difference
# Boxplot of studentized residuals by Region for the cleaned fit.
# Pass x / y by keyword: seaborn deprecated positional data arguments
# in 0.12 and removed them in 0.13, so the positional form now raises
fig, ax = plt.subplots(figsize=(6, 6))
influence = res4.get_influence()
standardized_residuals_int = (
    influence.resid_studentized_internal
)
sns.boxplot(
    x='Region',
    y=standardized_residuals_int,
    data=data2_clean,
    ax=ax,
)
ax.set_title('Standardized Residuals by Region')
Weighted Least Squares¶
We define a new model, where we standardize the variance by weighting with the value of Y (this assumes a model of the error term as N(0, eps * Y)). We assume observations within each Region have a constant-variance error term
weights
array will hold the (1/variance) of the error term (in our model)
# weights[i] will hold 1/variance of the error term for observation i
weights = np.zeros(len(data2_clean))
Get list of unique Region codes
# Distinct Region codes present in the cleaned data
regions = data2_clean['Region'].unique()
regions
For each Region, compute an estimate of the variance of the error term for that region, and fill in the weights for all observations from that region
# For each Region, estimate the error-term variance as the mean squared
# OLS residual within that Region, and store its reciprocal as the WLS
# weight for every observation in the Region
for r in regions:
    region_mask = data2_clean['Region'] == r
    region_count = sum(region_mask)
    # Use the count already computed above (the original recomputed
    # sum(region_mask) here and left region_count unused)
    value = sum(res4.resid[region_mask] ** 2) / region_count
    # Weight is the reciprocal of the estimated variance
    value = 1.0 / value
    weights[region_mask] = value
    print(value)
# end for
Note that the value of the weights above agree with the STATS191 lecture notes
Fit Model with Weights¶
# Weighted least squares with the per-Region reciprocal-variance weights
wls_model = wls(
    'Y ~ X1 + X2 + X3', data=data2_clean, weights=weights
)
res5 = wls_model.fit()
res5.summary()
The results above agree with the STATS191 results in the first two decimal places (I suspect the warning:
The condition number is large, 1.05e+05. This might indicate that there are strong multicollinearity or other numerical problems.
should be heeded)
To check the weights, you can get them via the model
attribute of the RegressionResults
# The weights used in the fit are exposed on the model object
res5.model.weights[0:5]
It appears that after fitting a Weighted Least Squares model, Pearson Residuals are a better way of standardising residuals. The boxplot below shows that we have accounted for much of the difference between Regions in our model, but not all.
# Boxplot of Pearson residuals by Region for the WLS fit.
# Pass x / y by keyword: seaborn deprecated positional data arguments
# in 0.12 and removed them in 0.13, so the positional form now raises
fig, ax = plt.subplots(figsize=(6, 6))
sns.boxplot(
    x='Region', y=res5.resid_pearson, data=data2_clean, ax=ax
)
ax.set_title('Pearson Residuals by Region')
ax.set_ylabel('Pearson Residuals')
The Residuals vs Fitted plots are very much the same
def show_regression_diagnostic_plots_wls(res1):
    '''
    Display a reduced pair of diagnostic plots for a WLS fit.

    Shows only the residuals-vs-fitted panel and the normal QQ plot,
    following the STATS191 lecture notes
    (https://web.stanford.edu/class/stats191/notebooks/Transformations_WLS.html).

    Parameters
    ----------
    res1 : statsmodels RegressionResults
        Result of a wls(...).fit() call.

    Returns
    -------
    matplotlib Figure object containing the two plots (one per Axes).

    Notes
    -----
    In a Jupyter Notebook a bare call displays the figure twice; assign
    the return value to a variable to suppress the duplicate display.

    No influence/leverage plots are produced, because statsmodels does
    not support them for WLS.  Quoting the statsmodels documentation:
    The influence and outlier statistics have not been extended to other
    models [including wls], mostly for a lack of reference to the
    statistical literature that explicitly handles this case, and for
    not knowing of a reference implementation in another package that
    could be used for the unit tests.
    '''
    fig, ax = plt.subplots(
        nrows=1,
        ncols=2,
        figsize=(10, 5),
        constrained_layout=True,
    )

    # Right panel: QQ plot of the raw residuals
    qq_ax = ax[1]
    sm.qqplot(res1.resid, line='s', ax=qq_ax)
    qq_ax.set_title('Normal QQ Plot')

    # Left panel: residuals vs fitted, to check for remaining structure
    rvf_ax = ax[0]
    rvf_ax.plot(res1.predict(), res1.resid, 'rd')
    rvf_ax.set_xlabel('Fitted Values')
    rvf_ax.set_ylabel('Residuals')
    rvf_ax.axhline(0, color='k')
    rvf_ax.set_title('Residuals vs Fitted')
    return fig


# end
# Underscore assignment prevents the returned Figure displaying twice
_ = show_regression_diagnostic_plots_wls(res5)
Environment¶
%watermark -h -iv
%watermark
# Report statsmodels and dependency versions for reproducibility
sm.show_versions()