Engel Curves: A Classic Quantile Regression Example¶
This notebook demonstrates quantile regression on Ernst Engel's 1857 data on food expenditure vs. household income. We fit multiple quantiles to reveal heteroscedasticity — wealthier households show much more variability in food spending.
In [ ]:
Copied!
import numpy as np
import matplotlib.pyplot as plt
from pinball import QuantileRegressor, summary
from pinball.datasets import load_engel
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)
import numpy as np
import matplotlib.pyplot as plt
from pinball import QuantileRegressor, summary
from pinball.datasets import load_engel
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)
Load the Data¶
In [ ]:
Copied!
engel = load_engel()
X, y = engel.data, engel.target
income = X.ravel()
print(f'n = {len(y)}, features = {engel.feature_names}')
print(f'Income range: [{income.min():.0f}, {income.max():.0f}]')
print(f'Expenditure range: [{y.min():.0f}, {y.max():.0f}]')
engel = load_engel()
X, y = engel.data, engel.target
income = X.ravel()
print(f'n = {len(y)}, features = {engel.feature_names}')
print(f'Income range: [{income.min():.0f}, {income.max():.0f}]')
print(f'Expenditure range: [{y.min():.0f}, {y.max():.0f}]')
Scatter Plot with OLS¶
In [ ]:
Copied!
from sklearn.linear_model import LinearRegression
ols = LinearRegression().fit(X, y)
x_grid = np.linspace(income.min(), income.max(), 200).reshape(-1, 1)
plt.scatter(income, y, alpha=0.4, s=20, label='Data')
plt.plot(x_grid, ols.predict(x_grid), 'k--', lw=2, label='OLS (mean)')
plt.xlabel('Household Income')
plt.ylabel('Food Expenditure')
plt.title('Engel Data \u2014 OLS captures only the mean')
plt.legend()
plt.show()
from sklearn.linear_model import LinearRegression
ols = LinearRegression().fit(X, y)
x_grid = np.linspace(income.min(), income.max(), 200).reshape(-1, 1)
plt.scatter(income, y, alpha=0.4, s=20, label='Data')
plt.plot(x_grid, ols.predict(x_grid), 'k--', lw=2, label='OLS (mean)')
plt.xlabel('Household Income')
plt.ylabel('Food Expenditure')
plt.title('Engel Data \u2014 OLS captures only the mean')
plt.legend()
plt.show()
Fit Multiple Quantiles¶
Quantile regression reveals how the entire distribution of food expenditure shifts with income.
In [ ]:
Copied!
taus = [0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95]
colours = plt.cm.RdYlBu_r(np.linspace(0.1, 0.9, len(taus)))
plt.figure(figsize=(10, 6))
plt.scatter(income, y, alpha=0.3, s=15, color='grey')
for tau, col in zip(taus, colours):
model = QuantileRegressor(tau=tau, method='fn')
model.fit(X, y)
y_hat = model.predict(x_grid)
plt.plot(x_grid, y_hat, color=col, lw=2,
label=f'tau={tau:.2f} (slope={model.coef_[0]:.3f})')
plt.xlabel('Household Income')
plt.ylabel('Food Expenditure')
plt.title('Quantile Regression Fan \u2014 Engel Data')
plt.legend(loc='upper left', fontsize=9)
plt.show()
taus = [0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95]
colours = plt.cm.RdYlBu_r(np.linspace(0.1, 0.9, len(taus)))
plt.figure(figsize=(10, 6))
plt.scatter(income, y, alpha=0.3, s=15, color='grey')
for tau, col in zip(taus, colours):
model = QuantileRegressor(tau=tau, method='fn')
model.fit(X, y)
y_hat = model.predict(x_grid)
plt.plot(x_grid, y_hat, color=col, lw=2,
label=f'tau={tau:.2f} (slope={model.coef_[0]:.3f})')
plt.xlabel('Household Income')
plt.ylabel('Food Expenditure')
plt.title('Quantile Regression Fan \u2014 Engel Data')
plt.legend(loc='upper left', fontsize=9)
plt.show()
Interpretation¶
The slopes diverge: higher quantiles have steeper slopes. This means:
- At the 10th percentile, each extra unit of income increases food expenditure modestly.
- At the 90th percentile, the same income increment raises food expenditure much more.
OLS misses this entirely \u2014 it reports a single slope for the conditional mean.
Inference: Standard Errors and Confidence Intervals¶
In [ ]:
Copied!
model = QuantileRegressor(tau=0.5, method='fn')
model.fit(X, y)
result = summary(model, X, y, se='nid')
print(result)
model = QuantileRegressor(tau=0.5, method='fn')
model.fit(X, y)
result = summary(model, X, y, se='nid')
print(result)
Coefficient Comparison Across Quantiles¶
In [ ]:
Copied!
slopes = []
ci_lo = []
ci_hi = []
for tau in taus:
model = QuantileRegressor(tau=tau, method='fn')
model.fit(X, y)
result = summary(model, X, y, se='nid')
slopes.append(result.coefficients[1])
ci_lo.append(result.conf_int[1, 0])
ci_hi.append(result.conf_int[1, 1])
slopes = np.array(slopes)
ci_lo = np.array(ci_lo)
ci_hi = np.array(ci_hi)
plt.figure(figsize=(8, 5))
plt.fill_between(taus, ci_lo, ci_hi, alpha=0.2, color='steelblue')
plt.plot(taus, slopes, 'o-', color='steelblue', lw=2, label='QR slope')
plt.axhline(ols.coef_[0], color='black', ls='--', label='OLS slope')
plt.xlabel('Quantile (tau)')
plt.ylabel('Slope (income coefficient)')
plt.title('Income Coefficient Across Quantiles')
plt.legend()
plt.show()
slopes = []
ci_lo = []
ci_hi = []
for tau in taus:
model = QuantileRegressor(tau=tau, method='fn')
model.fit(X, y)
result = summary(model, X, y, se='nid')
slopes.append(result.coefficients[1])
ci_lo.append(result.conf_int[1, 0])
ci_hi.append(result.conf_int[1, 1])
slopes = np.array(slopes)
ci_lo = np.array(ci_lo)
ci_hi = np.array(ci_hi)
plt.figure(figsize=(8, 5))
plt.fill_between(taus, ci_lo, ci_hi, alpha=0.2, color='steelblue')
plt.plot(taus, slopes, 'o-', color='steelblue', lw=2, label='QR slope')
plt.axhline(ols.coef_[0], color='black', ls='--', label='OLS slope')
plt.xlabel('Quantile (tau)')
plt.ylabel('Slope (income coefficient)')
plt.title('Income Coefficient Across Quantiles')
plt.legend()
plt.show()
The rising slope with tau confirms heteroscedasticity.
Bootstrap Inference¶
In [ ]:
Copied!
from pinball import bootstrap
model = QuantileRegressor(tau=0.5, method='fn')
model.fit(X, y)
bs = bootstrap(model, X, y, method='xy-pair', nboot=500, random_state=42)
print(f'Bootstrap SE: {bs.std_errors}')
print(f'95% CI: {bs.conf_int}')
plt.figure(figsize=(7, 4))
plt.hist(bs.boot_coefficients[:, 1], bins=40, alpha=0.7, color='steelblue')
plt.axvline(bs.coefficients[1], color='red', ls='--', lw=2, label='Point estimate')
plt.xlabel('Income coefficient')
plt.title('Bootstrap Distribution (500 xy-pair replicates)')
plt.legend()
plt.show()
from pinball import bootstrap
model = QuantileRegressor(tau=0.5, method='fn')
model.fit(X, y)
bs = bootstrap(model, X, y, method='xy-pair', nboot=500, random_state=42)
print(f'Bootstrap SE: {bs.std_errors}')
print(f'95% CI: {bs.conf_int}')
plt.figure(figsize=(7, 4))
plt.hist(bs.boot_coefficients[:, 1], bins=40, alpha=0.7, color='steelblue')
plt.axvline(bs.coefficients[1], color='red', ls='--', lw=2, label='Point estimate')
plt.xlabel('Income coefficient')
plt.title('Bootstrap Distribution (500 xy-pair replicates)')
plt.legend()
plt.show()