Nonparametric Quantile Estimation via Quantization¶
Linear quantile regression assumes the conditional quantile is a linear function of the covariates. When the true relationship is nonlinear, we need a different approach.
The QuantizationQuantileEstimator uses optimal quantization to
partition the covariate space into Voronoi cells, estimate quantiles
within each cell, and average over multiple random partitions.
Reference: Charlier, Paindaveine & Saracco (2015). Conditional quantile estimation through optimal quantization. JSPI 156, 14–30.
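The core idea can be sketched in a few lines of NumPy. This is a simplified single-pass version for a 1-D covariate (random centroids drawn from the data, nearest-centroid cells, within-cell empirical quantile, averaged over grids); the actual estimator refines each grid by stochastic optimal quantization, but the structure is the same:

```python
import numpy as np

def quantization_quantile(X, y, x_new, tau, N, n_grids=10, seed=0):
    """Toy sketch: average, over random grids, the within-cell
    empirical tau-quantile of y at x_new's nearest centroid."""
    rng = np.random.default_rng(seed)
    n = len(X)
    preds = []
    for _ in range(n_grids):
        # random "grid" of N centroids drawn from the data
        centroids = X[rng.choice(n, size=N, replace=False)]
        # assign each observation to its nearest centroid (Voronoi cell)
        cells = np.argmin(np.abs(X[:, None] - centroids[None, :]), axis=1)
        # empirical tau-quantile of y within each nonempty cell
        q = np.array([np.quantile(y[cells == k], tau)
                      if np.any(cells == k) else np.nan
                      for k in range(N)])
        preds.append(q[np.argmin(np.abs(x_new - centroids))])
    return np.nanmean(preds)
```

Averaging over `n_grids` random partitions smooths out the arbitrariness of any single partition, which is what makes the estimate stable.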
import numpy as np
import matplotlib.pyplot as plt
from pinball import QuantileRegressor
from pinball.nonparametric.quantization import QuantizationQuantileEstimator
plt.style.use("seaborn-v0_8-whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)
rng = np.random.default_rng(42)
Synthetic Nonlinear Data¶
We generate data from a sinusoidal model with heteroscedastic noise:
$$Y = \sin(X) + 0.3 \cdot (1 + 0.5X) \cdot \varepsilon, \quad X \sim U(0, 2\pi)$$
The conditional quantiles are nonlinear — linear methods cannot capture them.
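Because ε is standard normal, the true conditional quantiles have a closed form, $q_\tau(x) = \sin(x) + 0.3\,(1 + 0.5x)\,\Phi^{-1}(\tau)$, which gives us an exact ground truth to compare against (assuming SciPy is available for the normal quantile function):

```python
import numpy as np
from scipy.stats import norm

def true_quantile(x, tau):
    # q_tau(x) = sin(x) + 0.3 * (1 + 0.5*x) * Phi^{-1}(tau)
    return np.sin(x) + 0.3 * (1.0 + 0.5 * x) * norm.ppf(tau)
```

Note that at τ = 0.5 this reduces to sin(x), since Φ⁻¹(0.5) = 0, so the true median coincides with the true mean here.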
n = 800
X_1d = rng.uniform(0, 2 * np.pi, n)
noise_scale = 0.3 * (1 + 0.5 * X_1d)
y = np.sin(X_1d) + noise_scale * rng.normal(size=n)
# Reshape for sklearn interface
X = X_1d.reshape(-1, 1)
# Sort for plotting
order = np.argsort(X_1d)
plt.scatter(X_1d, y, alpha=0.3, s=10, color="grey")
plt.plot(X_1d[order], np.sin(X_1d[order]), "k-", lw=2, label="True mean")
plt.xlabel("X")
plt.ylabel("Y")
plt.title("Nonlinear Data with Heteroscedastic Noise")
plt.legend()
plt.show()
Linear vs. Nonparametric: Median Regression¶
Let's compare the linear and nonparametric estimators at τ = 0.5.
# Fit both models
linear_model = QuantileRegressor(tau=0.5, method="fn")
linear_model.fit(X, y)
np_model = QuantizationQuantileEstimator(
tau=0.5, N=30, n_grids=50, random_state=42
)
np_model.fit(X, y)
# Predict on a fine grid
x_grid = np.linspace(0, 2 * np.pi, 300).reshape(-1, 1)
y_linear = linear_model.predict(x_grid)
y_np = np_model.predict(x_grid)
plt.figure(figsize=(10, 6))
plt.scatter(X_1d, y, alpha=0.2, s=10, color="grey", label="Data")
plt.plot(x_grid, np.sin(x_grid), "k-", lw=2, label="True median")
plt.plot(x_grid, y_linear, "--", color="tab:red", lw=2, label="Linear QR")
plt.plot(x_grid, y_np, "-", color="tab:blue", lw=2, label="Quantization QR")
plt.xlabel("X")
plt.ylabel("Y")
plt.title("Median Regression: Linear vs. Nonparametric")
plt.legend()
plt.show()
The linear model fits a straight line (completely missing the curvature), while the quantization estimator tracks the sinusoidal pattern closely.
Multiple Quantiles with the Nonparametric Estimator¶
We fit five quantile levels, τ ∈ {0.1, 0.25, 0.5, 0.75, 0.9}, to show how the conditional distribution varies nonlinearly with X.
taus = [0.1, 0.25, 0.5, 0.75, 0.9]
colours = plt.cm.RdYlBu_r(np.linspace(0.1, 0.9, len(taus)))
plt.figure(figsize=(10, 6))
plt.scatter(X_1d, y, alpha=0.15, s=8, color="grey")
for tau, col in zip(taus, colours):
model = QuantizationQuantileEstimator(
tau=tau, N=30, n_grids=50, random_state=42
)
model.fit(X, y)
y_hat = model.predict(x_grid)
plt.plot(x_grid, y_hat, color=col, lw=2, label=f"τ = {tau}")
plt.xlabel("X")
plt.ylabel("Y")
plt.title("Nonparametric Quantile Regression — Multiple Quantiles")
plt.legend()
plt.show()
Notice how the width between the quantile curves varies — wider where noise_scale is larger (right side of the plot). The nonparametric estimator captures both the nonlinear mean and the heteroscedastic spread.
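A quick Monte Carlo sanity check on the data-generating model itself (independent of the estimator): the fraction of observations falling below the closed-form τ-quantile curve should be close to τ at every level:

```python
import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(0)
n = 200_000
x = rng.uniform(0, 2 * np.pi, n)
y = np.sin(x) + 0.3 * (1 + 0.5 * x) * rng.normal(size=n)
for tau in (0.1, 0.5, 0.9):
    # closed-form tau-quantile of Y given X = x
    q = np.sin(x) + 0.3 * (1 + 0.5 * x) * norm.ppf(tau)
    print(f"tau={tau}: empirical coverage = {np.mean(y <= q):.3f}")
```

A well-calibrated nonparametric estimate should reproduce roughly the same coverage on held-out data.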
Effect of Grid Size N¶
The parameter N controls how finely the covariate space is partitioned. Small N gives a coarse approximation; large N can give noisy estimates (especially with limited data).
fig, axes = plt.subplots(1, 3, figsize=(15, 4), sharey=True)
for ax, N in zip(axes, [5, 20, 50]):
model = QuantizationQuantileEstimator(
tau=0.5, N=N, n_grids=50, random_state=42
)
model.fit(X, y)
y_hat = model.predict(x_grid)
ax.scatter(X_1d, y, alpha=0.15, s=5, color="grey")
ax.plot(x_grid, np.sin(x_grid), "k-", lw=1.5, label="Truth")
ax.plot(x_grid, y_hat, "-", color="tab:blue", lw=2, label=f"N={N}")
ax.set_title(f"N = {N}")
ax.set_xlabel("X")
ax.legend(fontsize=9)
axes[0].set_ylabel("Y")
plt.suptitle("Effect of Grid Size N on the Quantization Estimator", y=1.02)
plt.tight_layout()
plt.show()
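A quantitative way to pick N is held-out pinball (check) loss. The sketch below uses a single-grid k-means stand-in for the quantization estimator (an illustrative simplification, not the library's API) so it runs with scikit-learn alone:

```python
import numpy as np
from sklearn.cluster import KMeans

def pinball_loss(y_true, y_pred, tau):
    # check-function loss: rho_tau(u) = u * (tau - 1{u < 0})
    u = y_true - y_pred
    return np.mean(u * (tau - (u < 0)))

def cell_quantile_predict(X_tr, y_tr, X_te, tau, N, seed=0):
    # single-grid stand-in: k-means partition + within-cell tau-quantile
    km = KMeans(n_clusters=N, n_init=5, random_state=seed).fit(X_tr)
    cell_q = np.array([np.quantile(y_tr[km.labels_ == k], tau)
                       for k in range(N)])
    return cell_q[km.predict(X_te)]

rng = np.random.default_rng(0)
x = rng.uniform(0, 2 * np.pi, 800)
y = np.sin(x) + 0.3 * (1 + 0.5 * x) * rng.normal(size=800)
X_tr, X_te = x[:600, None], x[600:, None]
y_tr, y_te = y[:600], y[600:]
for N in (5, 20, 50):
    y_hat = cell_quantile_predict(X_tr, y_tr, X_te, 0.5, N)
    print(f"N={N}: held-out pinball loss = "
          f"{pinball_loss(y_te, y_hat, 0.5):.4f}")
```

The loss typically drops sharply from very coarse grids, then flattens or worsens as cells become too sparsely populated, which mirrors the bias-variance tradeoff in the panels above.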
Using in an sklearn Pipeline¶
The quantization estimator is fully sklearn-compatible and can be used in pipelines, cross-validation, and grid search.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
pipe = Pipeline([
("scale", StandardScaler()),
("quant", QuantizationQuantileEstimator(tau=0.5, N=25, random_state=42)),
])
from sklearn.metrics import make_scorer, mean_pinball_loss
# R² scores mean fit; for a quantile estimator, score with the pinball loss
pinball_scorer = make_scorer(mean_pinball_loss, alpha=0.5,
                             greater_is_better=False)
scores = cross_val_score(pipe, X, y, cv=5, scoring=pinball_scorer)
print(f"5-fold CV pinball losses: {-scores}")
print(f"Mean pinball loss = {-scores.mean():.3f} ± {scores.std():.3f}")
2D Example: Bivariate Covariates¶
The quantization estimator generalises naturally to multiple covariates.
# Generate 2D data: Y = sin(X1) * cos(X2) + noise
n2 = 1000
X2 = rng.uniform(0, 2 * np.pi, (n2, 2))
y2 = np.sin(X2[:, 0]) * np.cos(X2[:, 1]) + 0.3 * rng.normal(size=n2)
model_2d = QuantizationQuantileEstimator(
tau=0.5, N=40, n_grids=50, random_state=42
)
model_2d.fit(X2, y2)
# Predict on a grid
x1_grid = np.linspace(0, 2 * np.pi, 50)
x2_grid = np.linspace(0, 2 * np.pi, 50)
X1g, X2g = np.meshgrid(x1_grid, x2_grid)
X_pred = np.column_stack([X1g.ravel(), X2g.ravel()])
Y_pred = model_2d.predict(X_pred).reshape(X1g.shape)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# True surface
Y_true = np.sin(X1g) * np.cos(X2g)
im0 = axes[0].contourf(X1g, X2g, Y_true, levels=20, cmap="RdYlBu_r")
axes[0].set_title("True Median Surface")
axes[0].set_xlabel("X₁")
axes[0].set_ylabel("X₂")
plt.colorbar(im0, ax=axes[0])
# Estimated surface
im1 = axes[1].contourf(X1g, X2g, Y_pred, levels=20, cmap="RdYlBu_r")
axes[1].set_title("Estimated Median (N=40, n_grids=50)")
axes[1].set_xlabel("X₁")
axes[1].set_ylabel("X₂")
plt.colorbar(im1, ax=axes[1])
plt.suptitle("2D Nonparametric Quantile Regression", y=1.02)
plt.tight_layout()
plt.show()
Summary¶
| Feature | Linear QR | Quantization QR |
|---|---|---|
| Functional form | Linear | Any |
| Coefficients | ✓ | ✗ (cell-based) |
| Standard errors | ✓ (rank, sandwich, boot) | ✗ |
| Curse of dimensionality | No | Yes |
| Speed | Fast (Fortran LP) | Moderate |
| Best for | Interpretable linear effects | Nonlinear patterns |