"""Utility functions for model bootstrapping and confidence intervals."""
import itertools
import pandas as pd
import warnings
import copy
import numpy as np
import tqdm
from sklearn.base import clone
from sklearn.utils.validation import check_random_state, check_is_fitted
def mse(x, y):
return np.square(np.subtract(x, y)).mean()
[docs]def find_best_permutation(reference, target, criterion=mse):
"""Find the best permutation of the columns in target to minimize some
criterion comparing to reference.
Parameters
----------
reference : ndarray of shape (n_samples, n_columns)
Reference array.
target : ndarray of shape (n_samples, n_columns)
Target array.
criterion: Callable returning a scalar used to find the permutation.
"""
n_samples, n_columns = reference.shape
score = np.inf
best_perm = None
permutations = itertools.permutations(np.arange(n_columns))
for p in permutations:
score_p = criterion(reference, target[:, np.array(p)])
if score_p < score:
score = score_p
best_perm = p
return np.array(best_perm)
[docs]def bootstrap(
estimator,
X,
Y=None,
n_repetitions=1000,
sample_weight=None,
parametric=False,
sampler=None,
identify_classes=True,
progress_bar=True,
random_state=None,
):
"""Parametric or Non-parametric boostrap of a StepMix estimator.
Fit n_repetitions clones of the estimator on resampled datasets.
If identify_classes=True, repeated parameter estimates are aligned with the class order of the main estimator using
a permutation search.
Parameters
----------
estimator : StepMix instance
A fitted StepMix estimator. Used as a template to clone bootstrap estimator.
X : array-like of shape (n_samples, n_features)
Y : array-like of shape (n_samples, n_features_structural), default=None
n_repetitions: int
Number of repetitions to fit.
sample_weight : array-like of shape(n_samples,), default=None
Array of weights that are assigned to individual samples.
If not provided, then each sample is given unit weight. Ignored if parametric=True.
parametric: bool, default=False
Use parametric bootstrap instead of non-parametric. Data will be generated by sampling the estimator.
sampler: bool, default=None
Another fitted estimator to use for sampling instead of the main estimator. Only used for parametric
bootstrapping.
identify_classes: bool, default=True
Run a permutation test to align the classes of the repetitions to the classes of the main estimator. This is
required if inference on the model parameters is needed, but can be turned off if only the likelihood needs
to be bootstrapped to save computations.
progress_bar : bool, default=True
Display a tqdm progress bar for repetitions.
random_state : int, default=None
Returns
----------
samples: DataFrame
DataFrame of all repetitions. Follows the convention of StepMix.get_parameters_df() with an additional
'rep' column.
rep_stats: DataFrame
Likelihood statistics of each repetition.
'rep' column. None if identy_classes=False.
stats: DataFrame
Various statistics of bootstrapped estimators.
"""
check_is_fitted(estimator)
estimator = copy.deepcopy(estimator)
estimator.set_params(random_state=random_state)
if sampler is not None:
check_is_fitted(sampler)
sampler = copy.deepcopy(sampler)
sampler.set_params(random_state=random_state)
n_samples = X.shape[0]
x_names = estimator.x_names_
y_names = estimator.y_names_ if hasattr(estimator, "y_names_") else None
# Use the estimator built-in method to check the input
# This will ensure that X and Y are numpy arrays for the rest of the bootstrap procedure
X, Y = estimator._check_x_y(X, Y, reset=False)
# Get class probabilities of main estimator as reference
ref_class_probabilities = estimator.predict_proba(X, Y)
# Raise warning if trying to permute too many columns
if identify_classes and estimator.n_components > 6:
warnings.warn(
"Bootstrapping with identfy_classes=True requires permuting latent classes. Permuting latent classes with n_components > 6 may be slow."
)
# Now fit n_repetitions estimator with resampling and save parameters
rng = check_random_state(estimator.random_state)
parameters = list()
avg_ll_buffer = list()
ll_buffer = list()
if progress_bar:
print("\nBootstrapping estimator...")
tqdm_rep = tqdm.trange(
n_repetitions, disable=not progress_bar, desc="Bootstrap Repetitions "
)
for rep in tqdm_rep:
# Resample data
if parametric and sampler is not None:
X_rep, Y_rep, _ = sampler.sample(n_samples)
sample_weight_rep = None
elif parametric:
X_rep, Y_rep, _ = estimator.sample(n_samples)
sample_weight_rep = None
else:
# Sample with replacement
rep_samples = rng.choice(n_samples, size=(n_samples,), replace=True)
X_rep = X[rep_samples]
Y_rep = Y[rep_samples] if Y is not None else None
sample_weight_rep = (
sample_weight[rep_samples] if sample_weight is not None else None
)
# Fit estimator on resample data
estimator_rep = clone(estimator)
# Disable printing for repeated estimators and fit
estimator_rep.set_params(verbose=0, progress_bar=0, random_state=random_state)
estimator_rep.fit(X_rep, Y_rep, sample_weight=sample_weight_rep)
# Class ordering may be different. Reorder based on best permutation of class probabilites
if identify_classes:
rep_class_probabilities = estimator_rep.predict_proba(
X, Y
) # Inference on original samples
perm = find_best_permutation(
ref_class_probabilities, rep_class_probabilities
)
estimator_rep.permute_classes(perm)
# Save parameters
df_i = estimator_rep.get_parameters_df(x_names, y_names)
df_i["rep"] = rep
parameters.append(df_i)
# Save likelihood
avg_ll = estimator_rep.score(X_rep, Y_rep, sample_weight=sample_weight_rep)
ll = (
avg_ll * np.sum(sample_weight)
if sample_weight is not None
else avg_ll * n_samples
)
avg_ll_buffer.append(avg_ll)
ll_buffer.append(ll)
# Ask tqdm to display current max likelihood
tqdm_rep.set_postfix(
median_LL=np.median(ll_buffer),
# min_avg_LL=np.min(avg_ll_buffer),
# max_avg_LL=np.max(avg_ll_buffer),
min_LL=np.min(ll_buffer),
max_LL=np.max(ll_buffer),
)
if identify_classes:
return_df = pd.concat(parameters)
return_df.sort_index(inplace=True)
else:
# Do not return parameters if classes are not identified
return_df = None
# Add likelihoods statistics
stats = {"LL": np.array(ll_buffer), "avg_LL": np.array(avg_ll_buffer)}
return return_df, pd.DataFrame.from_dict(stats)
[docs]def blrt(null_model, alternative_model, X, Y=None, n_repetitions=30, random_state=42):
"""BLRT Test
References
----------
Dziak, John J., Stephanie T. Lanza, and Xianming Tan. "Effect size, statistical power, and sample size requirements for the bootstrap likelihood ratio test in latent class analysis." Structural equation modeling: a multidisciplinary journal 21.4 (2014): 534-552.
Nylund, Karen L., Tihomir Asparouhov, and Bengt O. Muthén. "Deciding on the number of classes in latent class analysis and growth mixture modeling: A Monte Carlo simulation study." Structural equation modeling: A multidisciplinary Journal 14.4 (2007): 535-569.
Parameters
----------
null_model : StepMix instance
A StepMix model with k classes.
alternative_model : StepMix instance
A StepMix model with k + 1 classes.
X : array-like of shape (n_samples, n_features)
Y : array-like of shape (n_samples, n_features_structural), default=None
n_repetitions: int
Number of repetitions to fit.
random_state : int, default=None
Returns
----------
p-value: float
Bootstrap p-value of the BLRT test. A significant test indicates the alternative k + 1 model provides a
significantly better fit of the data.
"""
n_samples = X.shape[0]
# Fit both models on real data
null_model.fit(X, Y)
alternative_model.fit(X, Y)
real_stat = 2 * (alternative_model.score(X, Y) - null_model.score(X, Y)) * n_samples
# Bootstrap null model
print("Bootstrapping null model...")
_, stats_null = bootstrap(
null_model,
X,
Y,
n_repetitions=n_repetitions,
identify_classes=False,
sampler=null_model,
random_state=random_state,
parametric=True,
)
print("\nBootstrapping alternative model...")
_, stats_alternative = bootstrap(
alternative_model,
X,
Y,
n_repetitions=n_repetitions,
identify_classes=False,
sampler=null_model,
random_state=random_state,
parametric=True,
)
gen_stats = 2 * (stats_alternative["LL"] - stats_null["LL"])
b = np.sum(gen_stats > real_stat)
return b / n_repetitions
[docs]def blrt_sweep(
model, X, Y=None, low=1, high=5, n_repetitions=30, random_state=42, verbose=True
):
"""Sweep BLRT Test
Run BLRT test for a range of number of classes. For example, if you set low=1 and high=4, the function
will return the result of 3 tests [1 vs 2, 2 vs 3, 3 vs 4].
Parameters
----------
model : StepMix instance
A StepMix model.
X : array-like of shape (n_samples, n_features)
Y : array-like of shape (n_samples, n_features_structural), default=None
low: int, default=1
Minimum number of classes to test.
high: int, default=5
Maximum number of classes to test.
n_repetitions: int
Number of repetitions to fit.
random_state : int, default=None
verbose : bool, default=True
Returns
----------
p-values: DataFrame
Bootstrap p-values of the BLRT test for each comparison.
"""
test_string = list()
p_values = list()
for k in range(low, high):
print(f"Testing {k} vs. {k + 1} classes...")
null_model = clone(model)
null_model.set_params(n_components=k)
alternative_model = clone(model)
alternative_model.set_params(n_components=k + 1)
p_values.append(
blrt(
null_model,
alternative_model,
X,
Y=Y,
n_repetitions=n_repetitions,
random_state=random_state,
)
)
test_string.append(f"{k} vs. {k + 1} classes")
df = pd.DataFrame({"Test": test_string, "p": p_values}).set_index("Test")
if verbose:
print("\nBLRT Sweep Results")
print(df.round(4))
return df