Source code for stepmix.bootstrap

"""Utility functions for model bootstrapping and confidence intervals."""
import copy
import itertools
import warnings

import numpy as np
import pandas as pd
import tqdm

from sklearn.base import clone
from sklearn.utils.validation import check_random_state, check_is_fitted


def mse(x, y):
    """Mean squared error between x and y."""
    return np.square(np.subtract(x, y)).mean()


def find_best_permutation(reference, target, criterion=mse):
    """Find the best permutation of the columns in target to minimize some criterion comparing to reference.

    Parameters
    ----------
    reference : ndarray of shape (n_samples, n_columns)
        Reference array.
    target : ndarray of shape (n_samples, n_columns)
        Target array.
    criterion : callable
        Callable returning a scalar used to find the permutation.
    """
    n_samples, n_columns = reference.shape
    score = np.inf
    best_perm = None
    permutations = itertools.permutations(np.arange(n_columns))

    for p in permutations:
        score_p = criterion(reference, target[:, np.array(p)])
        if score_p < score:
            score = score_p
            best_perm = p

    return np.array(best_perm)
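
# A minimal usage sketch: find_best_permutation recovers the column reordering
# of target that best matches reference. The arrays below are made up for
# illustration.
#
#     ref = np.array([[0.9, 0.1], [0.2, 0.8]])
#     tgt = ref[:, [1, 0]]                  # same data, columns swapped
#     find_best_permutation(ref, tgt)       # -> array([1, 0])
#
# Applying the returned permutation, tgt[:, [1, 0]], recovers ref.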
def bootstrap(
    estimator,
    X,
    Y=None,
    n_repetitions=1000,
    sample_weight=None,
    parametric=False,
    sampler=None,
    identify_classes=True,
    progress_bar=True,
    random_state=None,
):
    """Parametric or non-parametric bootstrap of a StepMix estimator.

    Fit n_repetitions clones of the estimator on resampled datasets. If identify_classes=True, repeated
    parameter estimates are aligned with the class order of the main estimator using a permutation search.

    Parameters
    ----------
    estimator : StepMix instance
        A fitted StepMix estimator. Used as a template to clone bootstrap estimators.
    X : array-like of shape (n_samples, n_features)
    Y : array-like of shape (n_samples, n_features_structural), default=None
    n_repetitions : int
        Number of repetitions to fit.
    sample_weight : array-like of shape (n_samples,), default=None
        Array of weights that are assigned to individual samples. If not provided, each sample is given
        unit weight. Ignored if parametric=True.
    parametric : bool, default=False
        Use parametric bootstrap instead of non-parametric. Data will be generated by sampling the estimator.
    sampler : StepMix instance, default=None
        Another fitted estimator to use for sampling instead of the main estimator. Only used for parametric
        bootstrapping.
    identify_classes : bool, default=True
        Run a permutation search to align the classes of the repetitions with the classes of the main
        estimator. This is required for inference on the model parameters, but can be turned off to save
        computation if only the likelihood needs to be bootstrapped.
    progress_bar : bool, default=True
        Display a tqdm progress bar for repetitions.
    random_state : int, default=None

    Returns
    ----------
    samples : DataFrame
        DataFrame of all repetitions. Follows the convention of StepMix.get_parameters_df() with an
        additional 'rep' column. None if identify_classes=False.
    stats : DataFrame
        Likelihood statistics ('LL' and 'avg_LL') of each repetition.
    """
    check_is_fitted(estimator)
    estimator = copy.deepcopy(estimator)
    estimator.set_params(random_state=random_state)

    if sampler is not None:
        check_is_fitted(sampler)
        sampler = copy.deepcopy(sampler)
        sampler.set_params(random_state=random_state)

    n_samples = X.shape[0]
    x_names = estimator.x_names_
    y_names = estimator.y_names_ if hasattr(estimator, "y_names_") else None

    # Use the estimator built-in method to check the input
    # This will ensure that X and Y are numpy arrays for the rest of the bootstrap procedure
    X, Y = estimator._check_x_y(X, Y, reset=False)

    # Get class probabilities of main estimator as reference
    ref_class_probabilities = estimator.predict_proba(X, Y)

    # Raise warning if trying to permute too many columns
    if identify_classes and estimator.n_components > 6:
        warnings.warn(
            "Bootstrapping with identify_classes=True requires permuting latent classes. "
            "Permuting latent classes with n_components > 6 may be slow."
        )

    # Now fit n_repetitions estimators with resampling and save parameters
    rng = check_random_state(estimator.random_state)
    parameters = list()
    avg_ll_buffer = list()
    ll_buffer = list()

    if progress_bar:
        print("\nBootstrapping estimator...")
    tqdm_rep = tqdm.trange(
        n_repetitions, disable=not progress_bar, desc="Bootstrap Repetitions "
    )
    for rep in tqdm_rep:
        # Resample data
        if parametric and sampler is not None:
            X_rep, Y_rep, _ = sampler.sample(n_samples)
            sample_weight_rep = None
        elif parametric:
            X_rep, Y_rep, _ = estimator.sample(n_samples)
            sample_weight_rep = None
        else:
            # Sample with replacement
            rep_samples = rng.choice(n_samples, size=(n_samples,), replace=True)
            X_rep = X[rep_samples]
            Y_rep = Y[rep_samples] if Y is not None else None
            sample_weight_rep = (
                sample_weight[rep_samples] if sample_weight is not None else None
            )

        # Fit estimator on resampled data
        estimator_rep = clone(estimator)

        # Disable printing for repeated estimators and fit
        estimator_rep.set_params(verbose=0, progress_bar=0, random_state=random_state)
        estimator_rep.fit(X_rep, Y_rep, sample_weight=sample_weight_rep)

        # Class ordering may be different. Reorder based on best permutation of class probabilities
        if identify_classes:
            rep_class_probabilities = estimator_rep.predict_proba(
                X, Y
            )  # Inference on original samples
            perm = find_best_permutation(
                ref_class_probabilities, rep_class_probabilities
            )
            estimator_rep.permute_classes(perm)

            # Save parameters
            df_i = estimator_rep.get_parameters_df(x_names, y_names)
            df_i["rep"] = rep
            parameters.append(df_i)

        # Save likelihood
        avg_ll = estimator_rep.score(X_rep, Y_rep, sample_weight=sample_weight_rep)
        ll = (
            avg_ll * np.sum(sample_weight)
            if sample_weight is not None
            else avg_ll * n_samples
        )
        avg_ll_buffer.append(avg_ll)
        ll_buffer.append(ll)

        # Ask tqdm to display current max likelihood
        tqdm_rep.set_postfix(
            median_LL=np.median(ll_buffer),
            # min_avg_LL=np.min(avg_ll_buffer),
            # max_avg_LL=np.max(avg_ll_buffer),
            min_LL=np.min(ll_buffer),
            max_LL=np.max(ll_buffer),
        )

    if identify_classes:
        return_df = pd.concat(parameters)
        return_df.sort_index(inplace=True)
    else:
        # Do not return parameters if classes are not identified
        return_df = None

    # Add likelihood statistics
    stats = {"LL": np.array(ll_buffer), "avg_LL": np.array(avg_ll_buffer)}

    return return_df, pd.DataFrame.from_dict(stats)
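
# A minimal usage sketch for bootstrap(), assuming binary indicators and the
# StepMix estimator from stepmix.stepmix. The data below is randomly generated
# for illustration and the parameter values are assumptions, not recommendations.
#
#     from stepmix.stepmix import StepMix
#
#     rng = np.random.default_rng(42)
#     X = rng.integers(0, 2, size=(500, 6))       # made-up binary indicators
#     model = StepMix(n_components=3, measurement="bernoulli", verbose=0, random_state=42)
#     model.fit(X)
#     params, stats = bootstrap(model, X, n_repetitions=100, random_state=42)
#     # params follows StepMix.get_parameters_df() with an extra 'rep' column;
#     # stats holds the 'LL' and 'avg_LL' of each repetition.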
def blrt(null_model, alternative_model, X, Y=None, n_repetitions=30, random_state=42):
    """Bootstrap likelihood ratio test (BLRT).

    References
    ----------
    Dziak, John J., Stephanie T. Lanza, and Xianming Tan. "Effect size, statistical power, and sample size
    requirements for the bootstrap likelihood ratio test in latent class analysis." Structural Equation
    Modeling: A Multidisciplinary Journal 21.4 (2014): 534-552.

    Nylund, Karen L., Tihomir Asparouhov, and Bengt O. Muthén. "Deciding on the number of classes in latent
    class analysis and growth mixture modeling: A Monte Carlo simulation study." Structural Equation Modeling:
    A Multidisciplinary Journal 14.4 (2007): 535-569.

    Parameters
    ----------
    null_model : StepMix instance
        A StepMix model with k classes.
    alternative_model : StepMix instance
        A StepMix model with k + 1 classes.
    X : array-like of shape (n_samples, n_features)
    Y : array-like of shape (n_samples, n_features_structural), default=None
    n_repetitions : int
        Number of repetitions to fit.
    random_state : int, default=42

    Returns
    ----------
    p-value : float
        Bootstrap p-value of the BLRT test. A significant test indicates that the alternative k + 1 model
        provides a significantly better fit of the data.
    """
    n_samples = X.shape[0]

    # Fit both models on real data
    null_model.fit(X, Y)
    alternative_model.fit(X, Y)
    real_stat = 2 * (alternative_model.score(X, Y) - null_model.score(X, Y)) * n_samples

    # Bootstrap null model
    print("Bootstrapping null model...")
    _, stats_null = bootstrap(
        null_model,
        X,
        Y,
        n_repetitions=n_repetitions,
        identify_classes=False,
        sampler=null_model,
        random_state=random_state,
        parametric=True,
    )

    print("\nBootstrapping alternative model...")
    _, stats_alternative = bootstrap(
        alternative_model,
        X,
        Y,
        n_repetitions=n_repetitions,
        identify_classes=False,
        sampler=null_model,
        random_state=random_state,
        parametric=True,
    )

    gen_stats = 2 * (stats_alternative["LL"] - stats_null["LL"])

    b = np.sum(gen_stats > real_stat)

    return b / n_repetitions
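
# A minimal usage sketch for blrt(), assuming two measurement-only StepMix
# models fit on the same indicators X (for example, the made-up X from the
# bootstrap sketch above). Parameter values are illustrative assumptions.
#
#     from stepmix.stepmix import StepMix
#
#     null_model = StepMix(n_components=2, measurement="bernoulli", verbose=0, random_state=42)
#     alt_model = StepMix(n_components=3, measurement="bernoulli", verbose=0, random_state=42)
#     p = blrt(null_model, alt_model, X, n_repetitions=30, random_state=42)
#     # A small p (e.g., p < 0.05) suggests the 3-class model fits the data
#     # significantly better than the 2-class model.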
def blrt_sweep(
    model, X, Y=None, low=1, high=5, n_repetitions=30, random_state=42, verbose=True
):
    """Sweep BLRT test.

    Run the BLRT test for a range of class counts. For example, with low=1 and high=4, the function returns
    the results of 3 tests: [1 vs. 2, 2 vs. 3, 3 vs. 4].

    Parameters
    ----------
    model : StepMix instance
        A StepMix model.
    X : array-like of shape (n_samples, n_features)
    Y : array-like of shape (n_samples, n_features_structural), default=None
    low : int, default=1
        Minimum number of classes to test.
    high : int, default=5
        Maximum number of classes to test.
    n_repetitions : int
        Number of repetitions to fit.
    random_state : int, default=42
    verbose : bool, default=True

    Returns
    ----------
    p-values : DataFrame
        Bootstrap p-values of the BLRT test for each comparison.
    """
    test_string = list()
    p_values = list()

    for k in range(low, high):
        print(f"Testing {k} vs. {k + 1} classes...")
        null_model = clone(model)
        null_model.set_params(n_components=k)
        alternative_model = clone(model)
        alternative_model.set_params(n_components=k + 1)
        p_values.append(
            blrt(
                null_model,
                alternative_model,
                X,
                Y=Y,
                n_repetitions=n_repetitions,
                random_state=random_state,
            )
        )
        test_string.append(f"{k} vs. {k + 1} classes")

    df = pd.DataFrame({"Test": test_string, "p": p_values}).set_index("Test")

    if verbose:
        print("\nBLRT Sweep Results")
        print(df.round(4))

    return df
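
# A minimal usage sketch for blrt_sweep(), reusing the assumptions above: a
# measurement-only StepMix model and made-up binary indicators X.
#
#     from stepmix.stepmix import StepMix
#
#     model = StepMix(measurement="bernoulli", verbose=0, random_state=42)
#     results = blrt_sweep(model, X, low=1, high=4, n_repetitions=30, random_state=42)
#     # results is a DataFrame indexed by "1 vs. 2 classes", "2 vs. 3 classes"
#     # and "3 vs. 4 classes", with a column 'p' of bootstrap p-values.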