In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from vflow import Vset, init_args, dict_to_df, filter_vset_by_metric, perturbation_stats
from vflow.pipeline import build_graph
from functools import partial
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import sklearn.utils
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.inspection import permutation_importance
Enhancer Genomics¶
This vflow
pipeline uses fMRI data from specific voxels in the brain, in response to being shown certain frames of a video, to predict the mean BOLD (blood oxygenation level dependent) imaging response.
In [2]:
# load data
data_dir = "./data/enhancer/"
X_train = np.asarray(pd.read_csv(data_dir + "01_X_train.csv", error_bad_lines=False).iloc[:, 1:])
X_test = np.asarray(pd.read_csv(data_dir + "02_X_test.csv", error_bad_lines=False).iloc[:, 1:])
y_train = np.asarray(pd.read_csv(data_dir + "03_y_train.csv", error_bad_lines=False).iloc[:, 1])
y_test = np.asarray(pd.read_csv(data_dir + "04_y_test.csv", error_bad_lines=False).iloc[:, 1])
# initialize data
np.random.seed(14)
X_train, X_test, y_train, y_test = init_args((X_train, X_test, y_train, y_test),
names=['X_train', 'X_test', 'y_train', 'y_test'])
# subsample
subsampling_fns = [partial(sklearn.utils.resample, n_samples=1000, random_state=i) for i in range(3)]
subsampling_set = Vset(name='subsampling', vfuncs=subsampling_fns)
X_trains, y_trains = subsampling_set(X_train, y_train)
modeling_set = Vset(name='modeling',
vfuncs=[RandomForestClassifier(n_estimators=50, max_depth=5), MLPClassifier()],
vfunc_keys=["RF", "MLP"])
# model
modeling_set.fit(X_trains, y_trains)
preds = modeling_set.predict(X_test)
# hard metrics
hard_metrics_set = Vset(name='hard_metrics', vfuncs=[accuracy_score, balanced_accuracy_score],
vfunc_keys=["Acc", "Bal_Acc"])
hard_metrics = hard_metrics_set.evaluate(preds, y_test)
# permutation importance
feature_importance_set = Vset(name='feature_importance', vfuncs=[permutation_importance])
importances = feature_importance_set.evaluate(modeling_set.fitted_vfuncs, X_test, y_test)
df = dict_to_df(hard_metrics)
df
/tmp/ipykernel_134219/3301093220.py:3: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future. X_train = np.asarray(pd.read_csv(data_dir + "01_X_train.csv", error_bad_lines=False).iloc[:, 1:]) /tmp/ipykernel_134219/3301093220.py:4: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future. X_test = np.asarray(pd.read_csv(data_dir + "02_X_test.csv", error_bad_lines=False).iloc[:, 1:]) /tmp/ipykernel_134219/3301093220.py:5: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future. y_train = np.asarray(pd.read_csv(data_dir + "03_y_train.csv", error_bad_lines=False).iloc[:, 1]) /tmp/ipykernel_134219/3301093220.py:6: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future. y_test = np.asarray(pd.read_csv(data_dir + "04_y_test.csv", error_bad_lines=False).iloc[:, 1]) /home/james/.local/share/virtualenvs/veridical-flow-zFhOijFB/lib/python3.10/site-packages/sklearn/neural_network/_multilayer_perceptron.py:702: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. warnings.warn( /home/james/.local/share/virtualenvs/veridical-flow-zFhOijFB/lib/python3.10/site-packages/sklearn/neural_network/_multilayer_perceptron.py:702: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. warnings.warn( /home/james/.local/share/virtualenvs/veridical-flow-zFhOijFB/lib/python3.10/site-packages/sklearn/neural_network/_multilayer_perceptron.py:702: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. warnings.warn( /home/james/repos/Yu-Group/veridical-flow/vflow/utils.py:225: FutureWarning: DataFrame.set_axis 'inplace' keyword is deprecated and will be removed in a future version. Use `obj = obj.set_axis(..., copy=False)` instead df.set_axis(cols, axis=1, inplace=True)
Out[2]:
init-subsampling | init-subsampling | subsampling | init-modeling | modeling | init-hard_metrics | hard_metrics | out | |
---|---|---|---|---|---|---|---|---|
0 | X_test | X_train | subsampling_0 | y_train | RF | y_test | Acc | 0.926097 |
1 | X_test | X_train | subsampling_1 | y_train | RF | y_test | Acc | 0.924557 |
2 | X_test | X_train | subsampling_2 | y_train | RF | y_test | Acc | 0.924044 |
3 | X_test | X_train | subsampling_0 | y_train | MLP | y_test | Acc | 0.907365 |
4 | X_test | X_train | subsampling_1 | y_train | MLP | y_test | Acc | 0.907878 |
5 | X_test | X_train | subsampling_2 | y_train | MLP | y_test | Acc | 0.901463 |
6 | X_test | X_train | subsampling_0 | y_train | RF | y_test | Bal_Acc | 0.826515 |
7 | X_test | X_train | subsampling_1 | y_train | RF | y_test | Bal_Acc | 0.794172 |
8 | X_test | X_train | subsampling_2 | y_train | RF | y_test | Bal_Acc | 0.807060 |
9 | X_test | X_train | subsampling_0 | y_train | MLP | y_test | Bal_Acc | 0.700783 |
10 | X_test | X_train | subsampling_1 | y_train | MLP | y_test | Bal_Acc | 0.705016 |
11 | X_test | X_train | subsampling_2 | y_train | MLP | y_test | Bal_Acc | 0.683108 |
In [ ]: