Biomechanical features of orthopedic patients

In [ ]:
# load libraries

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (make_scorer, accuracy_score, roc_auc_score,
                             confusion_matrix, classification_report)
In [ ]:
# load data

data = pd.read_csv(r"C:\Users\felix\OneDrive\Dokumente\Python Projects\BioMed Case\column_2C_weka.csv", sep=",")

data.head(10)
Out[ ]:
pelvic_incidence pelvic_tilt numeric lumbar_lordosis_angle sacral_slope pelvic_radius degree_spondylolisthesis class
0 63.027817 22.552586 39.609117 40.475232 98.672917 -0.254400 Abnormal
1 39.056951 10.060991 25.015378 28.995960 114.405425 4.564259 Abnormal
2 68.832021 22.218482 50.092194 46.613539 105.985135 -3.530317 Abnormal
3 69.297008 24.652878 44.311238 44.644130 101.868495 11.211523 Abnormal
4 49.712859 9.652075 28.317406 40.060784 108.168725 7.918501 Abnormal
5 40.250200 13.921907 25.124950 26.328293 130.327871 2.230652 Abnormal
6 53.432928 15.864336 37.165934 37.568592 120.567523 5.988551 Abnormal
7 45.366754 10.755611 29.038349 34.611142 117.270067 -10.675871 Abnormal
8 43.790190 13.533753 42.690814 30.256437 125.002893 13.289018 Abnormal
9 36.686353 5.010884 41.948751 31.675469 84.241415 0.664437 Abnormal
In [ ]:
# replace Normal/Abnormal with binary values (Normal -> 0, anything else -> 1)

data['class'] = (data['class'] != 'Normal').astype(int)

data.head(10)
Out[ ]:
pelvic_incidence pelvic_tilt numeric lumbar_lordosis_angle sacral_slope pelvic_radius degree_spondylolisthesis class
0 63.027817 22.552586 39.609117 40.475232 98.672917 -0.254400 1
1 39.056951 10.060991 25.015378 28.995960 114.405425 4.564259 1
2 68.832021 22.218482 50.092194 46.613539 105.985135 -3.530317 1
3 69.297008 24.652878 44.311238 44.644130 101.868495 11.211523 1
4 49.712859 9.652075 28.317406 40.060784 108.168725 7.918501 1
5 40.250200 13.921907 25.124950 26.328293 130.327871 2.230652 1
6 53.432928 15.864336 37.165934 37.568592 120.567523 5.988551 1
7 45.366754 10.755611 29.038349 34.611142 117.270067 -10.675871 1
8 43.790190 13.533753 42.690814 30.256437 125.002893 13.289018 1
9 36.686353 5.010884 41.948751 31.675469 84.241415 0.664437 1
In [ ]:
data.groupby('class')['class'].count()
Out[ ]:
class
0    100
1    210
Name: class, dtype: int64

Split the data into training, validation, and test sets, then check the class proportions to make sure they stay roughly the same across splits.

In [ ]:
def data_split(examples, labels, train_frac, random_state=None):
    """Split into train/validation/test; whatever the training fraction
    leaves over is divided evenly between validation and test."""
    assert 0 <= train_frac <= 1, "Invalid training set fraction"

    X_train, X_tmp, Y_train, Y_tmp = train_test_split(
        examples, labels, train_size=train_frac, random_state=random_state)

    X_val, X_test, Y_val, Y_test = train_test_split(
        X_tmp, Y_tmp, train_size=0.5, random_state=random_state)

    return X_train, X_val, X_test, Y_train, Y_val, Y_test
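
If the class proportions must match exactly in every subset, train_test_split accepts a stratify argument. A minimal stratified variant of the function above (a sketch; the splits below were produced with the unstratified version):

In [ ]:
def stratified_data_split(examples, labels, train_frac, random_state=None):
    # stratify= keeps the class ratio identical in every subset
    X_train, X_tmp, Y_train, Y_tmp = train_test_split(
        examples, labels, train_size=train_frac,
        stratify=labels, random_state=random_state)

    X_val, X_test, Y_val, Y_test = train_test_split(
        X_tmp, Y_tmp, train_size=0.5,
        stratify=Y_tmp, random_state=random_state)

    return X_train, X_val, X_test, Y_train, Y_val, Y_test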
In [ ]:
np.random.seed(1234)
train, validate, test, train_labels, val_labels, test_labels = data_split(
    examples=data.drop('class', axis=1),  
    labels=data['class'], 
    train_frac=0.6, 
    random_state=1234
)
In [ ]:
train_labels.value_counts()
Out[ ]:
class
1    124
0     62
Name: count, dtype: int64
In [ ]:
val_labels.value_counts()
Out[ ]:
class
1    48
0    14
Name: count, dtype: int64
In [ ]:
test_labels.value_counts()
Out[ ]:
class
1    38
0    24
Name: count, dtype: int64

No severe class imbalance, but the splits drift from the overall 2:1 Abnormal:Normal ratio: the training set is exactly 2:1 (124:62), while the validation set is about 3.4:1 (48:14) and the test set about 1.6:1 (38:24). The stratified variant sketched above would keep the ratio identical in every split.
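
A quick side-by-side check of the class shares in each split:

In [ ]:
# normalize=True turns the counts into fractions
pd.DataFrame({
    'train': train_labels.value_counts(normalize=True),
    'validation': val_labels.value_counts(normalize=True),
    'test': test_labels.value_counts(normalize=True),
}).round(2)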

In [ ]:
train.dtypes
Out[ ]:
pelvic_incidence            float64
pelvic_tilt numeric         float64
lumbar_lordosis_angle       float64
sacral_slope                float64
pelvic_radius               float64
degree_spondylolisthesis    float64
dtype: object
In [ ]:
# check each feature column for missing values

cols_with_nulls = train.isna().any()

cols_with_nulls
Out[ ]:
pelvic_incidence            False
pelvic_tilt numeric         False
lumbar_lordosis_angle       False
sacral_slope                False
pelvic_radius               False
degree_spondylolisthesis    False
dtype: bool
In [ ]:
train.info()
<class 'pandas.core.frame.DataFrame'>
Index: 186 entries, 104 to 303
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   pelvic_incidence          186 non-null    float64
 1   pelvic_tilt numeric       186 non-null    float64
 2   lumbar_lordosis_angle     186 non-null    float64
 3   sacral_slope              186 non-null    float64
 4   pelvic_radius             186 non-null    float64
 5   degree_spondylolisthesis  186 non-null    float64
dtypes: float64(6)
memory usage: 10.2 KB
In [ ]:
train.describe(percentiles=[.25,.50,.75])
Out[ ]:
pelvic_incidence pelvic_tilt numeric lumbar_lordosis_angle sacral_slope pelvic_radius degree_spondylolisthesis
count 186.000000 186.000000 186.000000 186.000000 186.000000 186.000000
mean 59.139559 17.242256 51.405375 41.897303 118.590296 23.929936
std 15.969392 9.190825 18.970799 12.009732 13.034390 30.057371
min 26.147921 -0.810514 14.000000 13.516568 82.456038 -11.058179
25% 46.387157 10.390261 37.000000 32.886361 111.530796 1.530704
50% 57.167662 16.031278 47.627403 41.282220 118.784828 10.637436
75% 70.425687 21.934926 62.223469 50.675994 125.667927 38.240343
max 96.657315 42.689195 125.742385 77.195734 163.071041 145.378143

Outlier analysis

In [ ]:
# plot each feature's distribution; sns.distplot has been removed from
# seaborn, histplot with a KDE overlay is its replacement
for i, col in enumerate(train.columns):
    plt.figure(i)
    sns.histplot(train[col], kde=True)
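
As a numeric complement to the distribution plots, a rough IQR-based outlier count per feature (a small addition, using the usual 1.5*IQR rule):

In [ ]:
# count values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for each feature
q1, q3 = train.quantile(0.25), train.quantile(0.75)
iqr = q3 - q1
((train < q1 - 1.5 * iqr) | (train > q3 + 1.5 * iqr)).sum()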
In [ ]:
corr_df = pd.concat([train, train_labels], axis=1)
corr = corr_df.corr()
ax = sns.heatmap(
    corr, 
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
)
Out[ ]:
[Text(0.5, 0, 'pelvic_incidence'),
 Text(1.5, 0, 'pelvic_tilt numeric'),
 Text(2.5, 0, 'lumbar_lordosis_angle'),
 Text(3.5, 0, 'sacral_slope'),
 Text(4.5, 0, 'pelvic_radius'),
 Text(5.5, 0, 'degree_spondylolisthesis'),
 Text(6.5, 0, 'class')]
In [ ]:
# Concatenate the training and validation sets for cross-validation
x_train_validate = pd.concat([train, validate])
y_train_validate = pd.concat([train_labels, val_labels])

# make sure the dependent variable has integer dtype
y_train_validate = y_train_validate.astype(int)
test_labels = test_labels.astype(int)

# preprocessing: note that preprocessing.normalize scales each row (sample)
# to unit L2 norm, which is different from the more common column-wise
# standardization for distance-based models
x_train_validate = preprocessing.normalize(x_train_validate)
test = preprocessing.normalize(test)
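
If column-wise scaling is preferred for the distance-based KNN, a StandardScaler fit on the training/validation data only avoids test-set leakage. A sketch meant to run instead of (not after) the normalize step above, since that step overwrites test:

In [ ]:
from sklearn.preprocessing import StandardScaler

# fit the scaling parameters on train+validation only, then apply the same
# transform to every split (assumes `test` is still the raw data frame)
scaler = StandardScaler().fit(pd.concat([train, validate]))
x_train_validate_std = scaler.transform(pd.concat([train, validate]))
test_std = scaler.transform(test)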
In [ ]:
# set seed for all models

np.random.seed(1234)
seed = 1234

Modeling: KNN

In [ ]:
knn_model = KNeighborsClassifier()

# set up a tuning grid over k
param_grid = {'n_neighbors': range(1, 21)}

# metrics; note that make_scorer(roc_auc_score) scores the hard 0/1
# predictions rather than predicted probabilities
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'roc_auc': make_scorer(roc_auc_score)
}

# Perform grid search with stratified 5-fold cross-validation,
# refitting the best model by accuracy
grid_search = GridSearchCV(knn_model, param_grid, cv=StratifiedKFold(n_splits=5),
                           scoring=scoring, refit='accuracy')
grid_search.fit(x_train_validate, y_train_validate)
Out[ ]:
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(1, 21)}, refit='accuracy',
             scoring={'accuracy': make_scorer(accuracy_score),
                      'roc_auc': make_scorer(roc_auc_score)})
In [ ]:
# Print the best k, accuracy, and ROC AUC
print("Best k:", grid_search.best_params_['n_neighbors'])
print("Best Accuracy:", grid_search.best_score_)
print("Best ROC AUC:", grid_search.cv_results_['mean_test_roc_auc'][grid_search.best_index_])
Best k: 17
Best Accuracy: 0.822938775510204
Best ROC AUC: 0.8025630252100839
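
For a threshold-independent AUC during tuning, sklearn's built-in string scorers can be used instead of scoring hard predictions. A sketch (not what produced the numbers above; the 'roc_auc' string scorer uses predicted probabilities):

In [ ]:
grid_search_proba = GridSearchCV(
    KNeighborsClassifier(), param_grid,
    cv=StratifiedKFold(n_splits=5),
    scoring={'accuracy': 'accuracy', 'roc_auc': 'roc_auc'},
    refit='accuracy')
grid_search_proba.fit(x_train_validate, y_train_validate)
print("Mean CV ROC AUC at best k:",
      grid_search_proba.cv_results_['mean_test_roc_auc'][grid_search_proba.best_index_])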
In [ ]:
best_k = grid_search.best_params_['n_neighbors']
final_knn_model = KNeighborsClassifier(n_neighbors=best_k)
final_knn_model.fit(x_train_validate, y_train_validate)
Out[ ]:
KNeighborsClassifier(n_neighbors=17)
In [ ]:
# Evaluate the final model on the test set
knn_y_pred_test = final_knn_model.predict(test)
knn_accuracy_test = accuracy_score(test_labels, knn_y_pred_test)
knn_roc_auc_test = roc_auc_score(test_labels, knn_y_pred_test)

print("Accuracy on Test Set:", knn_accuracy_test)
print("ROC AUC on Test Set:", knn_roc_auc_test)
Accuracy on Test Set: 0.7903225806451613
ROC AUC on Test Set: 0.7828947368421053
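
Because predict() returns hard labels, the AUC above is computed at a single threshold. With class-1 probabilities it becomes threshold-independent; a small sketch:

In [ ]:
# ROC AUC from predicted probabilities rather than hard predictions
knn_proba_test = final_knn_model.predict_proba(test)[:, 1]
print("ROC AUC (probabilities):", roc_auc_score(test_labels, knn_proba_test))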
In [ ]:
# Confusion Matrix
knn_conf_mat = confusion_matrix(test_labels, knn_y_pred_test)
print("Confusion Matrix:\n", knn_conf_mat)
Confusion Matrix:
 [[18  6]
 [ 7 31]]
In [ ]:
# Calculate Sensitivity and Specificity
knn_tn, knn_fp, knn_fn, knn_tp = knn_conf_mat.ravel()

knn_sensitivity = knn_tp / (knn_tp + knn_fn)
knn_specificity = knn_tn / (knn_tn + knn_fp)

print(f"Sensitivity (True Positive Rate): {knn_sensitivity:.4f}")
print(f"Specificity (True Negative Rate): {knn_specificity:.4f}")
Sensitivity (True Positive Rate): 0.8158
Specificity (True Negative Rate): 0.7500
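
The already-imported classification_report reports the same quantities per class: the recall of class 1 is the sensitivity, and the recall of class 0 is the specificity.

In [ ]:
print(classification_report(test_labels, knn_y_pred_test,
                            target_names=['Normal', 'Abnormal']))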

Modeling: Lasso (L1-regularized logistic regression)

In [ ]:
# L1-penalized logistic regression; C is the inverse of the regularization
# strength, so the log-spaced grid spans very strong to very weak penalties
lasso_model = LogisticRegression(penalty='l1', solver='liblinear')

lasso_param_grid = {'C': np.logspace(-3, 3, 100)}

# 5-fold CV; for classifiers, cv=5 defaults to stratified folds
lasso_grid_search = GridSearchCV(lasso_model, lasso_param_grid, cv=5, scoring='accuracy')
lasso_grid_search.fit(x_train_validate, y_train_validate)
Out[ ]:
GridSearchCV(cv=5,
             estimator=LogisticRegression(penalty='l1', solver='liblinear'),
             param_grid={'C': array([1.00000000e-03, 1.14975700e-03, 1.32194115e-03, 1.51991108e-03,
       1.74752840e-03, 2.00923300e-03, 2.31012970e-03, 2.65608778e-03,
       3.05385551e-03, 3.51119173e-03, 4.03701726e-03, 4.64158883e-03,
       5.33669923e-03, 6.13590727e-03, 7.05480231e-03, 8.11130831e-03,
       9.32603...
       4.03701726e+01, 4.64158883e+01, 5.33669923e+01, 6.13590727e+01,
       7.05480231e+01, 8.11130831e+01, 9.32603347e+01, 1.07226722e+02,
       1.23284674e+02, 1.41747416e+02, 1.62975083e+02, 1.87381742e+02,
       2.15443469e+02, 2.47707636e+02, 2.84803587e+02, 3.27454916e+02,
       3.76493581e+02, 4.32876128e+02, 4.97702356e+02, 5.72236766e+02,
       6.57933225e+02, 7.56463328e+02, 8.69749003e+02, 1.00000000e+03])},
             scoring='accuracy')
In [ ]:
# Print the best hyperparameter; since C is the inverse regularization
# strength, a best C this large means only weak regularization was selected
best_C = lasso_grid_search.best_params_['C']
print("Best C:", best_C)
Best C: 572.236765935022
In [ ]:
# Train the final Lasso model with the best hyperparameter on the combined
# training and validation data
best_lasso_model = LogisticRegression(penalty='l1', solver='liblinear', C=best_C, random_state=seed)
best_lasso_model.fit(x_train_validate, y_train_validate)
Out[ ]:
LogisticRegression(C=572.236765935022, penalty='l1', random_state=1234,
                   solver='liblinear')
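
One appeal of the L1 penalty is implicit feature selection: coefficients shrunk exactly to zero drop out of the model. A quick inspection (a small addition; the feature names are taken from the original data frame):

In [ ]:
# pair each feature with its fitted L1 coefficient; zeros were pruned away
feature_names = data.drop('class', axis=1).columns
for name, coef in zip(feature_names, best_lasso_model.coef_[0]):
    print(f"{name}: {coef:.4f}")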
In [ ]:
# Evaluate the final Lasso model on the test set
lasso_y_pred_test = best_lasso_model.predict(test)

lasso_accuracy_test = accuracy_score(test_labels, lasso_y_pred_test)
lasso_roc_auc_test = roc_auc_score(test_labels, lasso_y_pred_test)

print("Accuracy on Test Set:", lasso_accuracy_test)
print("ROC AUC on Test Set:", lasso_roc_auc_test)
Accuracy on Test Set: 0.7741935483870968
ROC AUC on Test Set: 0.7543859649122807
In [ ]:
# Confusion Matrix
lasso_conf_mat = confusion_matrix(test_labels, lasso_y_pred_test)
print("Confusion Matrix:\n", lasso_conf_mat)

# Calculate Sensitivity and Specificity
lasso_tn, lasso_fp, lasso_fn, lasso_tp = lasso_conf_mat.ravel()

lasso_sensitivity = lasso_tp / (lasso_tp + lasso_fn)
lasso_specificity = lasso_tn / (lasso_tn + lasso_fp)

print(f"Sensitivity (True Positive Rate): {lasso_sensitivity:.4f}")
print(f"Specificity (True Negative Rate): {lasso_specificity:.4f}")
Confusion Matrix:
 [[16  8]
 [ 6 32]]
Sensitivity (True Positive Rate): 0.8421
Specificity (True Negative Rate): 0.6667

Modeling: Random Forest

In [ ]:
np.random.seed(1234)

rf_model = RandomForestClassifier(random_state=seed)

# tune max_features (the number of features considered at each split, mtry
# in R); the grid tries 1 to 5 of the 6 available features
param_grid = {'max_features': list(range(1, 6))}

rf_grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='accuracy')  
rf_grid_search.fit(x_train_validate, y_train_validate)

# Print the best hyperparameter
print("Best max_features (mtry):", rf_grid_search.best_params_['max_features'])
Best max_features (mtry): 3
In [ ]:
# refit with the best max_features; rf_grid_search.best_estimator_ already holds
# an equivalent model refit on the full data, and passing random_state=seed here
# would make this refit reproducible as well
best_rf_model = RandomForestClassifier(max_features=rf_grid_search.best_params_['max_features'])
best_rf_model.fit(x_train_validate, y_train_validate)
Out[ ]:
RandomForestClassifier(max_features=3)
In [ ]:
# Evaluate the final Random Forest model on the test set
rf_y_pred_test = best_rf_model.predict(test)

rf_accuracy_test = accuracy_score(test_labels, rf_y_pred_test)
rf_roc_auc_test = roc_auc_score(test_labels, rf_y_pred_test)

print("Accuracy on Test Set:", rf_accuracy_test)
print("ROC AUC on Test Set:", rf_roc_auc_test)
Accuracy on Test Set: 0.8225806451612904
ROC AUC on Test Set: 0.8015350877192983
In [ ]:
# Confusion Matrix
rf_conf_mat = confusion_matrix(test_labels, rf_y_pred_test)
print("Confusion Matrix:\n", rf_conf_mat)

# Calculate Sensitivity and Specificity
rf_tn, rf_fp, rf_fn, rf_tp = rf_conf_mat.ravel()

rf_sensitivity = rf_tp / (rf_tp + rf_fn)
rf_specificity = rf_tn / (rf_tn + rf_fp)

print(f"Sensitivity (True Positive Rate): {rf_sensitivity:.4f}")
print(f"Specificity (True Negative Rate): {rf_specificity:.4f}")
Confusion Matrix:
 [[17  7]
 [ 4 34]]
Sensitivity (True Positive Rate): 0.8947
Specificity (True Negative Rate): 0.7083
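
As a quick look at what drives the forest's predictions, the fitted model exposes impurity-based feature importances. A small addition, not part of the original comparison:

In [ ]:
# impurity-based importances of the fitted forest, largest first
importances = pd.Series(best_rf_model.feature_importances_,
                        index=data.drop('class', axis=1).columns)
print(importances.sort_values(ascending=False))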

Combine the results into one table for comparison

In [ ]:
results_data = [['KNN', knn_accuracy_test, knn_roc_auc_test, knn_sensitivity, knn_specificity],
                ['Lasso', lasso_accuracy_test, lasso_roc_auc_test, lasso_sensitivity, lasso_specificity],
                ['Random Forest', rf_accuracy_test, rf_roc_auc_test, rf_sensitivity, rf_specificity]]

results = pd.DataFrame(results_data, columns=['Model', 'Accuracy', 'ROC_AUC', 'Sensitivity', 'Specificity'])

# express the metrics as percentages rounded to two decimals
metric_cols = ['Accuracy', 'ROC_AUC', 'Sensitivity', 'Specificity']
results[metric_cols] = (results[metric_cols] * 100).round(2)

results
Out[ ]:
Model Accuracy ROC_AUC Sensitivity Specificity
0 KNN 79.03 78.29 81.58 75.00
1 Lasso 77.42 75.44 84.21 66.67
2 Random Forest 82.26 80.15 89.47 70.83