# load libraries
import random
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import sklearn
from sklearn import metrics, preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (make_scorer, accuracy_score, confusion_matrix,
                             classification_report, roc_auc_score)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

warnings.filterwarnings("ignore")
# load data
data = pd.read_csv(r"C:\Users\felix\OneDrive\Dokumente\Python Projects\BioMed Case\column_2C_weka.csv", sep=",")
data.head(10)
| | pelvic_incidence | pelvic_tilt numeric | lumbar_lordosis_angle | sacral_slope | pelvic_radius | degree_spondylolisthesis | class |
|---|---|---|---|---|---|---|---|
| 0 | 63.027817 | 22.552586 | 39.609117 | 40.475232 | 98.672917 | -0.254400 | Abnormal |
| 1 | 39.056951 | 10.060991 | 25.015378 | 28.995960 | 114.405425 | 4.564259 | Abnormal |
| 2 | 68.832021 | 22.218482 | 50.092194 | 46.613539 | 105.985135 | -3.530317 | Abnormal |
| 3 | 69.297008 | 24.652878 | 44.311238 | 44.644130 | 101.868495 | 11.211523 | Abnormal |
| 4 | 49.712859 | 9.652075 | 28.317406 | 40.060784 | 108.168725 | 7.918501 | Abnormal |
| 5 | 40.250200 | 13.921907 | 25.124950 | 26.328293 | 130.327871 | 2.230652 | Abnormal |
| 6 | 53.432928 | 15.864336 | 37.165934 | 37.568592 | 120.567523 | 5.988551 | Abnormal |
| 7 | 45.366754 | 10.755611 | 29.038349 | 34.611142 | 117.270067 | -10.675871 | Abnormal |
| 8 | 43.790190 | 13.533753 | 42.690814 | 30.256437 | 125.002893 | 13.289018 | Abnormal |
| 9 | 36.686353 | 5.010884 | 41.948751 | 31.675469 | 84.241415 | 0.664437 | Abnormal |
# recode the target: Normal -> 0, Abnormal -> 1
for index, row in data.iterrows():
    if row['class'] == 'Normal':
        data.at[index, 'class'] = 0
    else:
        data.at[index, 'class'] = 1
data.head(10)
| | pelvic_incidence | pelvic_tilt numeric | lumbar_lordosis_angle | sacral_slope | pelvic_radius | degree_spondylolisthesis | class |
|---|---|---|---|---|---|---|---|
| 0 | 63.027817 | 22.552586 | 39.609117 | 40.475232 | 98.672917 | -0.254400 | 1 |
| 1 | 39.056951 | 10.060991 | 25.015378 | 28.995960 | 114.405425 | 4.564259 | 1 |
| 2 | 68.832021 | 22.218482 | 50.092194 | 46.613539 | 105.985135 | -3.530317 | 1 |
| 3 | 69.297008 | 24.652878 | 44.311238 | 44.644130 | 101.868495 | 11.211523 | 1 |
| 4 | 49.712859 | 9.652075 | 28.317406 | 40.060784 | 108.168725 | 7.918501 | 1 |
| 5 | 40.250200 | 13.921907 | 25.124950 | 26.328293 | 130.327871 | 2.230652 | 1 |
| 6 | 53.432928 | 15.864336 | 37.165934 | 37.568592 | 120.567523 | 5.988551 | 1 |
| 7 | 45.366754 | 10.755611 | 29.038349 | 34.611142 | 117.270067 | -10.675871 | 1 |
| 8 | 43.790190 | 13.533753 | 42.690814 | 30.256437 | 125.002893 | 13.289018 | 1 |
| 9 | 36.686353 | 5.010884 | 41.948751 | 31.675469 | 84.241415 | 0.664437 | 1 |
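As an aside, the same recoding can be done in one vectorized step instead of the row-wise loop; a minimal sketch, assuming the column holds exactly the two labels 'Normal' and 'Abnormal' (run instead of the loop, not after it):

# Vectorized alternative to the iterrows() loop above: map the two
# labels straight to integers in a single pass.
data['class'] = data['class'].map({'Normal': 0, 'Abnormal': 1}).astype(int)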
data.groupby('class')['class'].count()
class
0    100
1    210
Name: class, dtype: int64
Next, split the data into training, validation, and test sets, and check that the class proportions stay roughly the same in each split.
def data_split(examples, labels, train_frac, random_state=None):
    """Split into train/validation/test; the remainder after the
    training split is divided evenly between validation and test."""
    assert 0 <= train_frac <= 1, "Invalid training set fraction"
    X_train, X_tmp, Y_train, Y_tmp = train_test_split(
        examples, labels, train_size=train_frac, random_state=random_state)
    X_val, X_test, Y_val, Y_test = train_test_split(
        X_tmp, Y_tmp, train_size=0.5, random_state=random_state)
    return X_train, X_val, X_test, Y_train, Y_val, Y_test
np.random.seed(1234)
train, validate, test, train_labels, val_labels, test_labels = data_split(
    examples=data.drop('class', axis=1),
    labels=data['class'],
    train_frac=0.6,
    random_state=1234
)
train_labels.value_counts()
class
1    124
0     62
Name: count, dtype: int64
val_labels.value_counts()
class
1    48
0    14
Name: count, dtype: int64
test_labels.value_counts()
class
1    38
0    24
Name: count, dtype: int64
No severe class imbalance: the training set keeps the original 2:1 Abnormal:Normal ratio, although the unstratified split lets the validation (48:14) and test (38:24) sets drift from it; a stratified variant is sketched below.
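A minimal sketch of a stratified variant of data_split: passing the labels to train_test_split's stratify argument asks scikit-learn to preserve the class ratio in every split (the rest mirrors the function above).

def stratified_data_split(examples, labels, train_frac, random_state=None):
    # stratify=labels keeps the Abnormal:Normal ratio the same in each part
    X_train, X_tmp, Y_train, Y_tmp = train_test_split(
        examples, labels, train_size=train_frac,
        stratify=labels, random_state=random_state)
    X_val, X_test, Y_val, Y_test = train_test_split(
        X_tmp, Y_tmp, train_size=0.5,
        stratify=Y_tmp, random_state=random_state)
    return X_train, X_val, X_test, Y_train, Y_val, Y_test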
train.dtypes
pelvic_incidence            float64
pelvic_tilt numeric         float64
lumbar_lordosis_angle       float64
sacral_slope                float64
pelvic_radius               float64
degree_spondylolisthesis    float64
dtype: object
# check for missing values (per column)
null_value_columns = train.iloc[:, 0:6].isna().any()
null_value_columns
pelvic_incidence            False
pelvic_tilt numeric         False
lumbar_lordosis_angle       False
sacral_slope                False
pelvic_radius               False
degree_spondylolisthesis    False
dtype: bool
train.info()
<class 'pandas.core.frame.DataFrame'>
Index: 186 entries, 104 to 303
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   pelvic_incidence          186 non-null    float64
 1   pelvic_tilt numeric       186 non-null    float64
 2   lumbar_lordosis_angle     186 non-null    float64
 3   sacral_slope              186 non-null    float64
 4   pelvic_radius             186 non-null    float64
 5   degree_spondylolisthesis  186 non-null    float64
dtypes: float64(6)
memory usage: 10.2 KB
train.describe(percentiles=[.25,.50,.75])
| | pelvic_incidence | pelvic_tilt numeric | lumbar_lordosis_angle | sacral_slope | pelvic_radius | degree_spondylolisthesis |
|---|---|---|---|---|---|---|
| count | 186.000000 | 186.000000 | 186.000000 | 186.000000 | 186.000000 | 186.000000 |
| mean | 59.139559 | 17.242256 | 51.405375 | 41.897303 | 118.590296 | 23.929936 |
| std | 15.969392 | 9.190825 | 18.970799 | 12.009732 | 13.034390 | 30.057371 |
| min | 26.147921 | -0.810514 | 14.000000 | 13.516568 | 82.456038 | -11.058179 |
| 25% | 46.387157 | 10.390261 | 37.000000 | 32.886361 | 111.530796 | 1.530704 |
| 50% | 57.167662 | 16.031278 | 47.627403 | 41.282220 | 118.784828 | 10.637436 |
| 75% | 70.425687 | 21.934926 | 62.223469 | 50.675994 | 125.667927 | 38.240343 |
| max | 96.657315 | 42.689195 | 125.742385 | 77.195734 | 163.071041 | 145.378143 |
Outlier analysis
# plot each feature's distribution (sns.distplot is deprecated in current
# seaborn; histplot with a KDE overlay is the equivalent)
for i, col in enumerate(train.columns[0:6]):
    plt.figure(i)
    sns.histplot(train[col], kde=True)
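The plots show how each feature is distributed; a quick numeric complement is the usual 1.5 x IQR rule for flagging potential outliers. A minimal sketch (the 1.5 x IQR threshold is a common convention, not part of the original analysis):

# count values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for each feature
q1, q3 = train.quantile(0.25), train.quantile(0.75)
iqr = q3 - q1
outlier_mask = (train < q1 - 1.5 * iqr) | (train > q3 + 1.5 * iqr)
print(outlier_mask.sum())  # number of flagged values per column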
corr_df = pd.concat([train, train_labels.astype(int)], axis=1)  # cast target to int so corr() includes it
corr = corr_df.corr()
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
)
# Concatenate the training and validation sets for cross-validation
x_train_validate = pd.concat([train, validate])
y_train_validate = pd.concat([train_labels, val_labels])
# make sure dependent variable is in integer dtype
y_train_validate = y_train_validate.astype(int)
test_labels = test_labels.astype(int)
# preprocess the independent variables; note that preprocessing.normalize
# rescales each sample (row) to unit norm, not each feature
x_train_validate = preprocessing.normalize(x_train_validate)
test = preprocessing.normalize(test)
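If per-feature scaling is wanted instead (often preferred for distance-based models such as KNN), a StandardScaler fit on the training portion alone avoids leaking test-set statistics. A sketch, intended as a replacement for the normalize() calls above rather than an addition:

# alternative (sketch): standardize each feature using statistics
# estimated on the training data only
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(x_train_validate)
x_train_validate_std = scaler.transform(x_train_validate)
test_std = scaler.transform(test)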
# set seed for all models
np.random.seed(1234)
seed = 1234
knn_model = KNeighborsClassifier()
# set up a tuning grid
param_grid = {'n_neighbors': range(1, 21)}
# scoring: accuracy drives the refit; ROC AUC is tracked for reference
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'roc_auc': make_scorer(roc_auc_score, greater_is_better=True)
}
# Perform grid search with 5-fold cross-validation
grid_search = GridSearchCV(knn_model, param_grid, cv=StratifiedKFold(n_splits=5), scoring=scoring, refit='accuracy')
grid_search.fit(x_train_validate, y_train_validate)
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
             estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(1, 21)}, refit='accuracy',
             scoring={'accuracy': make_scorer(accuracy_score),
                      'roc_auc': make_scorer(roc_auc_score)})
# Print the best k, accuracy, and ROC AUC
print("Best k:", grid_search.best_params_['n_neighbors'])
print("Best Accuracy:", grid_search.best_score_)
print("Best ROC AUC:", grid_search.cv_results_['mean_test_roc_auc'][grid_search.best_index_])
Best k: 17
Best Accuracy: 0.822938775510204
Best ROC AUC: 0.8025630252100839
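Note that make_scorer(roc_auc_score) passes hard 0/1 predictions to roc_auc_score, so the "Best ROC AUC" above is computed from class labels rather than scores. scikit-learn's built-in 'roc_auc' string scorer uses predict_proba instead; a sketch of the same search with probability-based AUC:

# Same grid search, but with scikit-learn's built-in scorers; 'roc_auc'
# uses predict_proba internally, giving a probability-based AUC.
scoring_proba = {'accuracy': 'accuracy', 'roc_auc': 'roc_auc'}
grid_search_proba = GridSearchCV(
    KNeighborsClassifier(), param_grid,
    cv=StratifiedKFold(n_splits=5),
    scoring=scoring_proba, refit='accuracy')
grid_search_proba.fit(x_train_validate, y_train_validate)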
best_k = grid_search.best_params_['n_neighbors']
final_knn_model = KNeighborsClassifier(n_neighbors=best_k)
final_knn_model.fit(x_train_validate, y_train_validate)
KNeighborsClassifier(n_neighbors=17)
# Evaluate the final model on the test set
knn_y_pred_test = final_knn_model.predict(test)
knn_accuracy_test = accuracy_score(test_labels, knn_y_pred_test)
knn_roc_auc_test = roc_auc_score(test_labels, knn_y_pred_test)
print("Accuracy on Test Set:", knn_accuracy_test)
print("ROC AUC on Test Set:", knn_roc_auc_test)
Accuracy on Test Set: 0.7903225806451613
ROC AUC on Test Set: 0.7828947368421053
# Confusion Matrix
knn_conf_mat = confusion_matrix(test_labels, knn_y_pred_test)
print("Confusion Matrix:\n", knn_conf_mat)
Confusion Matrix:
 [[18  6]
 [ 7 31]]
# Calculate Sensitivity and Specificity
knn_tn, knn_fp, knn_fn, knn_tp = knn_conf_mat.ravel()
knn_sensitivity = knn_tp / (knn_tp + knn_fn)
knn_specificity = knn_tn / (knn_tn + knn_fp)
print(f"Sensitivity (True Positive Rate): {knn_sensitivity:.4f}")
print(f"Specificity (True Negative Rate): {knn_specificity:.4f}")
Sensitivity (True Positive Rate): 0.8158
Specificity (True Negative Rate): 0.7500
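classification_report, imported above but not used so far, bundles per-class precision, recall, and F1 into one call; for the KNN test predictions:

# Per-class precision, recall, and F1 in one call; label 0 is Normal and
# label 1 is Abnormal, per the recoding earlier.
print(classification_report(test_labels, knn_y_pred_test,
                            target_names=['Normal', 'Abnormal']))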
# L1-penalized ("lasso") logistic regression, tuned over the regularization strength C
lasso_model = LogisticRegression(penalty='l1', solver='liblinear')
lasso_param_grid = {'C': np.logspace(-3, 3, 100)}
lasso_grid_search = GridSearchCV(lasso_model, lasso_param_grid, cv=5, scoring='accuracy')
lasso_grid_search.fit(x_train_validate, y_train_validate)
GridSearchCV(cv=5,
             estimator=LogisticRegression(penalty='l1', solver='liblinear'),
             param_grid={'C': array([1.00000000e-03, ..., 1.00000000e+03])},
             scoring='accuracy')
# Print the hyperparameter
best_C = lasso_grid_search.best_params_['C']
print("Best C:", best_C)
Best C: 572.236765935022
# Train the final lasso model with the best C on the combined training+validation data
best_lasso_model = LogisticRegression(penalty='l1', solver='liblinear', C=best_C, random_state=seed)
best_lasso_model.fit(x_train_validate, y_train_validate)
LogisticRegression(C=572.236765935022, penalty='l1', random_state=1234, solver='liblinear')
# Evaluate the final Lasso model on the test set
lasso_y_pred_test = best_lasso_model.predict(test)
lasso_accuracy_test = accuracy_score(test_labels, lasso_y_pred_test)
lasso_roc_auc_test = roc_auc_score(test_labels, lasso_y_pred_test)
print("Accuracy on Test Set:", lasso_accuracy_test)
print("ROC AUC on Test Set:", lasso_roc_auc_test)
Accuracy on Test Set: 0.7741935483870968
ROC AUC on Test Set: 0.7543859649122807
# Confusion Matrix
lasso_conf_mat = confusion_matrix(test_labels, lasso_y_pred_test)
print("Confusion Matrix:\n", lasso_conf_mat)
# Calculate Sensitivity and Specificity
lasso_tn, lasso_fp, lasso_fn, lasso_tp = lasso_conf_mat.ravel()
lasso_sensitivity = lasso_tp / (lasso_tp + lasso_fn)
lasso_specificity = lasso_tn / (lasso_tn + lasso_fp)
print(f"Sensitivity (True Positive Rate): {lasso_sensitivity:.4f}")
print(f"Specificity (True Negative Rate): {lasso_specificity:.4f}")
Confusion Matrix:
 [[16  8]
 [ 6 32]]
Sensitivity (True Positive Rate): 0.8421
Specificity (True Negative Rate): 0.6667
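A side benefit of the L1 penalty is sparsity: coefficients can be driven exactly to zero. Inspecting them shows which features the model retains (a sketch; the feature names are taken back from the original DataFrame since normalize() returned a bare array, and with a C this large the penalty is weak, so most coefficients may survive):

# Coefficients of the fitted L1-penalized model; exact zeros indicate
# features dropped by the lasso penalty.
feature_names = data.drop('class', axis=1).columns
coef = pd.Series(best_lasso_model.coef_[0], index=feature_names)
print(coef)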
np.random.seed(1234)
rf_model = RandomForestClassifier(random_state=seed)
param_grid = {'max_features': list(range(1, 6))}
rf_grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='accuracy')
rf_grid_search.fit(x_train_validate, y_train_validate)
# Print the best hyperparameter
print("Best max_features (mtry):", rf_grid_search.best_params_['max_features'])
Best max_features (mtry): 3
# refit with the best max_features; random_state=seed keeps the forest reproducible
best_rf_model = RandomForestClassifier(max_features=rf_grid_search.best_params_['max_features'], random_state=seed)
best_rf_model.fit(x_train_validate, y_train_validate)
RandomForestClassifier(max_features=3, random_state=1234)
rf_y_pred_test = best_rf_model.predict(test)
rf_accuracy_test = accuracy_score(test_labels, rf_y_pred_test)
rf_roc_auc_test = roc_auc_score(test_labels, rf_y_pred_test)
print("Accuracy on Test Set:", rf_accuracy_test)
print("ROC AUC on Test Set:", rf_roc_auc_test)
Accuracy on Test Set: 0.8225806451612904
ROC AUC on Test Set: 0.8015350877192983
# Confusion Matrix
rf_conf_mat = confusion_matrix(test_labels, rf_y_pred_test)
print("Confusion Matrix:\n", rf_conf_mat)
# Calculate Sensitivity and Specificity
rf_tn, rf_fp, rf_fn, rf_tp = rf_conf_mat.ravel()
rf_sensitivity = rf_tp / (rf_tp + rf_fn)
rf_specificity = rf_tn / (rf_tn + rf_fp)
print(f"Sensitivity (True Positive Rate): {rf_sensitivity:.4f}")
print(f"Specificity (True Negative Rate): {rf_specificity:.4f}")
Confusion Matrix:
 [[17  7]
 [ 4 34]]
Sensitivity (True Positive Rate): 0.8947
Specificity (True Negative Rate): 0.7083
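Random forests also expose impurity-based feature importances, which suggest which measurements drive the predictions; a short sketch:

# Impurity-based importances from the fitted forest, highest first.
importances = pd.Series(best_rf_model.feature_importances_,
                        index=data.drop('class', axis=1).columns)
print(importances.sort_values(ascending=False))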
results_data = [['KNN', knn_accuracy_test, knn_roc_auc_test, knn_sensitivity, knn_specificity],
                ['Lasso', lasso_accuracy_test, lasso_roc_auc_test, lasso_sensitivity, lasso_specificity],
                ['Random Forest', rf_accuracy_test, rf_roc_auc_test, rf_sensitivity, rf_specificity]]
results = pd.DataFrame(results_data, columns=['Model', 'Accuracy', 'ROC_AUC', 'Sensitivity', 'Specificity'])
# express all metrics as percentages
results[['Accuracy', 'ROC_AUC', 'Sensitivity', 'Specificity']] = (results[['Accuracy', 'ROC_AUC', 'Sensitivity', 'Specificity']] * 100).round(2)
results
| | Model | Accuracy | ROC_AUC | Sensitivity | Specificity |
|---|---|---|---|---|---|
| 0 | KNN | 79.03 | 78.29 | 81.58 | 75.00 |
| 1 | Lasso | 77.42 | 75.44 | 84.21 | 66.67 |
| 2 | Random Forest | 82.26 | 80.15 | 89.47 | 70.83 |
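Since all three test-set ROC AUC values above are computed from hard predictions, probability-based ROC curves give a fuller picture of the comparison; a sketch using RocCurveDisplay (available in scikit-learn >= 1.0):

# Overlay probability-based ROC curves for the three fitted models.
from sklearn.metrics import RocCurveDisplay
fig, ax = plt.subplots()
for name, model in [('KNN', final_knn_model),
                    ('Lasso', best_lasso_model),
                    ('Random Forest', best_rf_model)]:
    RocCurveDisplay.from_estimator(model, test, test_labels, name=name, ax=ax)
plt.show()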