Project Description:
In this project, I am using a dataset consisting of 272 columns, describing 4,599 hotels located in Rome, Italy. The goal of this project is to untangle this significant amount of data and create insight into which pieces of information impact the popularity of a hotel, measured by the clicks (views) its listing receives on the TripAdvisor platform.
The challenges of this dataset and this mission are plentiful. At first glance, the high dimensionality appears to offer a lot of potential insight, but it can also cause problems for causal inference/discovery, pattern detection, and building models with high predictive power. In other words, while the dataset provides a lot of detail about the listed hotels, such as amenities, number of photos, and many other attributes, the relevance of individual features can be misleading in the quest of finding causal relationships and their weights. Further reasons include computational complexity and overfitting in modeling. Applying dimensionality reduction or feature selection techniques can often help overcome such weaknesses and consequently allows a model to include only the most relevant features, i.e. those that best explain the target variable.
In this project, I first analyze the underlying data structure and conduct data cleaning. Then, I apply ML/ensemble methods with a cross-validation approach to reduce bias when identifying potentially relevant features, i.e. to perform feature selection. Using the resulting feature importance scores, causal inference analysis is then conducted on these features and the target variable, in order to uncover the causal structure among them.
# load required packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# load the trip advisor data
data_raw = pd.read_csv('C:/Users/felix/OneDrive/Dokumente/Python Projects/Causal_Discovery_And_Inference/TripAdvisor_data.csv', delimiter=',', encoding='cp1252')
data_raw.head()
Unnamed: 0 | hotel_url | name | views | views_binary | score_adjusted | bubble_rating | category_hotel | category_inn | category_specialty | ... | amenities_Wardrobe / closet | amenities_Washing machine | amenities_Water park | amenities_Water park offsite | amenities_Waterslide | amenities_Waxing services | amenities_Whirlpool bathtub | amenities_Wifi | amenities_Wine / champagne | amenities_Yoga classes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | https://www.tripadvisor.com/Hotel_Review-g1877... | Casa Mia in Trastevere | 0 | 0 | 4.409091 | 4.5 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
1 | 2 | https://www.tripadvisor.com/Hotel_Review-g1877... | Hotel Artemide | 88 | 1 | 4.798118 | 5.0 | 1 | 0 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 3 | https://www.tripadvisor.com/Hotel_Review-g1877... | A.Roma Lifestyle Hotel | 32 | 1 | 4.634085 | 4.5 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
3 | 4 | https://www.tripadvisor.com/Hotel_Review-g1877... | iQ Hotel Roma | 17 | 1 | 4.699138 | 4.5 | 1 | 0 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
4 | 5 | https://www.tripadvisor.com/Hotel_Review-g1877... | The Guardian | 0 | 0 | 4.624299 | 4.5 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
5 rows × 272 columns
data_raw.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4599 entries, 0 to 4598 Columns: 272 entries, Unnamed: 0 to amenities_Yoga classes dtypes: float64(8), int64(261), object(3) memory usage: 9.5+ MB
data_raw.describe()
# as can be seen below, the 3 object-type features (hotel_url, name, and class) cannot be statistically described because of their categorical nature
Unnamed: 0 | views | views_binary | score_adjusted | bubble_rating | category_hotel | category_inn | category_specialty | class_4_5 | class_3_4_5 | ... | amenities_Wardrobe / closet | amenities_Washing machine | amenities_Water park | amenities_Water park offsite | amenities_Waterslide | amenities_Waxing services | amenities_Whirlpool bathtub | amenities_Wifi | amenities_Wine / champagne | amenities_Yoga classes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 4599.000000 | 4599.000000 | 4599.000000 | 4599.000000 | 4599.000000 | 4599.000000 | 4599.000000 | 4599.000000 | 4599.000000 | 4599.000000 | ... | 4599.000000 | 4599.000000 | 4599.000000 | 4599.000000 | 4599.000000 | 4599.000000 | 4599.000000 | 4599.000000 | 4599.000000 | 4599.000000 |
mean | 2300.000000 | 0.274625 | 0.037399 | 4.058395 | 4.067841 | 0.190041 | 0.619482 | 0.190476 | 0.111763 | 0.295934 | ... | 0.025440 | 0.020874 | 0.000435 | 0.000870 | 0.000217 | 0.002609 | 0.004784 | 0.648619 | 0.072624 | 0.001522 |
std | 1327.761274 | 2.231304 | 0.189759 | 0.837018 | 0.850739 | 0.392376 | 0.485567 | 0.392719 | 0.315109 | 0.456511 | ... | 0.157475 | 0.142978 | 0.020851 | 0.029482 | 0.014746 | 0.051020 | 0.069006 | 0.477454 | 0.259547 | 0.038988 |
min | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 1150.500000 | 0.000000 | 0.000000 | 3.673771 | 3.500000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 2300.000000 | 0.000000 | 0.000000 | 4.282051 | 4.500000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 |
75% | 3449.500000 | 0.000000 | 0.000000 | 4.666667 | 4.500000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 |
max | 4599.000000 | 88.000000 | 1.000000 | 5.000000 | 5.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 269 columns
data_raw = data_raw.rename(columns={'Unnamed: 0':'ID'})
top_dtypes = data_raw.dtypes.head(30)
bottom_dtypes = data_raw.dtypes.tail(5)
pd.concat([top_dtypes, bottom_dtypes]) ## it appears that at a certain point (after the first 15-30 features), there are only binary columns left until the end of the feature list. We will quickly investigate that
ID int64 hotel_url object name object views int64 views_binary int64 score_adjusted float64 bubble_rating float64 category_hotel int64 category_inn int64 category_specialty int64 class object class_4_5 int64 class_3_4_5 int64 n_reviews int64 location_grade float64 discount int64 discount_perc float64 price_curr_min float64 price_min float64 price_max float64 award_travellers_choice int64 award_greenleaders int64 award_cert_excellence int64 photos float64 amenities_24-hour check-in int64 amenities_24-hour front desk int64 amenities_24-hour security int64 amenities_Additional bathroom int64 amenities_Adults only int64 amenities_Aerobics int64 amenities_Waxing services int64 amenities_Whirlpool bathtub int64 amenities_Wifi int64 amenities_Wine / champagne int64 amenities_Yoga classes int64 dtype: object
# check if the last approx. 250 features are all binary amenities features (dummy variables)
# count the features whose name contains the word 'amenities'
feature_list = list(data_raw.columns)
amenities_feature_count = 0
for i in feature_list:
    if 'amenities' in i:
        amenities_feature_count += 1
# check each amenities column's max value to see whether all of them are binary (max == 1)
binary_count = 0
not_binary = []
for c in data_raw.columns:
if 'amenities' in c and data_raw[c].max() == 1:
binary_count += 1
elif 'amenities' in c:
not_binary.append(c)
print(amenities_feature_count)
print(binary_count)
print(not_binary)
248 243 ['amenities_amenity_free_internet_title_ad3', 'amenities_hotel_amenity_free_breakfast', 'amenities_hotel_amenity_free_internet', 'amenities_hotel_amenity_internet', 'amenities_hotel_amenity_wifi']
Here it becomes visible that 5 of the assumed binary amenities columns might either not be binary at all or never take the value 1, in which case they would be useless in this project, since they'd provide no valuable insight/pattern.
# Create summary statistics for the not_binary columns of amenities
for i in not_binary:
print(data_raw[i].describe())
count 4599.0 mean 0.0 std 0.0 min 0.0 25% 0.0 50% 0.0 75% 0.0 max 0.0 Name: amenities_amenity_free_internet_title_ad3, dtype: float64 count 4599.0 mean 0.0 std 0.0 min 0.0 25% 0.0 50% 0.0 75% 0.0 max 0.0 Name: amenities_hotel_amenity_free_breakfast, dtype: float64 count 4599.0 mean 0.0 std 0.0 min 0.0 25% 0.0 50% 0.0 75% 0.0 max 0.0 Name: amenities_hotel_amenity_free_internet, dtype: float64 count 4599.0 mean 0.0 std 0.0 min 0.0 25% 0.0 50% 0.0 75% 0.0 max 0.0 Name: amenities_hotel_amenity_internet, dtype: float64 count 4599.0 mean 0.0 std 0.0 min 0.0 25% 0.0 50% 0.0 75% 0.0 max 0.0 Name: amenities_hotel_amenity_wifi, dtype: float64
# drop the 5 all-zero columns collected in not_binary: data_raw0
data_raw0 = data_raw.drop(columns=not_binary)
len(data_raw0.columns)
267
# explore the variables views and views_binary
data_raw0[['views', 'views_binary']]
views | views_binary | |
---|---|---|
0 | 0 | 0 |
1 | 88 | 1 |
2 | 32 | 1 |
3 | 17 | 1 |
4 | 0 | 0 |
... | ... | ... |
4594 | 0 | 0 |
4595 | 0 | 0 |
4596 | 0 | 0 |
4597 | 0 | 0 |
4598 | 0 | 0 |
4599 rows × 2 columns
It appears that views_binary takes the value 1 exactly when there are any views, i.e. when views is a non-zero integer. Hence, I will drop this binary variable, as I see no added value in it for further modeling approaches.
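Before dropping it, a quick sanity check (a minimal sketch using the two columns above) can confirm this redundancy; it returns True only if the indicator and the non-zero views always agree:

# confirm that views_binary is exactly the indicator for views > 0
(data_raw0['views'].gt(0).astype(int) == data_raw0['views_binary']).all()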
# drop views_binary from data: df1
data_raw1 = data_raw0.drop('views_binary', axis=1)
len(data_raw1.columns)
266
# Visualize views with histogram
sns.histplot(data=data_raw1, x='views', bins=10)
<Axes: xlabel='views', ylabel='Count'>
This histogram looks highly skewed. It appears that the number of views is predominantly below 10 for the listed hotels.
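To back up the visual impression with a number, we can compute the sample skewness of views (a quick check; the resulting value is not reproduced here):

# a strongly positive skew confirms the long right tail seen in the histogram
data_raw1['views'].skew()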
below20views = data_raw1[data_raw1['views'] < 20]
sns.histplot(below20views, x='views', bins=10)
<Axes: xlabel='views', ylabel='Count'>
# show skewedness with kdeplot
sns.kdeplot(data=data_raw1, x='views')
<Axes: xlabel='views', ylabel='Density'>
sns.scatterplot(data=data_raw1, x='views', y='location_grade' )
<Axes: xlabel='views', ylabel='location_grade'>
This scatterplot shows that location_grade, on a scale up to 100, has a limited or partially ambiguous relationship with the number of views. There might, however, be a weak positive link, since higher view counts appear mostly at higher grades.

In line with the histograms and the kdeplot of the views variable, most observations lie at or close to 0 views.
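To quantify this suspected weak link, a rank correlation is a reasonable quick check (a sketch; Spearman is chosen because views is so heavily skewed, and missing location_grade values are ignored pairwise):

# Spearman rank correlation between views and location_grade
data_raw1[['views', 'location_grade']].corr(method='spearman')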
data_raw1['class'].unique()
array(['3', '4', '5', '2', '1', 'no stars'], dtype=object)
data_raw1['class'] = data_raw1['class'].replace({'no stars': 0}).astype(int)
data_raw1['class'].unique()
array([3, 4, 5, 2, 1, 0])
sns.scatterplot(data=data_raw1, x='views', y='class' )
<Axes: xlabel='views', ylabel='class'>
# detect outliers in views using the z-score
def calculate_z_score(data):
mean = np.mean(data)
std_dev = np.std(data)
z_scores = (data - mean) / std_dev
return z_scores
z_scores = calculate_z_score(data_raw1['views'])
z_scores
0 -0.123092 1 39.320014 2 14.219856 3 7.496599 4 -0.123092 ... 4594 -0.123092 4595 -0.123092 4596 -0.123092 4597 -0.123092 4598 -0.123092 Name: views, Length: 4599, dtype: float64
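For reference, SciPy ships an equivalent vectorized implementation (a sketch, assuming scipy is installed; like np.std above, it standardizes with the population standard deviation, ddof=0, by default):

from scipy import stats
# identical result to calculate_z_score, since both use ddof=0
z_scores_scipy = stats.zscore(data_raw1['views'])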
outlier_indices = []
for idx, z in enumerate(z_scores):
    # positional idx matches the RangeIndex labels here (no rows dropped yet)
    if z > 3:
        outlier_indices.append(idx)
len(outlier_indices)
64
# drop the outlier rows from data_raw1 (in place)
data_raw1.drop(outlier_indices, axis=0, inplace=True)
len(data_raw1)
4535
# visualize average views by hotel class after outlier removal
sns.barplot(data=data_raw1, x='class', y='views', errorbar=None)
<Axes: xlabel='class', ylabel='views'>
# find columns where more than 25% of the values are missing
n_rows = len(data_raw1)
missing_values_features = []
for c in data_raw1.columns:
if data_raw1[c].count()/n_rows < 0.75:
missing_values_features.append(c)
print(missing_values_features)
['price_curr_min']
data_raw1.drop(columns='price_curr_min', inplace=True)
data_raw1['views'].value_counts()
views 0 4427 3 31 2 29 5 25 4 13 6 10 Name: count, dtype: int64
Due to the highly imbalanced distribution of the views variable, I have chosen to reintroduce a binary version of the variable to better support analysis.
This binary indicator (views recoded to 0/1), which distinguishes between listings with zero and non-zero views, simplifies the target and addresses the sparsity of meaningful signal in the original count data. By focusing on whether a listing was viewed at all, rather than how many times, the analysis can more effectively identify features associated with visibility while avoiding issues related to class imbalance and low granularity in the original variable.
# use data_raw1 but add the binary variable again
data_raw1['views'] = np.where(data_raw1['views'] > 0, 1, data_raw1['views'])
# check the binary distribution in views after transformation
view_counts = data_raw1['views'].value_counts()
view_percs = data_raw1['views'].value_counts(normalize=True)
pd.concat([view_counts,round(view_percs,2)], axis=1, keys=['count', 'percentage'])
count | percentage | |
---|---|---|
views | ||
0 | 4427 | 0.98 |
1 | 108 | 0.02 |
Due to the still prevailing class imbalance between viewed and not viewed hotels, where only 2% of all hotels have any views, we will have to resample the training data to overcome the limitations that can otherwise create strong bias when looking at causal relationships and feature importance.

Because predictive classification models usually perform poorly on the minority class, which in our case is the viewed class with only 2% of the observations, we need to address this issue when applying a machine learning model to this dataset. Otherwise, when training our ML model for feature selection, we would most likely obtain poor results in detecting the minority class, i.e. 1 (viewed), even though this class is actually the more relevant one for us to detect and understand in this assignment.
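Resampling is the route taken below, but for completeness, both model families also support cost-sensitive training, which reweights the loss instead of altering the data. A minimal sketch of that alternative (not used in this project; the 4427/108 ratio comes from the class counts shown above):

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
# penalize minority-class errors more heavily instead of resampling
rf_weighted = RandomForestClassifier(class_weight='balanced')
xgb_weighted = XGBClassifier(scale_pos_weight=4427/108)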
# import necessary ML packages
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, plot_importance
from sklearn.impute import KNNImputer
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score
data_raw1.columns
Index(['ID', 'hotel_url', 'name', 'views', 'score_adjusted', 'bubble_rating', 'category_hotel', 'category_inn', 'category_specialty', 'class', ... 'amenities_Wardrobe / closet', 'amenities_Washing machine', 'amenities_Water park', 'amenities_Water park offsite', 'amenities_Waterslide', 'amenities_Waxing services', 'amenities_Whirlpool bathtub', 'amenities_Wifi', 'amenities_Wine / champagne', 'amenities_Yoga classes'], dtype='object', length=265)
Some features are redundant. The class dummies class_4_5 and class_3_4_5 duplicate information already contained in class, so we drop them:
data_raw1.drop(columns=['class_4_5', 'class_3_4_5'], inplace=True)
# nan values
data_raw1.isna().sum().sort_values(ascending=False).head(10) ## we should exclude price_max and price_min right away due to the low data availability
price_max 1127 price_min 1125 photos 577 location_grade 216 amenities_Patio 0 amenities_Parking garage 0 amenities_Parking 0 amenities_Paid wifi 0 amenities_Paid public parking on-site 0 amenities_Paid public parking nearby 0 dtype: int64
# check for photos for potential imputation
data_raw1['photos'].value_counts().sort_index() ## impute 1 instead of NaN, as there are no 0 counts in the data (by ASSUMPTION that every hotel shows at least 1 photo)
photos 2.0 75 3.0 73 4.0 64 5.0 272 6.0 153 ... 1372.0 1 1382.0 1 1423.0 1 1520.0 1 1589.0 1 Name: count, Length: 553, dtype: int64
# check for location_grade for potential imputation
data_raw1['location_grade'].value_counts().sort_index() ## impute location_grade using the hotel's 'nearest neighbors', statistically speaking (KNN imputation below)
location_grade 36.0 8 37.0 11 38.0 10 39.0 13 40.0 10 ... 96.0 38 97.0 53 98.0 48 99.0 78 100.0 2959 Name: count, Length: 65, dtype: int64
# drop columns for ML application
data = data_raw1.drop(columns=['ID', 'hotel_url', 'name', 'views', 'price_max', 'price_min'])
# photos impute
data['photos'] = data['photos'].fillna(1)
# check remaining nan values
data.isna().sum().sort_values(ascending=False).head(3)
location_grade 216 score_adjusted 0 amenities_Paid private parking nearby 0 dtype: int64
# KNNImputer to impute location_grade
imputer = KNNImputer(n_neighbors=10, weights='uniform').set_output(transform='pandas')
data = imputer.fit_transform(data)
# check remaining nan values
data.isna().sum().sort_values(ascending=False).head(3)
score_adjusted 0 amenities_Indoor pool 0 amenities_Outdoor dining area 0 dtype: int64
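To make the imputation step concrete, here is a tiny toy example (hypothetical values, not from the hotel data) of how KNNImputer works: the missing entry is filled with the mean of that column over the k rows nearest in the non-missing features.

# toy illustration: the NaN in b is replaced by the mean of b over the 2 rows
# closest in a, i.e. (10.0 + 30.0) / 2 = 20.0
toy = pd.DataFrame({'a': [1.0, 2.0, 3.0, 9.0], 'b': [10.0, np.nan, 30.0, 90.0]})
KNNImputer(n_neighbors=2).fit_transform(toy)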
# Split df into features and target variable
X = data
y = data_raw1['views']
# check for y class distribution
y.value_counts()
views 0 4427 1 108 Name: count, dtype: int64
Now, let's split the data into training and test set
# Split the data into train/test
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=100, stratify=y)
round(y_train.value_counts(normalize=True),2)
views 0 0.98 1 0.02 Name: proportion, dtype: float64
round(y_test.value_counts(normalize=True),2)
views 0 0.98 1 0.02 Name: proportion, dtype: float64
The overall class distribution of the target variable has been preserved after splitting the data: both the training and the test set contain 2% positive cases of y.

Now, it is important to create class balance in the training set, which allows us to train a less biased model with better generalization and predictive performance.

For this, the disparity between the classes will first be smoothed via resampling. Next, we will compare different ML models using cross-validation to obtain a more generalized performance overview, and then pick the best performing model on the test set to retrieve feature importances, since the best performing model captures the complexity and variance in the data best.
# Perform under sampling to create balanced dataset
rus = RandomUnderSampler(random_state=100)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)
y_train_rus.value_counts()
views 0 86 1 86 Name: count, dtype: int64
ros = RandomOverSampler(random_state=100)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
y_train_ros.value_counts()
views 0 3542 1 3542 Name: count, dtype: int64
So now, we will work with random oversampling as well as random undersampling separately on the chosen ML models. This will allow us to draw conclusions on which approach is better suited for this data, given that each has advantages and disadvantages over the other.

My instinct tells me that oversampling might be slightly more powerful in training the models because of the larger pool of data to learn from: it retains the full 80% training split, while the undersampling approach restricts the models to the 86 positive observations present in the training dataset.

On the other hand, a pitfall of the oversampling approach could be that the repeatedly sampled positive observations provide no new insight and are not representative of the real patterns we are trying to uncover in this exercise.
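A common remedy for exactly this pitfall is SMOTE, which synthesizes new minority samples by interpolating between existing ones rather than copying rows verbatim. A minimal sketch using the same imblearn library (not applied in this project, but a natural extension):

from imblearn.over_sampling import SMOTE
# interpolate between minority neighbors instead of duplicating rows
smote = SMOTE(random_state=100)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
y_train_smote.value_counts()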
As for metrics, the metric used to select the best performing ML model will be recall, also known as the true positive rate, because we want to detect as many actual positives, i.e. actually viewed and therefore successful hotels, as possible.
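As a reminder, recall for the positive class is TP / (TP + FN). One nuance worth flagging: the average='weighted' variant used in the evaluation cells below averages the per-class recalls weighted by class support, so with a 98/2 split it is dominated by the majority class. A tiny worked example with toy labels illustrates the difference:

from sklearn.metrics import recall_score
y_true_toy = [0, 0, 0, 0, 1, 1]
y_pred_toy = [0, 0, 0, 1, 1, 0]
print(recall_score(y_true_toy, y_pred_toy))                      # positive-class recall: 1/2 = 0.5
print(recall_score(y_true_toy, y_pred_toy, average='weighted'))  # (4/6)*0.75 + (2/6)*0.5 = 0.667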
cv_params = {'max_depth': [3,5,7],
'min_samples_leaf': [2,3,4],
'min_samples_split': [2,3,4],
'max_features': [4,8,12],
'n_estimators': [50, 75, 100]
}
# Define the scoring metric
scoring = 'recall'
rf_u = GridSearchCV(RandomForestClassifier(),
param_grid=cv_params, cv=5, scoring=scoring)
rf_o = GridSearchCV(RandomForestClassifier(),
param_grid=cv_params, cv=5, scoring=scoring)
# Fit undersampled model
rf_u.fit(X_train_rus, y_train_rus)
# Fit oversampled model
rf_o.fit(X_train_ros, y_train_ros)
GridSearchCV(cv=5, estimator=RandomForestClassifier(), param_grid={'max_depth': [3, 5, 7], 'max_features': [4, 8, 12], 'min_samples_leaf': [2, 3, 4], 'min_samples_split': [2, 3, 4], 'n_estimators': [50, 75, 100]}, scoring='recall')
print('Random Forest Undersampling:', rf_u.best_estimator_, 'score:', rf_u.best_score_)
print('Random Forest Oversampling:', rf_o.best_estimator_, 'score:', rf_o.best_score_)
Random Forest Undersampling: RandomForestClassifier(max_depth=3, max_features=4, min_samples_leaf=2, min_samples_split=4, n_estimators=50) score: 0.9411764705882353 Random Forest Oversampling: RandomForestClassifier(max_depth=7, max_features=12, min_samples_leaf=3, min_samples_split=4, n_estimators=50) score: 0.982781509725642
# undersampling prediction on test set
rf_u_model = rf_u.best_estimator_
rf_u_model.random_state = 99
rf_u_model.fit(X_train_rus, y_train_rus)
rf_u_pred = rf_u_model.predict(X_test)
print('Recall:', recall_score(y_test,
rf_u_pred,
average="weighted"))
Recall: 0.8974641675854466
# oversampling prediction on test set
rf_o_model = rf_o.best_estimator_
rf_o_model.random_state = 99
rf_o_model.fit(X_train_ros, y_train_ros)
rf_o_pred = rf_o_model.predict(X_test)
print('Recall:', recall_score(y_test,
rf_o_pred,
average="weighted"))
Recall: 0.9437706725468578
It appears that the oversampling random forest model performs considerably better with respect to the recall score than the undersampling model. Therefore, we choose the oversampling random forest, which shows a recall score of 94.4%, for the later comparison with the alternative ML model.
cv_params_xgb = {
'subsample': [0.3, 0.5, 0.7],
'colsample_bytree': [0.6, 0.8, 1.0],
'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.05, 0.01],
'alpha': [0.25, 0.5, 0.75]
}
# Define the scoring metric
scoring = 'recall'
xgb_u = GridSearchCV(XGBClassifier(),
param_grid=cv_params_xgb, cv=5, scoring=scoring)
xgb_o = GridSearchCV(XGBClassifier(),
param_grid=cv_params_xgb, cv=5, scoring=scoring)
# Fit undersampled model
xgb_u.fit(X_train_rus, y_train_rus)
# Fit oversampled model
xgb_o.fit(X_train_ros, y_train_ros)
GridSearchCV(cv=5, estimator=XGBClassifier(...), param_grid={'alpha': [0.25, 0.5, 0.75], 'colsample_bytree': [0.6, 0.8, 1.0], 'learning_rate': [0.1, 0.05, 0.01], 'max_depth': [3, 5, 7], 'subsample': [0.3, 0.5, 0.7]}, scoring='recall')
print('XGB Undersampling:', xgb_u.best_params_, 'score:', xgb_u.best_score_)
print('XGB Oversampling:', xgb_o.best_params_, 'score:', xgb_o.best_score_)
XGB Undersampling: {'alpha': 0.5, 'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'subsample': 0.7} score: 0.9411764705882353 XGB Oversampling: {'alpha': 0.25, 'colsample_bytree': 0.6, 'learning_rate': 0.1, 'max_depth': 3, 'subsample': 0.3} score: 1.0
# undersampling prediction on test set
xgb_u_model = XGBClassifier(**xgb_u.best_params_)
xgb_u_model.fit(X_train_rus, y_train_rus)
xgb_u_pred = xgb_u_model.predict(X_test)
print('Recall:', recall_score(y_test,
xgb_u_pred,
average="weighted"))
Recall: 0.9206174200661521
# oversampling prediction on test set
xgb_o_model = XGBClassifier(**xgb_o.best_params_)
xgb_o_model.fit(X_train_ros, y_train_ros)
xgb_o_pred = xgb_o_model.predict(X_test)
print('Recall:', recall_score(y_test,
xgb_o_pred,
average="weighted"))
Recall: 0.9614112458654906
# combine the metrics into one table/overview
metrics = pd.DataFrame({
'Model': ['Random Forest (Undersampling)', 'Random Forest (Oversampling)',
'XGBoost (Undersampling)', 'XGBoost (Oversampling)'],
'Recall': [recall_score(y_test, rf_u_pred, average="weighted"),
recall_score(y_test, rf_o_pred, average="weighted"),
recall_score(y_test, xgb_u_pred, average="weighted"),
recall_score(y_test, xgb_o_pred, average="weighted")],
})
metrics
Model | Recall | |
---|---|---|
0 | Random Forest (Undersampling) | 0.897464 |
1 | Random Forest (Oversampling) | 0.943771 |
2 | XGBoost (Undersampling) | 0.920617 |
3 | XGBoost (Oversampling) | 0.961411 |
As the summary above shows, the best performing model on the test set was XGBoost trained on the oversampled data, achieving a recall score of 96.14%. One caveat: its perfect cross-validation recall of 1.0 on the oversampled training folds suggests the duplicated minority samples were easy to memorize, which is why the test-set score is the more trustworthy figure.
Now, having found the best model, we can proceed to identify the most important features that led to this high performance. Remember that the overall objective is to leverage ML as a tool for feature selection, i.e. to get an idea of which features matter most for predicting the views class of a TripAdvisor hotel listing. On these most important features, we will then perform causal inference/discovery, in order to measure causal relationships between hotel characteristics and the target variable views.
best_model = xgb_o_model
# Visualize the feature importance of the XGBoost oversampling model
plot_importance(best_model, importance_type='weight', max_num_features=10)
plt.title("Feature Importance (XGBoost Oversampling Model)")
plt.show()
We will now choose a reasonable cut-off point that allows some comparability between the feature importances. In other words, we can see a clear distinction in the importance and contribution of the first three features, score_adjusted, n_reviews, and photos. These three features seem most promising for finding any causal impact on our target variable. I propose to also include location_grade, because it might be relevant as well and still shows a certain significance on the feature importance scale.
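To read off the exact numbers behind the plot and support this cut-off, the raw scores can be pulled from the fitted booster (a sketch; importance_type='weight' counts how often a feature is used in a split, matching the plot above):

# numeric importance scores corresponding to the plot
scores = best_model.get_booster().get_score(importance_type='weight')
pd.Series(scores).sort_values(ascending=False).head(10)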
Let's now begin our Causal Discovery!
data.columns
Index(['score_adjusted', 'bubble_rating', 'category_hotel', 'category_inn', 'category_specialty', 'class', 'n_reviews', 'location_grade', 'discount', 'discount_perc', ... 'amenities_Wardrobe / closet', 'amenities_Washing machine', 'amenities_Water park', 'amenities_Water park offsite', 'amenities_Waterslide', 'amenities_Waxing services', 'amenities_Whirlpool bathtub', 'amenities_Wifi', 'amenities_Wine / champagne', 'amenities_Yoga classes'], dtype='object', length=257)
# First create a new dataset with only the target variable and the four selected features
cd_data = pd.concat([data[['n_reviews', 'score_adjusted', 'photos', 'location_grade']], y], axis=1)
cd_data
n_reviews | score_adjusted | photos | location_grade | views | |
---|---|---|---|---|---|
0 | 154.0 | 4.409091 | 135.0 | 100.0 | 0 |
4 | 1068.0 | 4.624299 | 784.0 | 100.0 | 0 |
6 | 3457.0 | 4.744650 | 1382.0 | 100.0 | 0 |
12 | 2905.0 | 4.507221 | 1148.0 | 100.0 | 1 |
14 | 2033.0 | 4.046706 | 669.0 | 100.0 | 1 |
... | ... | ... | ... | ... | ... |
4594 | 15.0 | 1.733333 | 3.0 | 96.0 | 0 |
4595 | 9.0 | 1.888889 | 2.0 | 100.0 | 0 |
4596 | 11.0 | 2.090909 | 9.0 | 100.0 | 0 |
4597 | 98.0 | 2.000000 | 54.0 | 100.0 | 0 |
4598 | 376.0 | 1.686170 | 268.0 | 100.0 | 0 |
4535 rows × 5 columns
cd_data.info()
<class 'pandas.core.frame.DataFrame'> Index: 4535 entries, 0 to 4598 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 n_reviews 4535 non-null float64 1 score_adjusted 4535 non-null float64 2 photos 4535 non-null float64 3 location_grade 4535 non-null float64 4 views 4535 non-null int64 dtypes: float64(4), int64(1) memory usage: 212.6 KB
# import library for causal discovery and inference
from lingam import DirectLiNGAM
import networkx as nx
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
cd_data_scaled = pd.DataFrame(
scaler.fit_transform(cd_data),
columns=cd_data.columns)
# Fit DirectLiNGAM on scaled data
model = DirectLiNGAM()
model.fit(cd_data_scaled)
# Extract adjacency matrix
adj_matrix = model.adjacency_matrix_
causal_df = pd.DataFrame(adj_matrix, index=cd_data.columns, columns=cd_data.columns)
# Create a causal graph
DAG = nx.DiGraph()
for i, source in enumerate(cd_data.columns):
    for j, target in enumerate(cd_data.columns):
        # note: this reads adjacency_matrix_[i, j] as an edge source -> target;
        # lingam documents the model as x = Bx + e with B[i, j] the effect of
        # x_j on x_i, so the edge direction should be checked against the
        # library version in use
        weight = adj_matrix[i, j]
        if abs(weight) > 0.0:
            DAG.add_edge(source, target, weight=round(weight, 3))
# Draw graph
pos = nx.spring_layout(DAG)
edge_labels = nx.get_edge_attributes(DAG, 'weight')
plt.figure(figsize=(10, 6))
nx.draw(DAG, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=7000, font_size=10)
nx.draw_networkx_edge_labels(DAG, pos, edge_labels=edge_labels, font_color='red')
plt.title("Causal Graph via DirectLiNGAM")
plt.show()
# causal_df (created above) already holds the weighted adjacency matrix,
# so we only list its non-zero entries, i.e. the detected causal effects
effects_df = causal_df[causal_df != 0].stack().reset_index()
effects_df.columns = ['Cause', 'Effect', 'Strength']
print(effects_df.sort_values(by='Strength', ascending=False))
Cause Effect Strength 1 photos n_reviews 0.876846 0 n_reviews views 0.459328 3 photos views 0.038678 2 photos score_adjusted 0.022530
Here we have now created our first causal graph using the DirectLiNGAM method.

We can try to make sense of this DAG and make some assumptions to arrive at a more realistic view of the causal relationships towards the target variable views.

From the relationships and their strengths, the following is most worth noting: n_reviews is identified as a direct cause of views, with a causal effect of 0.459.
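As a possible robustness check, the lingam package also offers bootstrapping, which refits the model on resampled data and counts how often each causal direction reappears. A hedged sketch (n_sampling and the thresholds below are illustrative choices, not tuned values):

# bootstrap the DirectLiNGAM fit to gauge the stability of the detected edges
boot_model = DirectLiNGAM()
boot_result = boot_model.bootstrap(cd_data_scaled, n_sampling=100)
# directions appearing across resamples, ignoring near-zero effects
print(boot_result.get_causal_direction_counts(n_directions=8, min_causal_effect=0.01))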