Predicting Walmart Store Sales¶

In [2]:
# import packages

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
In [3]:
data_raw = pd.read_csv(r'C:\Users\felix\OneDrive\Dokumente\Python Projects\Walmart Sales Forecast\walmart_cleaned.csv')
data_raw = data_raw.drop('Unnamed: 0', axis=1)
data_raw.head()
Out[3]:
Store Date IsHoliday Dept Weekly_Sales Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI Unemployment Type Size
0 1 2010-02-05 0 1.0 24924.50 42.31 2.572 0.0 0.0 0.0 0.0 0.0 211.096358 8.106 3 151315
1 1 2010-02-05 0 26.0 11737.12 42.31 2.572 0.0 0.0 0.0 0.0 0.0 211.096358 8.106 3 151315
2 1 2010-02-05 0 17.0 13223.76 42.31 2.572 0.0 0.0 0.0 0.0 0.0 211.096358 8.106 3 151315
3 1 2010-02-05 0 45.0 37.44 42.31 2.572 0.0 0.0 0.0 0.0 0.0 211.096358 8.106 3 151315
4 1 2010-02-05 0 28.0 1085.29 42.31 2.572 0.0 0.0 0.0 0.0 0.0 211.096358 8.106 3 151315
In [4]:
data_raw.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421570 entries, 0 to 421569
Data columns (total 16 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Store         421570 non-null  int64  
 1   Date          421570 non-null  object 
 2   IsHoliday     421570 non-null  int64  
 3   Dept          421570 non-null  float64
 4   Weekly_Sales  421570 non-null  float64
 5   Temperature   421570 non-null  float64
 6   Fuel_Price    421570 non-null  float64
 7   MarkDown1     421570 non-null  float64
 8   MarkDown2     421570 non-null  float64
 9   MarkDown3     421570 non-null  float64
 10  MarkDown4     421570 non-null  float64
 11  MarkDown5     421570 non-null  float64
 12  CPI           421570 non-null  float64
 13  Unemployment  421570 non-null  float64
 14  Type          421570 non-null  int64  
 15  Size          421570 non-null  int64  
dtypes: float64(11), int64(4), object(1)
memory usage: 51.5+ MB

Data Exploration¶

In [5]:
data_raw.groupby('Store')['Store'].count()
Out[5]:
Store
1     10244
2     10238
3      9036
4     10272
5      8999
6     10211
7      9762
8      9895
9      8867
10    10315
11    10062
12     9705
13    10474
14    10040
15     9901
16     9443
17     9864
18     9859
19    10148
20    10214
21     9582
22     9688
23    10050
24    10228
25     9804
26     9854
27    10225
28    10113
29     9455
30     7156
31    10142
32    10202
33     6487
34    10224
35     9528
36     6222
37     7206
38     7362
39     9878
40    10017
41    10088
42     6953
43     6751
44     7169
45     9637
Name: Store, dtype: int64

This shows that not every store has the same number of observations. The counts are nevertheless relatively balanced across stores, so they should not present an issue for model training and subsequent model performance.
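To back up the "relatively balanced" claim, a quick check of the spread of observation counts can be run; a minimal sketch reusing the groupby above:

# Quantify how balanced the per-store observation counts are (a sketch);
# the smallest store still keeps a substantial share of the largest store's observations
obs_per_store = data_raw.groupby('Store')['Store'].count()
print("min:", obs_per_store.min(), "max:", obs_per_store.max(),
      "ratio:", round(obs_per_store.min() / obs_per_store.max(), 2))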

In [6]:
data_raw.describe()
Out[6]:
Store IsHoliday Dept Weekly_Sales Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI Unemployment Type Size
count 421570.000000 421570.000000 421570.000000 421570.000000 421570.000000 421570.000000 421570.000000 421570.000000 421570.000000 421570.000000 421570.000000 421570.000000 421570.000000 421570.000000 421570.000000
mean 22.200546 0.070358 44.260317 15981.258123 60.090059 3.361027 2590.074819 879.974298 468.087665 1083.132268 1662.772385 171.201947 7.960289 2.410088 136727.915739
std 12.785297 0.255750 30.492054 22711.183519 18.447931 0.458515 6052.385934 5084.538801 5528.873453 3894.529945 4207.629321 39.159276 1.863296 0.666337 60980.583328
min 1.000000 0.000000 1.000000 -4988.940000 -2.060000 2.472000 0.000000 -265.760000 -29.100000 0.000000 0.000000 126.064000 3.879000 1.000000 34875.000000
25% 11.000000 0.000000 18.000000 2079.650000 46.680000 2.933000 0.000000 0.000000 0.000000 0.000000 0.000000 132.022667 6.891000 2.000000 93638.000000
50% 22.000000 0.000000 37.000000 7612.030000 62.090000 3.452000 0.000000 0.000000 0.000000 0.000000 0.000000 182.318780 7.866000 3.000000 140167.000000
75% 33.000000 0.000000 74.000000 20205.852500 74.280000 3.738000 2809.050000 2.200000 4.540000 425.290000 2168.040000 212.416993 8.572000 3.000000 202505.000000
max 45.000000 1.000000 99.000000 693099.360000 100.140000 4.468000 88646.760000 104519.540000 141630.610000 67474.850000 108519.280000 227.232807 14.313000 3.000000 219622.000000
In [7]:
data_raw['Date'] = pd.to_datetime(data_raw['Date']) 

figure, axis = plt.subplots(2, 2, figsize=(16, 10)) 
sales = data_raw.groupby('Date')['Weekly_Sales'].sum()
fuel_price = data_raw.groupby('Date')['Fuel_Price'].mean()
cpi = data_raw.groupby('Date')['CPI'].mean()
unemployment = data_raw.groupby('Date')['Unemployment'].mean()
time = data_raw['Date'].unique()

quarters = pd.PeriodIndex(time, freq='Q').unique()
quarter_starts = quarters.to_timestamp()   # tick positions at the start of each quarter
quarter_labels = quarters.astype(str)      # labels such as '2010Q1'

axis[0, 0].plot(time, sales)
axis[0, 0].set_title("Sales over time")
axis[0, 0].set_xticks(quarter_starts)
axis[0, 0].set_xticklabels(quarter_labels, rotation=45, fontsize=9)

axis[0, 1].plot(time, fuel_price)
axis[0, 1].set_title("Fuel Price over time")
axis[0, 1].set_xticks(quarter_starts)
axis[0, 1].set_xticklabels(quarter_labels, rotation=45, fontsize=9)

axis[1, 0].plot(time, cpi)
axis[1, 0].set_title("CPI over time")
axis[1, 0].set_xticks(quarter_starts)
axis[1, 0].set_xticklabels(quarter_labels, rotation=45, fontsize=9)

axis[1, 1].plot(time, unemployment)
axis[1, 1].set_title("Unemployment over time")
axis[1, 1].set_xticks(quarter_starts)
axis[1, 1].set_xticklabels(quarter_labels, rotation=45, fontsize=9)

plt.tight_layout()
plt.show()


In [8]:
# Correlation plot 

corr_matrix = data_raw.corr()

# Plot the heatmap of the correlation matrix
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap="crest",  fmt=".2f", linewidths=0.1)
plt.title('Correlation Matrix')
plt.show()

For simplicity, the target variable (weekly sales) will be predicted aggregated over all stores.¶

Split the data¶

In [9]:
sales
Out[9]:
Date
2010-02-05    49750740.50
2010-02-12    48336677.63
2010-02-19    48276993.78
2010-02-26    43968571.13
2010-03-05    46871470.30
                 ...     
2012-09-28    43734899.40
2012-10-05    47566639.31
2012-10-12    46128514.25
2012-10-19    45122410.57
2012-10-26    45544116.29
Name: Weekly_Sales, Length: 143, dtype: float64
In [10]:
fuel_price
Out[10]:
Date
2010-02-05    2.717869
2010-02-12    2.696102
2010-02-19    2.673666
2010-02-26    2.685642
2010-03-05    2.731816
                ...   
2012-09-28    3.858245
2012-10-05    3.848435
2012-10-12    3.897425
2012-10-19    3.878413
2012-10-26    3.791086
Name: Fuel_Price, Length: 143, dtype: float64
In [11]:
cpi
Out[11]:
Date
2010-02-05    167.398405
2010-02-12    167.384138
2010-02-19    167.338966
2010-02-26    167.691019
2010-03-05    167.727351
                 ...    
2012-09-28    175.919639
2012-10-05    175.998254
2012-10-12    176.123786
2012-10-19    176.295499
2012-10-26    176.182886
Name: CPI, Length: 143, dtype: float64
In [12]:
unemployment
Out[12]:
Date
2010-02-05    8.576731
2010-02-12    8.567309
2010-02-19    8.576351
2010-02-26    8.561375
2010-03-05    8.572689
                ...   
2012-09-28    7.225855
2012-10-05    6.942141
2012-10-12    6.952682
2012-10-19    6.948928
2012-10-26    6.943212
Name: Unemployment, Length: 143, dtype: float64
In [13]:
# choose 80% of the data for training and 20% for testing

train_dates = sales.index[:int(0.8*(len(sales)))]
test_dates = sales.index[int(0.8*(len(sales))):]
                        

train_sales = sales[:int(0.8*(len(sales)))]
test_sales = sales[int(0.8*(len(sales))):]

train_fuel_price = fuel_price[:int(0.8*(len(fuel_price)))]
test_fuel_price = fuel_price[int(0.8*(len(fuel_price))):]

train_cpi = cpi[:int(0.8*(len(cpi)))]
test_cpi = cpi[int(0.8*(len(cpi))):]

train_unemployment = unemployment[:int(0.8*(len(unemployment)))]
test_unemployment = unemployment[int(0.8*(len(unemployment))):]
                                 
In [14]:
# visualize the sales data in different colors

plt.figure(figsize=(16, 8))
plt.plot(train_sales, label='Train Sales', color='blue')
plt.plot(test_sales, label='Test Sales', color='green')
plt.legend()
plt.show()

Perform Linear Regression, Holt-Winters Exponential Smoothing & (S)ARIMA(X) to Predict Sales¶

1. Linear Regression¶

In [15]:
train_dates
Out[15]:
DatetimeIndex(['2010-02-05', '2010-02-12', '2010-02-19', '2010-02-26',
               '2010-03-05', '2010-03-12', '2010-03-19', '2010-03-26',
               '2010-04-02', '2010-04-09',
               ...
               '2012-02-03', '2012-02-10', '2012-02-17', '2012-02-24',
               '2012-03-02', '2012-03-09', '2012-03-16', '2012-03-23',
               '2012-03-30', '2012-04-06'],
              dtype='datetime64[ns]', name='Date', length=114, freq=None)
In [16]:
len(test_dates)
Out[16]:
29
In [17]:
# make dates ordinal for the model

# train_dates = train_dates.map(dt.datetime.toordinal)
# test_dates = test_dates.map(dt.datetime.toordinal)
In [18]:
# import packages for linear regression with statsmodels

import statsmodels.api as sm
In [19]:
# define the independent variables

X = np.column_stack((train_cpi, train_fuel_price, train_unemployment))
X = sm.add_constant(X)

# define the dependent variable

y = train_sales

# fit the model

model = sm.OLS(y, X)
results = model.fit()

# print the summary of the model

print(results.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:           Weekly_Sales   R-squared:                       0.081
Model:                            OLS   Adj. R-squared:                  0.056
Method:                 Least Squares   F-statistic:                     3.229
Date:                Thu, 16 Jan 2025   Prob (F-statistic):             0.0253
Time:                        12:35:44   Log-Likelihood:                -1936.3
No. Observations:                 114   AIC:                             3881.
Df Residuals:                     110   BIC:                             3892.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -6.66e+08   2.58e+08     -2.581      0.011   -1.18e+09   -1.55e+08
x1          3.439e+06   1.19e+06      2.884      0.005    1.08e+06     5.8e+06
x2         -6.235e+06   2.39e+06     -2.606      0.010    -1.1e+07   -1.49e+06
x3          1.825e+07    7.6e+06      2.402      0.018    3.19e+06    3.33e+07
==============================================================================
Omnibus:                       99.305   Durbin-Watson:                   1.476
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              910.495
Skew:                           2.979   Prob(JB):                    1.94e-198
Kurtosis:                      15.498   Cond. No.                     8.01e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 8.01e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [20]:
# predict 

X_test = np.column_stack((test_cpi, test_fuel_price, test_unemployment))
X_test = sm.add_constant(X_test)

y_pred = results.predict(X_test)

# visualize the prediction
plt.figure(figsize=(16, 8))
plt.plot(test_dates, test_sales.values, label='Test Sales', color='green')
plt.plot(test_dates, y_pred, label='Predicted Sales', color='red')
plt.xlabel('Date')  # Label for x-axis
plt.ylabel('Sales')  # Label for y-axis
plt.title('Test Sales vs. Predicted Sales')
plt.legend()
plt.show()
In [21]:
# calculate the mean squared error, rmse, mape

from sklearn.metrics import mean_squared_error
from math import sqrt


mse_lm = mean_squared_error(test_sales, y_pred)
rmse_lm = sqrt(mse_lm)
mape_lm = np.mean(np.abs((test_sales - y_pred) / test_sales)) * 100
In [22]:
mape_lm
Out[22]:
3.8847086034875957

As the comparison above shows, a linear regression predicting sales from the independent variables CPI (mean), unemployment rate (mean) and fuel price (mean) yields rather poor results: the training fit has an R-squared of only 0.081, and the test-set MAPE is about 3.9%. This suggests that the model is unable to capture the complexity of the underlying time series.
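One quick way to see what the linear model misses is to inspect its in-sample residuals over time; a minimal sketch, assuming the `results` object fitted above:

# Plot the OLS residuals over time; visible structure (trend/seasonality) indicates
# dynamics the linear model fails to capture (a sketch)
residuals = results.resid

plt.figure(figsize=(16, 4))
plt.plot(train_dates, residuals)
plt.axhline(0, color='black', linewidth=0.8)
plt.title('OLS Residuals over Time')
plt.xlabel('Date')
plt.ylabel('Residual')
plt.show()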

Therefore, it might be useful to explore other, more elaborate statistical models.

Holt-Winters Exponential Smoothing¶

In [23]:
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.holtwinters import SimpleExpSmoothing   
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
In [24]:
seasonal_decompose(sales, period=52).plot()
Out[24]:

The decomposition above splits the underlying sales data into trend, seasonal and residual components. The residuals are fairly tight and evenly distributed around the mean, and the plot suggests both seasonality and an upward trend in sales over time.
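To put rough numbers on these impressions, the strength of the trend and seasonal components can be estimated from the decomposition; a sketch following the common strength-of-components idea (values close to 1 indicate a strong component):

# Estimate trend and seasonal strength from the 52-week decomposition (a sketch);
# each strength compares residual variance to the variance of component + residual
decomp = seasonal_decompose(sales, period=52)
resid = decomp.resid
trend_strength = max(0.0, 1 - resid.var() / (decomp.trend + resid).var())
seasonal_strength = max(0.0, 1 - resid.var() / (decomp.seasonal + resid).var())
print("Trend strength:", round(trend_strength, 2))
print("Seasonal strength:", round(seasonal_strength, 2))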

a) Additive Model

In [25]:
# Holt-Winters Exponential Smoothing

HW_add = ExponentialSmoothing(train_sales, seasonal='add', seasonal_periods=52)
HW_add_model = HW_add.fit()
HW_add_pred = HW_add_model.forecast(len(test_sales))
c:\Users\felix\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:471: ValueWarning: No frequency information was provided, so inferred frequency W-FRI will be used.
  self._init_dates(dates, freq)
c:\Users\felix\anaconda3\Lib\site-packages\statsmodels\tsa\holtwinters\model.py:915: ConvergenceWarning: Optimization failed to converge. Check mle_retvals.
  warnings.warn(
In [26]:
# visualize the prediction
plt.figure(figsize=(16, 8))
plt.plot(test_dates, test_sales.values, label='Test Sales', color='green')
plt.plot(test_dates, HW_add_pred, label='Predicted Sales', color='red')
plt.xlabel('Date')  # Label for x-axis
plt.ylabel('Sales')  # Label for y-axis
plt.title('Test Sales vs. Predicted Sales')
plt.legend()
plt.show()
In [27]:
# calculate the mean squared error, rmse, mean absolute percentage error

mse_hw = mean_squared_error(test_sales, HW_add_pred)
rmse_hw = sqrt(mse_hw)
mape_hw = mean_absolute_percentage_error(test_sales, HW_add_pred)*100
In [28]:
mape_hw
Out[28]:
2.4886317003773626
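The additive model reaches a test MAPE of about 2.49%. Since the aggregated sales are strictly positive, a multiplicative seasonal variant would also be admissible; a minimal sketch using the same interface (not part of the original run, shown only for comparison):

b) Multiplicative Model (sketch)

# Holt-Winters with multiplicative seasonality, for comparison (a sketch)
HW_mul = ExponentialSmoothing(train_sales, seasonal='mul', seasonal_periods=52)
HW_mul_model = HW_mul.fit()
HW_mul_pred = HW_mul_model.forecast(len(test_sales))

mape_hw_mul = mean_absolute_percentage_error(test_sales, HW_mul_pred) * 100
print(round(mape_hw_mul, 2))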

(S)ARIMA(X) models¶

In [29]:
# Perform ARIMA model 

from pmdarima import auto_arima
from pmdarima.arima.utils import ndiffs, nsdiffs
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
In [30]:
# explore stationarity of the data with ndiffs and nsdiffs to determine the number of regular and seasonal differences needed

adf = ndiffs(train_sales, test='adf')
pp = ndiffs(train_sales, test='pp')
kpss = ndiffs(train_sales, test='kpss')

sd52 = nsdiffs(train_sales, m=52)
sd7= nsdiffs(train_sales, m=7)

print("Number of regular differences needed:", max(adf, pp, kpss))
print("Number of seasonal differences needed:", max(sd52, sd7)) 
Number of regular differences needed: 0
Number of seasonal differences needed: 0

Based on the stationarity tests performed above, neither regular nor seasonal differencing is needed, meaning the aggregated series can be treated as stationary. We can therefore proceed to the ACF and PACF plots to determine the respective lag orders.
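For transparency, the underlying unit-root test can also be run directly; a small sketch with statsmodels' adfuller, where a p-value below 0.05 rejects the unit-root null and thus supports stationarity:

# Run the ADF unit-root test directly on the aggregated training series (a sketch)
from statsmodels.tsa.stattools import adfuller

adf_stat, p_value, *_ = adfuller(train_sales)
print("ADF statistic:", round(adf_stat, 3), "p-value:", round(p_value, 4))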

In [31]:
# plot ACF and PACF to determine the order of the ARIMA model

plot_acf(train_sales, lags=52)

plot_pacf(train_sales, lags=52)
c:\Users\felix\anaconda3\Lib\site-packages\statsmodels\graphics\tsaplots.py:348: FutureWarning: The default method 'yw' can produce PACF values outside of the [-1,1] interval. After 0.13, the default will change tounadjusted Yule-Walker ('ywm'). You can use this method now by setting method='ywm'.
  warnings.warn(
Out[31]:

Based on the ACF plot, it looks like q <= 2 because the first two consecutive lags are significant.

From the PACF plot, it looks like p <= 1 because only the first lag is significant.

This is enough to set the corresponding bounds (max_p and max_q) in the auto_arima call.

In [32]:
# Fit auto_arima model

arima_model = auto_arima(train_sales, seasonal=False, information_criterion='bic',
                         start_p=0, start_q=0, max_p=1, max_q=2,
                         stepwise=True, suppress_warnings=True, trace=True, error_action="ignore")
Performing stepwise search to minimize bic
 ARIMA(0,0,0)(0,0,0)[0]             : BIC=4358.919, Time=0.02 sec
 ARIMA(1,0,0)(0,0,0)[0]             : BIC=inf, Time=0.02 sec
 ARIMA(0,0,1)(0,0,0)[0]             : BIC=4289.811, Time=0.02 sec
 ARIMA(1,0,1)(0,0,0)[0]             : BIC=3903.005, Time=0.04 sec
 ARIMA(1,0,2)(0,0,0)[0]             : BIC=3903.543, Time=0.09 sec
 ARIMA(0,0,2)(0,0,0)[0]             : BIC=4271.575, Time=0.04 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : BIC=3886.585, Time=0.03 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : BIC=3886.856, Time=0.02 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : BIC=3882.583, Time=0.03 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : BIC=3891.769, Time=0.02 sec

Best model:  ARIMA(1,0,0)(0,0,0)[0] intercept
Total fit time: 0.337 seconds

Since the ACF and PACF plots suggest that there may be some seasonal pattern in the data, we also want to build a seasonal model and investigate its performance. Without a seasonal component, the best model is ARIMA(1,0,0)(0,0,0)[0] with intercept, which achieves a BIC of 3882.583.

In [33]:
# Fit SARIMA model

sarima_model_0 = auto_arima(train_sales, seasonal=True, information_criterion='bic',
                            start_p=0, start_q=0, max_p=1, max_q=2, 
                            stepwise=True, suppress_warnings=True, trace=True, error_action="ignore")

sarima_model_7 = auto_arima(train_sales, seasonal=True, m=7, information_criterion='bic',
                            start_p=0, start_q=0, max_p=1, max_q=2, 
                            stepwise=True, suppress_warnings=True, trace=True, error_action="ignore")

sarima_model_52 = auto_arima(train_sales, seasonal=True, m=52, information_criterion='bic',
                            start_p=0, start_q=0, max_p=1, max_q=2, 
                            stepwise=True, suppress_warnings=True, trace=True, error_action="ignore")
Performing stepwise search to minimize bic
 ARIMA(0,0,0)(0,0,0)[0] intercept   : BIC=3891.769, Time=0.02 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : BIC=3882.583, Time=0.02 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : BIC=3886.856, Time=0.03 sec
 ARIMA(0,0,0)(0,0,0)[0]             : BIC=4358.919, Time=0.02 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : BIC=3886.585, Time=0.03 sec
 ARIMA(1,0,0)(0,0,0)[0]             : BIC=inf, Time=0.02 sec

Best model:  ARIMA(1,0,0)(0,0,0)[0] intercept
Total fit time: 0.135 seconds
Performing stepwise search to minimize bic
 ARIMA(0,0,0)(1,0,1)[7] intercept   : BIC=3900.259, Time=0.20 sec
 ARIMA(0,0,0)(0,0,0)[7] intercept   : BIC=3891.769, Time=0.02 sec
 ARIMA(1,0,0)(1,0,0)[7] intercept   : BIC=3887.304, Time=0.10 sec
 ARIMA(0,0,1)(0,0,1)[7] intercept   : BIC=3891.168, Time=0.04 sec
 ARIMA(0,0,0)(0,0,0)[7]             : BIC=4358.919, Time=0.01 sec
 ARIMA(1,0,0)(0,0,0)[7] intercept   : BIC=3882.583, Time=0.02 sec
 ARIMA(1,0,0)(0,0,1)[7] intercept   : BIC=3887.281, Time=0.05 sec
 ARIMA(1,0,0)(1,0,1)[7] intercept   : BIC=3892.259, Time=0.46 sec
 ARIMA(1,0,1)(0,0,0)[7] intercept   : BIC=3886.585, Time=0.03 sec
 ARIMA(0,0,1)(0,0,0)[7] intercept   : BIC=3886.856, Time=0.03 sec
 ARIMA(1,0,0)(0,0,0)[7]             : BIC=inf, Time=0.01 sec

Best model:  ARIMA(1,0,0)(0,0,0)[7] intercept
Total fit time: 0.971 seconds
Performing stepwise search to minimize bic
 ARIMA(0,0,0)(1,0,1)[52] intercept   : BIC=3869.159, Time=0.39 sec
 ARIMA(0,0,0)(0,0,0)[52] intercept   : BIC=3891.769, Time=0.03 sec
 ARIMA(1,0,0)(1,0,0)[52] intercept   : BIC=3867.127, Time=0.42 sec
 ARIMA(0,0,1)(0,0,1)[52] intercept   : BIC=3863.791, Time=0.31 sec
 ARIMA(0,0,0)(0,0,0)[52]             : BIC=4358.919, Time=0.01 sec
 ARIMA(0,0,1)(0,0,0)[52] intercept   : BIC=3886.856, Time=0.03 sec
 ARIMA(0,0,1)(1,0,1)[52] intercept   : BIC=3868.364, Time=0.49 sec
 ARIMA(0,0,1)(0,0,2)[52] intercept   : BIC=3863.424, Time=1.57 sec
 ARIMA(0,0,1)(1,0,2)[52] intercept   : BIC=3868.094, Time=3.39 sec
 ARIMA(0,0,0)(0,0,2)[52] intercept   : BIC=3863.269, Time=1.27 sec
 ARIMA(0,0,0)(0,0,1)[52] intercept   : BIC=3864.520, Time=0.25 sec
 ARIMA(0,0,0)(1,0,2)[52] intercept   : BIC=3867.969, Time=2.06 sec
 ARIMA(1,0,0)(0,0,2)[52] intercept   : BIC=3854.434, Time=1.69 sec
 ARIMA(1,0,0)(0,0,1)[52] intercept   : BIC=3855.591, Time=0.31 sec
 ARIMA(1,0,0)(1,0,2)[52] intercept   : BIC=3858.858, Time=2.52 sec
 ARIMA(1,0,0)(1,0,1)[52] intercept   : BIC=3858.043, Time=0.55 sec
 ARIMA(1,0,1)(0,0,2)[52] intercept   : BIC=3859.709, Time=2.18 sec
 ARIMA(1,0,0)(0,0,2)[52]             : BIC=3898.092, Time=1.90 sec

Best model:  ARIMA(1,0,0)(0,0,2)[52] intercept
Total fit time: 19.379 seconds

After experimenting with different values of m, the SARIMA specification ARIMA(1,0,0)(0,0,2)[52] with intercept yields the lowest BIC and therefore the best fit to the sales training data. This is the model we will use to predict on the test set when comparing ARIMA, SARIMA and SARIMAX.

Now, let's compare the performance of ARIMA(1,0,0)(0,0,0)[0] with intercept and SARIMA(1,0,0)(0,0,2)[52] with intercept on the test set. The better-performing model will then be extended with additional, independent variable(s) in an attempt to improve it further.

1. ARIMA Performance¶

In [34]:
# Fit ARIMA model: ARIMA(1,0,0)(0,0,0)[0] intercept

arima_model = ARIMA(train_sales, order=(1,0,0), trend='c')
arima_model_fit = arima_model.fit()
c:\Users\felix\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:471: ValueWarning: No frequency information was provided, so inferred frequency W-FRI will be used.
  self._init_dates(dates, freq)
In [35]:
# plot fit to train data

plt.figure(figsize=(16, 8))
plt.plot(train_dates, train_sales, label='Train Sales', color='blue')
plt.plot(train_dates, arima_model_fit.fittedvalues, label='Fitted Values', color='red')
plt.legend()
plt.show()
In [36]:
# predict on test set

arima_pred = arima_model_fit.forecast(len(test_sales))

# visualize the prediction
plt.figure(figsize=(16, 8))
plt.plot(test_dates, test_sales.values, label='Test Sales', color='green')
plt.plot(test_dates, arima_pred, label='Predicted Sales', color='red')
plt.xlabel('Date')  # Label for x-axis
plt.ylabel('Sales')  # Label for y-axis
plt.title('Test Sales vs. Predicted Sales')
plt.legend()
plt.show()

2. Seasonal ARIMA Model (SARIMA)¶

In [37]:
# Fit SARIMA Model: (S)ARIMA(1,0,0)(0,0,2)[52] intercept

sarima_model = ARIMA(train_sales, order=(1,0,0), seasonal_order=(0,0,2,52), trend='c')
sarima_model_fit = sarima_model.fit()
c:\Users\felix\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:471: ValueWarning: No frequency information was provided, so inferred frequency W-FRI will be used.
  self._init_dates(dates, freq)
c:\Users\felix\anaconda3\Lib\site-packages\statsmodels\tsa\statespace\sarimax.py:866: UserWarning: Too few observations to estimate starting parameters for seasonal ARMA. All parameters except for variances will be set to zeros.
  warn('Too few observations to estimate starting parameters%s.'
In [38]:
# plot fit to train data

plt.figure(figsize=(16, 8))
plt.plot(train_dates, train_sales, label='Train Sales', color='blue')
plt.plot(train_dates, sarima_model_fit.fittedvalues, label='Fitted Values', color='red')
plt.legend()
plt.show()
In [39]:
# predict on test set

sarima_pred = sarima_model_fit.forecast(len(test_sales))

# visualize the prediction
plt.figure(figsize=(16, 8))
plt.plot(test_dates, test_sales.values, label='Test Sales', color='green')
plt.plot(test_dates, sarima_pred, label='Predicted Sales', color='red')
plt.xlabel('Date')  # Label for x-axis
plt.ylabel('Sales')  # Label for y-axis
plt.title('Test Sales vs. Predicted Sales')
plt.legend()
plt.show()

It is clearly noticeable that the SARIMA model with its seasonal component outperforms the ARIMA model at predicting Walmart sales on the created train/test split. Let's now quantify both models' performance with the metrics chosen for this case: MSE, RMSE and MAPE.

In [40]:
# calculate the mean squared error, rmse, mape for ARIMA and SARIMA

mse_arima = mean_squared_error(test_sales, arima_pred)
rmse_arima = sqrt(mse_arima)
mape_arima = mean_absolute_percentage_error(test_sales, arima_pred)*100

mse_sarima = mean_squared_error(test_sales, sarima_pred)
rmse_sarima = sqrt(mse_sarima)
mape_sarima = mean_absolute_percentage_error(test_sales, sarima_pred)*100

print("ARIMA: RMSE = ", round(rmse_arima,2), "MAPE = ", round(mape_arima,2))
print("SARIMA: RMSE = ", round(rmse_sarima,2), "MAPE = ", round(mape_sarima,2))
ARIMA: RMSE =  1924110.71 MAPE =  3.19
SARIMA: RMSE =  1525706.06 MAPE =  2.47
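Before relying on these numbers, it is worth checking whether the SARIMA residuals still contain autocorrelation; a minimal sketch with a Ljung-Box test on the fitted model above (large p-values suggest the residuals behave like white noise):

# Ljung-Box test on the SARIMA residuals (a sketch)
from statsmodels.stats.diagnostic import acorr_ljungbox

lb_test = acorr_ljungbox(sarima_model_fit.resid, lags=[10, 20])
print(lb_test)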

SARIMAX: Boosting model performance through exogenous variables¶

In [41]:
# data quality check: verify that IsHoliday is identical across all stores for each date

holiday = data_raw.groupby('Date')['IsHoliday'].nunique()
print(max(holiday))


# make isholiday usable for the model
isholiday = data_raw.groupby('Date')['IsHoliday'].max()
isholiday
1
Out[41]:
Date
2010-02-05    0
2010-02-12    1
2010-02-19    0
2010-02-26    0
2010-03-05    0
             ..
2012-09-28    0
2012-10-05    0
2012-10-12    0
2012-10-19    0
2012-10-26    0
Name: IsHoliday, Length: 143, dtype: int64
In [42]:
# also use the average temperature for the model

temperature = data_raw.groupby('Date')['Temperature'].mean()
temperature

# add CPI and Unemployment to the model

cpi = data_raw.groupby('Date')['CPI'].mean()

unemployment = data_raw.groupby('Date')['Unemployment'].mean()

# for all objects, split the data as before in 80/20 split (train test)

train_fuel_price = fuel_price[:int(0.8*(len(fuel_price)))]
test_fuel_price = fuel_price[int(0.8*(len(fuel_price))):]

train_cpi = cpi[:int(0.8*(len(cpi)))]
test_cpi = cpi[int(0.8*(len(cpi))):]

train_unemployment = unemployment[:int(0.8*(len(unemployment)))]
test_unemployment = unemployment[int(0.8*(len(unemployment))):]

train_isholiday = isholiday[:int(0.8*(len(isholiday)))]
test_isholiday = isholiday[int(0.8*(len(isholiday))):]

train_temperature = temperature[:int(0.8*(len(temperature)))]
test_temperature = temperature[int(0.8*(len(temperature))):]
                               
# check lengths of all objects compared to train_sales and test_sales

print(len(train_fuel_price))
print(len(test_fuel_price))

print(len(train_cpi))
print(len(test_cpi))

print(len(train_unemployment))
print(len(test_unemployment))

print(len(train_isholiday))
print(len(test_isholiday))

print(len(train_temperature))
print(len(test_temperature))
                      
114
29
114
29
114
29
114
29
114
29
In [43]:
# Create SARIMAX model with exogenous variables

# Merge the exogenous variables into a single dataframe with the train sales data

exog_train = pd.concat([train_cpi, train_fuel_price, train_unemployment, train_isholiday, train_temperature], axis=1)
exog_train.columns = ['CPI', 'Fuel_Price', 'Unemployment', 'IsHoliday', 'Temperature']
print(exog_train)
# Fit SARIMAX model

# sarimax_model = ARIMA(train_sales, order=(1,0,0), seasonal_order=(0,0,2,52), exog=exog_train, trend='c')
                   CPI  Fuel_Price  Unemployment  IsHoliday  Temperature
Date                                                                    
2010-02-05  167.398405    2.717869      8.576731          0    33.277942
2010-02-12  167.384138    2.696102      8.567309          1    33.361810
2010-02-19  167.338966    2.673666      8.576351          0    37.038310
2010-02-26  167.691019    2.685642      8.561375          0    38.629563
2010-03-05  167.727351    2.731816      8.572689          0    42.373998
...                ...         ...           ...        ...          ...
2012-03-09  174.610303    3.751709      7.486587          0    47.016190
2012-03-16  174.857732    3.824211      7.486935          0    54.330752
2012-03-23  174.956546    3.865280      7.482098          0    56.960807
2012-03-30  174.871741    3.930751      7.485908          0    56.975768
2012-04-06  174.910693    3.963752      7.399025          0    56.407154

[114 rows x 5 columns]
In [44]:
# Fit the SARIMA model (S)ARIMA(1,0,0)(0,0,2)[52] intercept with exogenous variables

sarimax_model = ARIMA(train_sales, order=(1,0,0), seasonal_order=(0,0,2,52), exog=exog_train, trend='c')
sarimax_model_fit = sarimax_model.fit()
c:\Users\felix\anaconda3\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:471: ValueWarning: No frequency information was provided, so inferred frequency W-FRI will be used.
  self._init_dates(dates, freq)
c:\Users\felix\anaconda3\Lib\site-packages\statsmodels\tsa\statespace\sarimax.py:866: UserWarning: Too few observations to estimate starting parameters for seasonal ARMA. All parameters except for variances will be set to zeros.
  warn('Too few observations to estimate starting parameters%s.'
In [45]:
# predict on test set

sarimax_model_pred = sarimax_model_fit.forecast(len(test_sales), exog=pd.concat([test_cpi, test_fuel_price, test_unemployment, test_isholiday, test_temperature], axis=1))

# visualize the prediction

plt.figure(figsize=(16, 8))
plt.plot(test_dates, test_sales.values, label='Test Sales', color='green')
plt.plot(test_dates, sarimax_model_pred, label='Predicted Sales', color='red')
plt.xlabel('Date')  # Label for x-axis
plt.ylabel('Sales')  # Label for y-axis
plt.title('Test Sales vs. Predicted Sales')
plt.legend()
plt.show()
In [46]:
# calculate the mean squared error, rmse, mape for SARIMAX

mse_sarimax = mean_squared_error(test_sales, sarimax_model_pred)
rmse_sarimax = sqrt(mse_sarimax)
mape_sarimax = mean_absolute_percentage_error(test_sales, sarimax_model_pred)*100

print("SARIMAX: RMSE = ", round(rmse_sarimax,2), "MAPE = ", round(mape_sarimax,2))
SARIMAX: RMSE =  2653716.59 MAPE =  4.65

Results from (S)ARIMA(X)¶

The seasonal ARIMA model outperformed the plain ARIMA model. However, including the chosen exogenous variables appears to have introduced more noise than signal, and the model's predictive performance on the test set decreased.

Therefore, we can conclude that the SARIMA model was the best within its category.
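One way to examine the "added noise" interpretation is to look at the estimated coefficients and p-values of the exogenous regressors in the fitted SARIMAX model; a minimal sketch, assuming sarimax_model_fit from above:

# Inspect the SARIMAX coefficients; insignificant exogenous terms would support
# the interpretation that they add noise rather than signal (a sketch)
print(sarimax_model_fit.summary())

exog_cols = ['CPI', 'Fuel_Price', 'Unemployment', 'IsHoliday', 'Temperature']
pvals = sarimax_model_fit.pvalues
print(pvals[pvals.index.isin(exog_cols)])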

Overall results¶

In [47]:
# Create graphic that compares the performance of linear regression, Holt-Winters, SARIMA

models = ['Linear Regression', 'Holt-Winters', 'SARIMA']

rmse = [rmse_lm, rmse_hw, rmse_sarima]

mape = [mape_lm, mape_hw, mape_sarima]

# Colors for each model
model_colors = ['#1f77b4', '#ff7f0e', '#2ca02c']

# Positions for the ticks on the x-axis
metrics = ['RMSE', 'MAPE']
x_positions = np.arange(len(metrics))  # One position for each metric
bar_width = 0.1  # Width of each bar

# Create figure and axes
fig, ax1 = plt.subplots(figsize=(10, 6))

# Plot RMSE on the primary y-axis
for i, model in enumerate(models):
    ax1.bar(x_positions[0] - bar_width / 2 + i * bar_width, rmse[i], width=bar_width, color=model_colors[i], label=model)

# Create a secondary y-axis and plot MAPE on it
ax2 = ax1.twinx()
for i, model in enumerate(models):
    ax2.bar(x_positions[1] - bar_width / 2 + i * bar_width, mape[i], width=bar_width, color=model_colors[i], label=model)

# Set the x-ticks to be the metric names (RMSE and MAPE)
ax1.set_xticks(x_positions)
ax1.set_xticklabels(metrics)

# Add a legend
ax1.legend(loc='upper left', bbox_to_anchor=(1, 1))

# Add labels
ax1.set_ylabel('RMSE')
ax2.set_ylabel('MAPE')

# Add a title
plt.title('Model Performance Comparison')

# Show the plot
plt.tight_layout()  # Adjust layout to make room for the legend
plt.show()

Best Predictive Model: SARIMA¶
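Since SARIMA came out on top, a natural next step would be to refit the selected specification on the full aggregated series and forecast beyond the observed data; a minimal sketch (the 12-week horizon is an arbitrary choice for illustration):

# Refit the chosen SARIMA specification on the full series and forecast 12 weeks ahead (a sketch)
final_model = ARIMA(sales, order=(1, 0, 0), seasonal_order=(0, 0, 2, 52), trend='c')
final_fit = final_model.fit()
future_forecast = final_fit.forecast(steps=12)

plt.figure(figsize=(16, 8))
plt.plot(sales.index, sales.values, label='Observed Weekly Sales', color='blue')
plt.plot(future_forecast.index, future_forecast.values, label='12-Week Forecast', color='red')
plt.legend()
plt.show()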