import pandas as pd
import numpy as np
import pytimetk as tk
from sklearn.ensemble import RandomForestRegressor
# Load the 'walmart_sales_weekly' sample dataset through pytimetk, parsing the
# 'Date' column as datetimes.
dset = tk.load_dataset('walmart_sales_weekly', parse_dates=['Date'])

# Drop the columns that the rest of this script does not reference.
dset = dset.drop(columns=[
    'id',     # This column can be removed as it is equivalent to 'Dept'
    'Store',  # This column has only one possible value
    'Type',   # This column has only one possible value
    'Size',   # This column has only one possible value
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
    'IsHoliday', 'Temperature', 'Fuel_Price', 'CPI',
    'Unemployment',
])
dset.head()

# `sales_df` is the name the following steps work with.
sales_df = dset
# Build a future frame per department (length_out=5) on the 'Date' column.
sales_df_with_futureframe = (
    sales_df
    .groupby('Dept')
    .future_frame(date_column='Date', length_out=5)
)

# Add the time-series signature columns derived from 'Date' (the 'Date_*'
# columns, some of which are used as model features further down).
sales_df_dates = sales_df_with_futureframe.augment_timeseries_signature(
    date_column='Date',
)
sales_df_dates.head(10)
# Per department, add lagged versions of 'Weekly_Sales' for lags 5 through 9.
df_with_lags = (
    sales_df_dates
    .groupby('Dept')
    .augment_lags(
        date_column='Date',
        value_column='Weekly_Sales',
        lags=[5, 6, 7, 8, 9],
    )
)

# Every column whose name contains 'lag' is fed into the rolling step.
lag_columns = [name for name in df_with_lags.columns if 'lag' in name]

# Per department, add a rolling mean (window of 4) for each lag column.
df_with_rolling = (
    df_with_lags
    .groupby('Dept')
    .augment_rolling(
        date_column='Date',
        value_column=lag_columns,
        window=4,
        window_func='mean',
        threads=1,  # Change to -1 to use all available cores
    )
)
df_with_rolling[df_with_rolling['Dept'] == 1].head(10)
df_with_lags.head(5)
# Collect every lag-derived column (raw lags and their rolling means) and drop
# the rows where any of them is missing.
all_lag_columns = [col for col in df_with_rolling.columns if 'lag' in col]
df_no_nas = df_with_rolling.dropna(subset=all_lag_columns)
df_no_nas.head()

# Rows without a 'Weekly_Sales' value are the ones to predict; rows with a
# value are the training data. Both are taken as explicit copies because new
# columns are assigned to them later; assigning to a bare boolean-mask
# selection triggers pandas' SettingWithCopyWarning.
future = df_no_nas[df_no_nas.Weekly_Sales.isnull()].copy()
train = df_no_nas[df_no_nas.Weekly_Sales.notnull()].copy()
# Feature columns for the model: the department, a few calendar columns
# derived from 'Date', and the lag 5-8 columns with their rolling means.
train_columns = [
    'Dept',
    'Date_year',
    'Date_month',
    'Date_yweek',
    'Date_mweek',
    'Weekly_Sales_lag_5',
    'Weekly_Sales_lag_6',
    'Weekly_Sales_lag_7',
    'Weekly_Sales_lag_8',
    'Weekly_Sales_lag_5_rolling_mean_win_4',
    'Weekly_Sales_lag_6_rolling_mean_win_4',
    'Weekly_Sales_lag_7_rolling_mean_win_4',
    'Weekly_Sales_lag_8_rolling_mean_win_4',
]
X = train[train_columns]

# Use the 1-D Series as the target. A one-column DataFrame is a column vector,
# which makes scikit-learn emit a DataConversionWarning on fit.
y = train['Weekly_Sales']

# Fixed random_state keeps the fitted forest reproducible between runs.
model = RandomForestRegressor(random_state=123)
model = model.fit(X, y)

# Predict the rows that have no observed 'Weekly_Sales' and store the result
# next to them in 'y_pred'.
predicted_values = model.predict(future[train_columns])
future['y_pred'] = predicted_values
future.head(10)
# Tag each frame with its origin so both can be told apart once stacked.
train['type'] = 'actuals'
future['type'] = 'prediction'

# Stack the observed rows and the predicted rows into one frame.
full_df = pd.concat([train, future])
full_df.head(10)

# Keep the observed value for 'actuals' rows; use 'y_pred' for all others.
full_df['Weekly_Sales'] = np.where(
    full_df['type'] == 'actuals',
    full_df['Weekly_Sales'],
    full_df['y_pred'],
)
# Plot 'Weekly_Sales' over 'Date' per department with the plotly engine,
# coloured by the 'type' column, two facets per row, smoother switched off.
(
    full_df
    .groupby('Dept')
    .plot_timeseries(
        date_column='Date',
        value_column='Weekly_Sales',
        color_column='type',
        smooth=False,
        smooth_alpha=0,
        facet_ncol=2,
        facet_scales="free",
        y_intercept_color=tk.palette_timetk()['steel_blue'],
        width=800,
        height=600,
        engine='plotly',
    )
)