import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
#pip install -U scikit-learn scipy
from sklearn.linear_model import LinearRegression
Data
Python Environment Version
I created a conda environment (py10) for this notebook, originally running Python 3.10.4 and later moved to Python 3.11. Always use python --version to check which interpreter you are on. The environment should also have the package pyearth installed.
Setting Figures
plot_params = {'color': '0.75',
               'style': '.-',
               'markeredgecolor': '0.25',
               'markerfacecolor': '0.25',
               'legend': False}
plt.style.use('seaborn-whitegrid')
plt.rc("figure",
       autolayout=True,
       figsize=(11, 4),
       titlesize=18,
       titleweight='bold',
)
plt.rc("axes",
       labelweight="bold",
       labelsize="large",
       titleweight="bold",
       titlesize=16,
       titlepad=10,
)
%config InlineBackend.figure_format = 'retina'
#pip install kaggle
#!kaggle kernels output ryanholbrook/linear-regression-with-time-series -p data
df = pd.read_csv('data/store-sales-time-series-forecasting/train.csv',
                 index_col='date',
                 parse_dates=['date']
)
df.info()
type(df.index)
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3000888 entries, 2013-01-01 to 2017-08-15
Data columns (total 5 columns):
# Column Dtype
--- ------ -----
0 id int64
1 store_nbr int64
2 family object
3 sales float64
4 onpromotion int64
dtypes: float64(1), int64(3), object(1)
memory usage: 137.4+ MB
pandas.core.indexes.datetimes.DatetimeIndex
Time Step Feature
Simply index time as x: Day 1, Day 2, etc.
# Engineer a time step feature
import numpy as np
gross_sale = df.groupby('date')[['sales']].sum()
gross_sale['Time'] = np.arange(len(gross_sale.index))
gross_sale.head(3)
sales | Time | |
---|---|---|
date | ||
2013-01-01 | 2511.618999 | 0 |
2013-01-02 | 496092.417944 | 1 |
2013-01-03 | 361461.231124 | 2 |
fig, ax = plt.subplots()
# scatter plot
ax.plot('Time', 'sales', data=gross_sale, color='0.75')
# regression plot
ax = sns.regplot(x='Time', y='sales', data=gross_sale, ci=None, scatter_kws=dict(color='0.25'))
ax.set_title('Time Plot of Gross Sales');
The advantage of a time step feature over the raw timestamp is that it scales very well, so you don't have to worry much about whether you measure time in day units or month units.
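A minimal sketch (toy data, not the store sales) of why the unit doesn't matter: rescaling the time step only rescales the fitted slope, while the fitted values stay identical.

```python
import numpy as np
from sklearn.linear_model import LinearRegression

# Toy series: 90 days of sales with a linear trend plus noise
rng = np.random.default_rng(0)
days = np.arange(90)
sales = 100 + 2.5 * days + rng.normal(scale=5, size=90)

# Fit once with time in days, once with time in (approximate) months
fit_days = LinearRegression().fit(days.reshape(-1, 1), sales)
fit_months = LinearRegression().fit((days / 30).reshape(-1, 1), sales)

# The slope rescales by 30, but the predictions are the same
print(fit_days.coef_[0], fit_months.coef_[0] / 30)
print(np.allclose(fit_days.predict(days.reshape(-1, 1)),
                  fit_months.predict((days / 30).reshape(-1, 1))))  # True
```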
Lag Features
This one is about using previous values to predict the next value.
gross_sale['Lag_1'] = gross_sale['sales'].shift(1)
fig, ax = plt.subplots()
ax = sns.regplot(x='Lag_1', y='sales', data=gross_sale, ci=None, scatter_kws=dict(color='0.25'))
ax.set_aspect('equal')
ax.set_title('Lag Plot of Hardcover Sales')
Text(0.5, 1.0, 'Lag Plot of Hardcover Sales')
Serial dependence: high sales on one day usually means high sales on the next day.
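One quick way to put a number on that serial dependence, using the gross_sale frame built above, is the lag-1 autocorrelation (a small sketch, not part of the original notebook):

```python
# Correlation between each day's sales and the previous day's sales;
# values near 1 indicate strong serial dependence
print(gross_sale['sales'].autocorr(lag=1))

# Equivalent, computed by hand with a shifted copy
print(gross_sale['sales'].corr(gross_sale['sales'].shift(1)))
```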
X = gross_sale.loc[:, ['Time']]
y = gross_sale.loc[:, 'sales']
print('X looks like this: '), print(X.head(3)), print('...'), print(f"X is {type(X)}")
print('y looks like this: '), print(y.head(3)), print('...'), print(f"y is {type(y)}")
# HINT: do you know now why, when fitting, they prefer the uppercase X?
# X is kept 2-D (a DataFrame feature matrix) while y is 1-D (a Series), which is what fit() expects
X looks like this:
Time
date
2013-01-01 0
2013-01-02 1
2013-01-03 2
...
X is <class 'pandas.core.frame.DataFrame'>
y looks like this:
date
2013-01-01 2511.618999
2013-01-02 496092.417944
2013-01-03 361461.231124
Name: sales, dtype: float64
...
y is <class 'pandas.core.series.Series'>
(None, None, None, None)
# fitting a model looks like this:
X = gross_sale.loc[:, ['Time']]
y = gross_sale.loc[:, 'sales']
model = LinearRegression()
model.fit(X, y)
pred_y = pd.Series(model.predict(X), index=X.index)
Plus: Multiple Assignment and Plotting Several Plots
# python hack about multi-assign
a, b = [1, 2]
a, (b, c) = [1, (2, 3)]
Trend
Data
df = pd.read_csv('data/store-sales-time-series-forecasting/train.csv',
                 index_col='date',
                 parse_dates=['date']
).to_period('D')
average_sales = df.groupby('date')['sales'].mean()
average_sales.head()
date
2013-01-01 1.409438
2013-01-02 278.390807
2013-01-03 202.840197
2013-01-04 198.911154
2013-01-05 267.873244
Freq: D, Name: sales, dtype: float64
Moving Average
A moving average is the idea of observing the average value within a window of time. But instead of those windows being mutually exclusive, the windows roll along the series.
len(average_sales.loc['2013'])
364
In the gross_sale data, one complete cycle is one year, so the window is set to 364 days. min_periods is typically half the window; it lets the rolling mean still produce a value near the edges of the series, where a full 364-day window is not available.
moving_average = average_sales.rolling(
    window=364,
    center=True,
    min_periods=183
).mean()
ax = average_sales.plot(style='.', color='0.5')
moving_average.plot(
    ax=ax,
    linewidth=3,
    title='Plotting Moving Average'
)
moving_average.sample(3), gross_sale.sample(3)
(date
2015-12-20 437.490098
2014-04-11 288.010483
2015-12-17 436.879765
Freq: D, Name: sales, dtype: float64,
sales Time Lag_1
date
2016-05-25 6.375120e+05 1237 606377.205216
2015-03-24 4.073697e+05 810 462664.237004
2016-04-02 1.150825e+06 1184 872467.320075)
Deterministic Model
"Deterministic process" is a fancy term for a linear regression on features derived purely from the time index. It is still a linear model; to make it work you need to convert the time index with to_period.
This process is similar to an ordinary linear model, except that instead of fitting x, y, z as three independent variables, you are fitting powers of x of different 'orders'.
\[ y = a + b\,x + c\,x^2 + d\,x^3 + ... + n\,x^{n} \]
(Somewhat reminds me of a Taylor series; maybe that's where "order" comes from.) What's interesting about this function is that higher-order polynomial terms can bend up and down, so the curve can fit almost any shape you like.
#pip install statsmodels
from statsmodels.tsa.deterministic import DeterministicProcess
y = average_sales.copy()
dp = DeterministicProcess(
    index=y.index,
    constant=True,  # dummy features
    order=3,        # time dummy trend, 1 is linear, 2 is quadratic, 3 cubic
    drop=True
)
X = dp.in_sample()
X.tail()
const | trend | trend_squared | trend_cubed | |
---|---|---|---|---|
date | ||||
2017-08-11 | 1.0 | 1680.0 | 2822400.0 | 4.741632e+09 |
2017-08-12 | 1.0 | 1681.0 | 2825761.0 | 4.750104e+09 |
2017-08-13 | 1.0 | 1682.0 | 2829124.0 | 4.758587e+09 |
2017-08-14 | 1.0 | 1683.0 | 2832489.0 | 4.767079e+09 |
2017-08-15 | 1.0 | 1684.0 | 2835856.0 | 4.775582e+09 |
X_fore = dp.out_of_sample(steps=90)
X_fore.head()
const | trend | trend_squared | trend_cubed | |
---|---|---|---|---|
2017-08-16 | 1.0 | 1685.0 | 2839225.0 | 4.784094e+09 |
2017-08-17 | 1.0 | 1686.0 | 2842596.0 | 4.792617e+09 |
2017-08-18 | 1.0 | 1687.0 | 2845969.0 | 4.801150e+09 |
2017-08-19 | 1.0 | 1688.0 | 2849344.0 | 4.809693e+09 |
2017-08-20 | 1.0 | 1689.0 | 2852721.0 | 4.818246e+09 |
from sklearn.linear_model import LinearRegression
y = average_sales
model = LinearRegression(fit_intercept=False)
model.fit(X, y)
y_pred = pd.Series(model.predict(X), index=X.index)
y_fore = pd.Series(model.predict(X_fore), index=X_fore.index)
ax = average_sales.plot(
    style='.', color='0.5', title='average sales'
)
ax = y_pred.plot(ax=ax, linewidth=3, label='Trend')  # underscore is for temporary variable
ax = y_fore.plot(ax=ax, linewidth=3, label="Trend Forecast", color='C3')
Risks of High-order Polynomials
Due to the nature of polynomials: > An order 11 polynomial will include terms like t ** 11. Terms like these tend to diverge rapidly outside of the training period making forecasts very unreliable.
dp = DeterministicProcess(
    index=y.index,
    order=11,  # time dummy trend, 1 is linear, 2 is quadratic, 3 cubic
)
X = dp.in_sample()

model = LinearRegression(fit_intercept=True)
model.fit(X, y)

X_fore = dp.out_of_sample(steps=90)
y_pred = pd.Series(model.predict(X), index=X.index)
y_fore = pd.Series(model.predict(X_fore), index=X_fore.index)

ax = y.plot(style='.', alpha=0.5, title="Average Sales", ylabel="items sold")
ax = y_pred.plot(ax=ax, linewidth=3, label="Trend", color='C0')
ax = y_fore.plot(ax=ax, linewidth=3, label="Trend Forecast", color='C3')
ax.legend();
Fit Trend with Splines
Multivariate Adaptive Regression Splines (MARS) > Splines are a nice alternative to polynomials when you want to fit a trend. The Multivariate Adaptive Regression Splines (MARS) algorithm in the pyearth library is powerful and easy to use. There are a lot of hyperparameters you may want to investigate.
This uses the Earth() model from the pyearth package (the original earth is an R package by Stephen Milborrow; API here). You will need to install the scikit-learn contrib version:
pip install sklearn-contrib-py-earth
#from pyearth import Earth
try:
    y = average_sales.copy()
    dp = DeterministicProcess(index=y.index, order=1)
    X = dp.in_sample()

    # Fit a MARS model with `Earth`
    model = Earth()
    model.fit(X, y)

    y_pred = pd.Series(model.predict(X), index=X.index)

    ax = y.plot(  # **plot_params,
        title="Average Sales", ylabel="items sold")
    ax = y_pred.plot(ax=ax, linewidth=3, label="Trend")
except:
    print('This code will not execute until pyearth is installed')
This code will not execute until pyearth is installed
Seasonality
You already know that sine and cosine functions are used to model seasonality. These terms are called Fourier features.
Fourier Features
One pair of Fourier features is: \[ k = 2 \pi \frac{t}{f} \\ f(j) = \beta_1 \sin(j \cdot k) + \beta_2 \cos(j \cdot k) \]
\(n\) order(s) of fourier features: \[ F(n) = \sum_{j=1}^{n} f(j) \]
- \(k\) is time scaled by the frequency
- \(j\) is the order of the feature
- \(\beta_1\) and \(\beta_2\) are what you estimate with linear regression
The advantage of a Fourier pair is that its two parameters are on the same scale.
# create fourier features for linear regression to fit
def fourier_features(index, freq, order):
    time = np.arange(len(index), dtype=np.float32)
    k = 2 * np.pi * (1 / freq) * time
    features = {}
    for i in range(1, order + 1):
        features.update({
            f"sin_{freq}_{i}": np.sin(i * k),
            f"cos_{freq}_{i}": np.cos(i * k),
        })
    return pd.DataFrame(features, index=index)
# this transforms seasonality into something that is easy to fit with linear regression
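A quick sketch of calling the helper above. The frequency value is an assumption for illustration: 365.25 treats the daily index as having an annual cycle.

```python
# Four sine/cosine pairs for an annual cycle on the average_sales index
fourier_annual = fourier_features(average_sales.index, freq=365.25, order=4)
print(fourier_annual.columns.tolist())  # sin_365.25_1, cos_365.25_1, ..., cos_365.25_4
print(fourier_annual.head(3))
```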
Periodogram
Given a frequency \(\omega = \frac{j}{n}\), \(\beta_1\) and \(\beta_2\) are the coefficients of the cosine and sine terms.
A useful trigonometric identity is: \[ A \cos(2 \pi \omega t + \phi) = \beta_1 \cos(2 \pi \omega t) + \beta_2 \sin(2 \pi \omega t) \\ \beta_1 = A \cos(\phi) \\ \beta_2 = - A \sin(\phi) \\ A^2 = \beta_1^2 + \beta_2^2 \] The whole time series is represented as: \[ x_t = \sum_{j = 1}^{n/2} \left[ \beta_1\!\left(\tfrac{j}{n}\right) \cos(2 \pi \omega_j t) + \beta_2\!\left(\tfrac{j}{n}\right) \sin(2 \pi \omega_j t) \right] \] In the periodogram, given frequency \(\frac{j}{n}\): \[ P\!\left(\frac{j}{n}\right) = \beta_1^2\!\left(\frac{j}{n}\right) + \beta_2^2\!\left(\frac{j}{n}\right) \]
A relatively large value of P(j/n) indicates relatively more importance for the frequency j/n (or near j/n) in explaining the oscillation in the observed series. P(j/n) is proportional to the squared correlation between the observed series and a cosine wave with frequency j/n. The dominant frequencies might be used to fit cosine (or sine) waves to the data, or might be used simply to describe the important periodicities in the series.
source: PennState Eberly College of Science
- Estimating \(\beta_1\) and \(\beta_2\) accounts for two of the \(n\) parameters
- They are not necessarily estimated by regression; instead a mathematical device called the Fast Fourier Transform (FFT) is used
(Fast) Fourier Transformation
I think of it this way. Imagine you fit a sum of n Fourier pairs to the series. The signal is then composed of n possible Fourier pairs; the higher the order, the higher the frequency (and the shorter the wavelength). Some pairs will be more dominant than others. When a pair is not dominant, the sum of its squared \(\beta\)s may as well be 0, which cancels it out. So if you slice the equation by Fourier pairs, each pair represents the strength of that wave: 0 indicates very little effect, and higher values indicate a more dominant effect.
dig further: what is fourier transform
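A minimal sketch of the idea on synthetic data (not the sales series): build a signal from two sine waves, run np.fft.rfft, and the two dominant frequencies stand out while everything else stays near zero.

```python
import numpy as np

# Synthetic signal: one cycle every 7 steps plus a weaker one every 30 steps, plus noise
n = 365
t = np.arange(n)
signal = (np.sin(2 * np.pi * t / 7)
          + 0.5 * np.sin(2 * np.pi * t / 30)
          + np.random.default_rng(1).normal(scale=0.2, size=n))

# Power of each Fourier pair; large values mark dominant frequencies
spectrum = np.abs(np.fft.rfft(signal)) ** 2
freqs = np.fft.rfftfreq(n, d=1)  # cycles per time step

# The two largest peaks sit near 1/7 and 1/30
top = freqs[np.argsort(spectrum)[-2:]]
print(sorted(1 / top))  # roughly [7, 30] steps per cycle
```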
Custom Functions
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
def seasonal_plot(X, y, period, freq, ax=None):
    if ax is None:
        _, ax = plt.subplots()
    palette = sns.color_palette("husl", n_colors=X[period].nunique())
    ax = sns.lineplot(
        x=freq,
        y=y,
        hue=period,
        data=X,
        ci=False,
        ax=ax,
        palette=palette,
        legend=False,
    )
    ax.set_title(f"Seasonal Plot ({period}/{freq})")
    for line, name in zip(ax.lines, X[period].unique()):
        y_ = line.get_ydata()[-1]
        ax.annotate(
            name,
            xy=(1, y_),
            xytext=(6, 0),
            color=line.get_color(),
            xycoords=ax.get_yaxis_transform(),
            textcoords="offset points",
            size=14,
            va="center",
        )
    return ax
def plot_periodogram(ts, detrend='linear', ax=None):
    from scipy.signal import periodogram
    fs = pd.Timedelta("1Y") / pd.Timedelta("1D")
    frequencies, spectrum = periodogram(  # this does not plot; it returns two vectors
        ts,
        fs=fs,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )
    if ax is None:
        _, ax = plt.subplots()
    ax.step(frequencies, spectrum, color="purple")
    ax.set_xscale("log")
    ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
    ax.set_xticklabels(
        [
            "Annual (1)",
            "Semiannual (2)",
            "Quarterly (4)",
            "Bimonthly (6)",
            "Monthly (12)",
            "Biweekly (26)",
            "Weekly (52)",
            "Semiweekly (104)",
        ],
        rotation=30,
    )
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance")
    ax.set_title("Periodogram")
    return ax
Example
average_sales_2017 = average_sales.squeeze().loc['2017']
X = average_sales_2017.to_frame()
X['week'] = X.index.week
X['day'] = X.index.dayofweek

seasonal_plot(X, 'sales',
              period='week',
              freq='day')
plot_periodogram(average_sales_2017)
NOTE: This periodogram happens to have four spikes, and the four spikes happen to sit at consecutive frequencies.
Note that the x-axis is on a completely different scale from the one introduced in PennState's textbook (where frequency is represented as a fraction). Here frequency is the inverse of the whole period, with low frequencies on the left and high frequencies on the right.
This graph tells you there is strong weekly seasonality. Because you want to reduce the chance of overfitting, you want to keep the number of Fourier pairs small. You can model this in either of two ways:
- 12 monthly frequencies (30/365) with 4 Fourier pairs
- 1 annual frequency (364/365) with 26 Fourier pairs (there will be a lot of frequencies here)
Set the frequency to the period you want the features to repeat over. As the order increases, the features capture harmonics of that base frequency.
So in the first option the base Fourier feature repeats monthly, hence the frequency is set to a month. There are about four additional waves (each roughly twice as frequent as the previous one), hence we set four Fourier pairs.
Compute Fourier Features (statsmodels)
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
= CalendarFourier(freq="M", order=4) # here parameter derived from periodogram
fourier = DeterministicProcess(
dp =average_sales_2017.index,
index=True, # dummy feature for bias (y-intercept)
constant=1, # trend (order 1 means linear)
order=True, # weekly seasonality (indicators)
seasonal=[fourier], # annual seasonality (fourier)
additional_terms=True, # drop terms to avoid collinearity
drop
)= dp.in_sample()
X # note fourier needs to be made as a list for it to be literatable
= average_sales_2017
y = LinearRegression(fit_intercept=False)
model = model.fit(X, y)
_ = pd.Series(model.predict(X), index=y.index)
y_pred = dp.out_of_sample(steps=90)
X_fore = pd.Series(model.predict(X_fore), index=X_fore.index)
y_fore = y.plot(color='0.25', style='.-', alpha = 0.25,title="Store Average Sales")
ax = y_pred.plot(ax=ax, label="Seasonal")
ax = y_fore.plot(ax=ax, label="Seasonal Forecast", color='C3')
ax = ax.legend(); _
#X.to_csv('data/output/Calenrier_output.csv')
Detrend or deseasonalising
Verify that we are not modeling random variance
y_deseason = y - y_pred

fig, (ax1, ax2, ax3) = plt.subplots(3, 1, sharex=True, sharey=True, figsize=(10, 7))
ax1 = plot_periodogram(y, ax=ax1)
ax1.set_title("Product Sales Frequency Components")
ax2 = plot_periodogram(y_deseason, ax=ax2)
ax2.set_title("Deseasonalized")
ax3 = plot_periodogram(y, ax=ax3)
#ax.axes.set_facecolor('blue')
ax3 = plot_periodogram(y_deseason, ax=ax3)
ax3.axes.lines[0].set_color('blue')
ax3.set_title("Product Sales Frequency Components");  # take this out you would be creating a new plot
This plot shows that our model has done very well at explaining the seasonal variance.
Holiday (Special Events)
You can fit special events by creating dummy variables (here it is convenient because we are only using one year to train).
holidays_events = pd.read_csv(
    'data/store-sales-time-series-forecasting/holidays_events.csv',
    index_col='date',
    dtype={
        'type': 'category',
        'locale': 'category',
        'locale_name': 'category',
        'description': 'category',
        'transferred': 'bool',
    },
    parse_dates=['date'],
    infer_datetime_format=True
).to_period('D')
#type(holidays_events.index)
holidays = (
    holidays_events
    .query("locale in ['National', 'Regional']")
    .loc['2017':'2017-08-15', ['description']]
    .assign(description=lambda x: x.description.cat.remove_unused_categories())
)
display(holidays)
description | |
---|---|
date | |
2017-01-01 | Primer dia del ano |
2017-01-02 | Traslado Primer dia del ano |
2017-02-27 | Carnaval |
2017-02-28 | Carnaval |
2017-04-01 | Provincializacion de Cotopaxi |
2017-04-14 | Viernes Santo |
2017-05-01 | Dia del Trabajo |
2017-05-13 | Dia de la Madre-1 |
2017-05-14 | Dia de la Madre |
2017-05-24 | Batalla de Pichincha |
2017-05-26 | Traslado Batalla de Pichincha |
2017-06-25 | Provincializacion de Imbabura |
2017-08-10 | Primer Grito de Independencia |
2017-08-11 | Traslado Primer Grito de Independencia |
ax = y_deseason.plot(**plot_params)
plt.plot_date(holidays.index, y_deseason[holidays.index], color='C3')
ax.set_title('National and Regional Holidays');
X_holidays = pd.get_dummies(
    holidays,
    columns=['description']
)
X2 = X.join(X_holidays, on='date').fillna(0.0)
model = LinearRegression().fit(X2, y)
y_pred = pd.Series(
    model.predict(X2),
    index=X2.index,
    name='Fitted',
)
X_holidays.sample(3)
description_Batalla de Pichincha | description_Carnaval | description_Dia de la Madre | description_Dia de la Madre-1 | description_Dia del Trabajo | description_Primer Grito de Independencia | description_Primer dia del ano | description_Provincializacion de Cotopaxi | description_Provincializacion de Imbabura | description_Traslado Batalla de Pichincha | description_Traslado Primer Grito de Independencia | description_Traslado Primer dia del ano | description_Viernes Santo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
date | |||||||||||||
2017-01-01 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
2017-08-10 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2017-08-11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
ax = y.plot(**plot_params, alpha=0.5, title="Average Sales", ylabel="items sold")
ax = y_pred.plot(ax=ax, label="Seasonal")
ax.legend();
ax = y.plot(color='0.25', style='.-', alpha=0.25, title="Store Average Sales")
ax = y_pred.plot(ax=ax, label="Seasonal")
ax = y_fore.plot(ax=ax, label="Seasonal Forecast", color='C3')
ax = plt.plot_date(holidays.index, y[holidays.index], color='C3');
Time Series as Features
Partial Autocorrelation
from statsmodels.graphics.tsaplots import plot_pacf
plot_pacf(average_sales_2017);
/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/statsmodels/graphics/tsaplots.py:348: FutureWarning: The default method 'yw' can produce PACF values outside of the [-1,1] interval. After 0.13, the default will change tounadjusted Yule-Walker ('ywm'). You can use this method now by setting method='ywm'.
warnings.warn(
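A sketch of reading the PACF numerically instead of eyeballing the plot. The ±1.96/√n cutoff is the usual rough 95% band and is an assumption here, not something from the original notebook; pacf is the statsmodels function behind plot_pacf.

```python
import numpy as np
from statsmodels.tsa.stattools import pacf

# Partial autocorrelations up to 14 lags, using the 'ywm' method the warning above suggests
values = pacf(average_sales_2017, nlags=14, method='ywm')

# Lags whose partial autocorrelation exceeds the rough 95% significance band (an assumption)
cutoff = 1.96 / np.sqrt(len(average_sales_2017))
significant_lags = [lag for lag, v in enumerate(values) if lag > 0 and abs(v) > cutoff]
print(significant_lags)
```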
Cycle
Hybrid Models
Essentially all above combined:
series = trend + seasons + cycles + error
Regression algorithms can work in two ways:
- Transform the target:
  - for example, a decision tree
  - it groups target values during training and predicts by averaging the values within a group
- Transform the features:
  - for example, a polynomial function, i.e. a mathematical function of the inputs
  - features are taken as input, combined, and transformed
Feature transformers can extrapolate the target value beyond the boundary of the training set (think of interpolating a point between two discrete values of a function, then continuing the function past the data). The same cannot be said for decision trees; random forests and gradient-boosted decision trees belong to the target-transforming group.
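A minimal sketch of that difference on toy data: the linear model keeps following the trend outside the training range, while a decision tree flat-lines at the last value it saw.

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

# Toy upward trend observed on t = 0..49, then predict t = 50..59
t_train = np.arange(50).reshape(-1, 1)
y_toy = 3.0 * t_train.ravel() + 10
t_future = np.arange(50, 60).reshape(-1, 1)

linear = LinearRegression().fit(t_train, y_toy)
tree = DecisionTreeRegressor().fit(t_train, y_toy)

print(linear.predict(t_future))  # keeps climbing past the training range
print(tree.predict(t_future))    # stuck at the last training value (157.0)
```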
# Kaggle recommends: 1. use linear regression to extrapolate the trend, 2. transform the target to remove the trend, 3. apply XGBoost to the detrended residuals
Linear regression + XGBoost
from pathlib import Path
from warnings import simplefilter
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from xgboost import XGBRegressor
"ignore")
simplefilter(
# Set Matplotlib defaults
"seaborn-whitegrid")
plt.style.use(
plt.rc("figure",
=True,
autolayout=(11, 4),
figsize=18,
titlesize='bold',
titleweight
)
plt.rc("axes",
="bold",
labelweight="large",
labelsize="bold",
titleweight=16,
titlesize=10,
titlepad
)= dict(
plot_params ="0.75",
color=".-",
style="0.25",
markeredgecolor="0.25",
markerfacecolor
)
= Path("data/ts-course/")
data_dir = ["BuildingMaterials", "FoodAndBeverage"]
industries = pd.read_csv(
retail / "us-retail-sales.csv",
data_dir =['Month'] + industries,
usecols=['Month'],
parse_dates='Month',
index_col'D').reindex(columns=industries)
).to_period(= pd.concat({'Sales': retail}, names=[None, 'Industries'], axis=1) # this is a hayer of hierarachical index
retail
retail.head()
Sales | ||
---|---|---|
Industries | BuildingMaterials | FoodAndBeverage |
Month | ||
1992-01-01 | 8964 | 29589 |
1992-02-01 | 9023 | 28570 |
1992-03-01 | 10608 | 29682 |
1992-04-01 | 11630 | 30228 |
1992-05-01 | 12327 | 31677 |
Tip: now that retail has a hierarchical column index, you have to go through the top-layer column before you can access a bottom-layer column.
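A small sketch of what that means in practice for the retail frame built above:

```python
# Go through the top layer ('Sales') first, then the industry column
print(retail['Sales']['FoodAndBeverage'].head(3))

# Or select both layers at once with a tuple
print(retail[('Sales', 'BuildingMaterials')].head(3))

# retail['FoodAndBeverage'] alone raises a KeyError: the bottom layer
# is not reachable without going through 'Sales' first
```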
y = retail.copy()
dp = DeterministicProcess(
    index=y.index,
    constant=True,
    order=2,
    drop=True
)
X = dp.in_sample()  # features for training data

idx_train, idx_test = train_test_split(
    y.index, test_size=12 * 4, shuffle=False,
)
X_train, X_test = X.loc[idx_train, :], X.loc[idx_test, :]
y_train, y_test = y.loc[idx_train], y.loc[idx_test]

# Fit trend model
model = LinearRegression(fit_intercept=False)
model.fit(X_train, y_train)
# Make predictions
y_fit = pd.DataFrame(
    model.predict(X_train),
    index=y_train.index,      # index are row labels
    columns=y_train.columns,  # columns are column labels
)
y_pred = pd.DataFrame(
    model.predict(X_test),
    index=y_test.index,
    columns=y_test.columns,
)
# Plot
axs = y_train.plot(color='0.25', subplots=True, sharex=True)
axs = y_test.plot(color='0.25', subplots=True, sharex=True, ax=axs)
axs = y_fit.plot(color='C0', subplots=True, sharex=True, ax=axs)
axs = y_pred.plot(color='C3', subplots=True, sharex=True, ax=axs)
for ax in axs: ax.legend([])
_ = plt.suptitle("Trends")
Data Transformation before adding XGBoost
While the linear regression algorithm is capable of multi-output regression, the XGBoost algorithm is not. To predict multiple series at once with XGBoost, we’ll instead convert these series from wide format, with one time series per column, to long format, with series indexed by categories along rows.
X = retail.stack()  # pivot dataset wide to long
display(X.head())
y = X.pop('Sales')
Sales | ||
---|---|---|
Month | Industries | |
1992-01-01 | BuildingMaterials | 8964 |
FoodAndBeverage | 29589 | |
1992-02-01 | BuildingMaterials | 9023 |
FoodAndBeverage | 28570 | |
1992-03-01 | BuildingMaterials | 10608 |
print("stack transform default retaill index \n from {var1} \n to {var2} \
( y index)".format(
=type(retail.head().index),
var1 =type(y.head().index)
var2 ))
stack transforms the default retail index
from <class 'pandas.core.indexes.period.PeriodIndex'>
to <class 'pandas.core.indexes.multi.MultiIndex'> ( y index)
# Turn row labels into categorical feature columns with a label encoding
X = X.reset_index('Industries')
print("X.index is now {var1}".format(var1=type(X.index)))
# Label encoding for 'Industries' feature
for colname in X.select_dtypes(["object", "category"]):
    X[colname], _ = X[colname].factorize()

# Label encoding for annual seasonality
X["Month"] = X.index.month  # values are 1, 2, ..., 12

# Create splits
X_train, X_test = X.loc[idx_train, :], X.loc[idx_test, :]
y_train, y_test = y.loc[idx_train], y.loc[idx_test]
X.index is now <class 'pandas.core.indexes.period.PeriodIndex'>
# Pivot wide to long (stack) and convert DataFrame to Series (squeeze)
y_fit = y_fit.stack().squeeze()    # trend from training set
y_pred = y_pred.stack().squeeze()  # trend from test set

print("y is now {y_index}".format(
    y_index=type(y_fit.index)
))

# Create residuals (the collection of detrended series) from the training set
y_resid = y_train - y_fit

# Train XGBoost on the residuals
xgb = XGBRegressor()
xgb.fit(X_train, y_resid)

# Add the predicted residuals onto the predicted trends
y_fit_boosted = xgb.predict(X_train) + y_fit
y_pred_boosted = xgb.predict(X_test) + y_pred
y is now <class 'pandas.core.indexes.multi.MultiIndex'>
axs = y_train.unstack(['Industries']).plot(
    color='0.25', figsize=(11, 5), subplots=True, sharex=True,
    title=['BuildingMaterials', 'FoodAndBeverage'],
)
axs = y_test.unstack(['Industries']).plot(
    color='0.25', subplots=True, sharex=True, ax=axs,
)
axs = y_fit_boosted.unstack(['Industries']).plot(
    color='C0', subplots=True, sharex=True, ax=axs,
)
axs = y_pred_boosted.unstack(['Industries']).plot(
    color='C3', subplots=True, sharex=True, ax=axs,
)
for ax in axs: ax.legend([])
Machine Learning
from pathlib import Path
from warnings import simplefilter
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
def plot_multistep(y, every=1, ax=None, palette_kwargs=None):
    palette_kwargs_ = dict(palette='husl', n_colors=16, desat=None)
    if palette_kwargs is not None:
        palette_kwargs_.update(palette_kwargs)
    palette = sns.color_palette(**palette_kwargs_)
    if ax is None:
        fig, ax = plt.subplots()
    ax.set_prop_cycle(plt.cycler('color', palette))
    for date, preds in y[::every].iterrows():
        preds.index = pd.period_range(start=date, periods=len(preds))
        preds.plot(ax=ax)
    return ax
= pd.read_csv("data/ts-course/flu-trends.csv")
flu_trends
flu_trends.set_index(= 'W'),
pd.PeriodIndex(flu_trends.Week, freq = True
inplace
)"Week", axis=1, inplace=True) flu_trends.drop(
def make_lags(ts, lags, lead_time=1):
    return pd.concat(
        {f'y_lag_{i}': ts.shift(i)
         for i in range(lead_time, lags + lead_time)},
        axis=1)

y = flu_trends.FluVisits.copy()  # .copy() so later changes to flu_trends do not leak into y
X = make_lags(y, lags=4).fillna(0.0)
def make_multistep_target(ts, steps):
    return pd.concat(
        {f'y_step_{i + 1}': ts.shift(-i)
         for i in range(steps)},
        axis=1)

y = make_multistep_target(y, steps=8).dropna()
y, X = y.align(X, join="inner", axis=0)  # align makes the indexes of X and y match
to read more about align here
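A tiny sketch of what align(join='inner', axis=0) does, using toy series whose indexes only partially overlap:

```python
import pandas as pd

a = pd.Series([1, 2, 3], index=[1, 2, 3])
b = pd.Series([10, 20, 30], index=[2, 3, 4])

# Keep only the index labels both objects share; both come back reindexed the same way
a_aligned, b_aligned = a.align(b, join='inner', axis=0)
print(a_aligned.index.tolist())  # [2, 3]
print(b_aligned.index.tolist())  # [2, 3]
```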
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

model = LinearRegression()
model.fit(X_train, y_train)

y_fit = pd.DataFrame(model.predict(X_train), index=X_train.index, columns=y.columns)
y_pred = pd.DataFrame(model.predict(X_test), index=X_test.index, columns=y.columns)
train_rmse = mean_squared_error(y_train, y_fit, squared=False)
test_rmse = mean_squared_error(y_test, y_pred, squared=False)
print((f"Train RMSE: {train_rmse:.2f}\n" f"Test RMSE: {test_rmse:.2f}"))
palette = dict(palette='husl', n_colors=64)
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(11, 6))
ax1 = flu_trends.FluVisits[y_fit.index].plot(**plot_params, ax=ax1)
ax1 = plot_multistep(y_fit, ax=ax1, palette_kwargs=palette)
_ = ax1.legend(['FluVisits (train)', 'Forecast'])
ax2 = flu_trends.FluVisits[y_pred.index].plot(**plot_params, ax=ax2)
ax2 = plot_multistep(y_pred, ax=ax2, palette_kwargs=palette)
_ = ax2.legend(['FluVisits (test)', 'Forecast'])
Train RMSE: 389.12
Test RMSE: 582.33
Direct Strategy - Use an XGBoost regressor
Here is a shortcut for producing a multi-output regression model. > XGBoost can't produce multiple outputs for regression tasks. But by applying the Direct reduction strategy, we can still use it to produce multi-step forecasts. This is as easy as wrapping it with scikit-learn's MultiOutputRegressor.
from sklearn.multioutput import MultiOutputRegressor
model = MultiOutputRegressor(XGBRegressor())  # cheat code
model.fit(X_train, y_train)

y_fit = pd.DataFrame(model.predict(X_train), index=X_train.index, columns=y.columns)
y_pred = pd.DataFrame(model.predict(X_test), index=X_test.index, columns=y.columns)
#X_train.sample(3)
y_train.sample(3)
y_step_1 | y_step_2 | y_step_3 | y_step_4 | y_step_5 | y_step_6 | y_step_7 | y_step_8 | |
---|---|---|---|---|---|---|---|---|
Week | ||||||||
2013-07-08/2013-07-14 | 11 | 13.0 | 14.0 | 10.0 | 13.0 | 13.0 | 13.0 | 22.0 |
2014-05-19/2014-05-25 | 80 | 61.0 | 47.0 | 33.0 | 27.0 | 19.0 | 22.0 | 19.0 |
2012-12-24/2012-12-30 | 2961 | 2303.0 | 2291.0 | 2258.0 | 2012.0 | 1510.0 | 1134.0 | 930.0 |
train_rmse = mean_squared_error(y_train, y_fit, squared=False)
test_rmse = mean_squared_error(y_test, y_pred, squared=False)
print((f"Train RMSE: {train_rmse:.2f}\n" f"Test RMSE: {test_rmse:.2f}"))
palette = dict(palette='husl', n_colors=64)
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(11, 6))
ax1 = flu_trends.FluVisits[y_fit.index].plot(**plot_params, ax=ax1)
ax1 = plot_multistep(y_fit, ax=ax1, palette_kwargs=palette)
_ = ax1.legend(['FluVisits (train)', 'Forecast'])
ax2 = flu_trends.FluVisits[y_pred.index].plot(**plot_params, ax=ax2)
ax2 = plot_multistep(y_pred, ax=ax2, palette_kwargs=palette)
_ = ax2.legend(['FluVisits (test)', 'Forecast'])
Train RMSE: 1.22
Test RMSE: 526.45
RegressorChain() versus MultiOutputRegressor
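MultiOutputRegressor fits one independent XGBoost model per forecast step; RegressorChain instead fits them in sequence, feeding each step's prediction to the next so later steps can depend on earlier ones. A minimal sketch of the chained version, reusing the training data from above:

```python
from sklearn.multioutput import RegressorChain

# Each y_step_i model also receives the predictions of the previous steps as extra features,
# following the column order of y_train (step 1 first, then step 2, ...)
chain = RegressorChain(XGBRegressor())
chain.fit(X_train, y_train)

y_pred_chain = pd.DataFrame(chain.predict(X_test), index=X_test.index, columns=y.columns)
```

The trade-off is that chained predictions can compound errors from earlier steps, while the independent models of MultiOutputRegressor cannot share information between steps.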