Consumer Price Index Forecasting
Overview
This notebook presents a comprehensive pipeline for forecasting the Consumer Price Index (CPI), a key measure of inflation, over multiple time horizons. It generates predictions for Year-on-Year CPI at 12, 9, 6, and 3-month intervals, with corresponding visualizations to illustrate projected trends.
Clients can use these predictions for analytical insight into inflation trends, or substitute any other inflation metric of interest (e.g. CPIH, RPI) for CPI and run the entire workflow simply by changing the series ID to the one they need.
Setup
Dependencies
# Standard library imports
import os
import math
from typing import Final
# Third-party imports
import numpy as np
import openpyxl
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# EvoML and API imports
import evoml_client as ec
from evoml_client.trial_conf_models import BudgetMode, SplitMethodOptions
from dotenv import load_dotenv
# Configuration: plotting defaults applied once for the whole notebook.
# Select the 'seaborn-v0_8' style sheet for matplotlib figures.
plt.style.use('seaborn-v0_8')
# Make "plotly_white" the default template for Plotly figures created below.
pio.templates.default = "plotly_white"
Configuration and Authentication
# Load environment variables (e.g. from a local .env file) so that credentials
# never have to be written into the notebook itself.
load_dotenv()

# EvoML Configuration
API_URL: Final[str] = "https://evoml.ai"
# Credentials are read from the EVOML_USERNAME / EVOML_PASSWORD environment variables.
EVOML_USERNAME: Final[str] = os.getenv("EVOML_USERNAME", "").strip()
EVOML_PASSWORD: Final[str] = os.getenv("EVOML_PASSWORD", "")

# Fail early with an actionable message instead of sending blank credentials.
if not EVOML_USERNAME or not EVOML_PASSWORD.strip():
    raise RuntimeError(
        "EvoML credentials are missing: set EVOML_USERNAME and EVOML_PASSWORD "
        "in the environment or in a .env file before running this notebook."
    )

# Connect to EvoML platform
ec.init(base_url=API_URL, username=EVOML_USERNAME, password=EVOML_PASSWORD)
True
Data Acquisition and Preprocessing
We first retrieve the downloaded data and select the relevant sheet name (Table 57, in this case) to extract the CPI summary of all items for the time period between 1988 and 2025.
# Reading the data
# Open the workbook once and parse the sheet from that handle; the context
# manager closes the underlying file when the block exits.
with pd.ExcelFile("consumer-price-inflation-ONS.xlsx", engine="openpyxl") as xls:
    CPI_UK = pd.read_excel(xls, sheet_name="Table 57", skiprows=6)

# Dropping rows
# The last 14 rows of the sheet are not relevant to the analysis.
CPI_UK = CPI_UK.drop(CPI_UK.tail(14).index)

# Converting the time column to datetime, then to 'YYYY-MM' strings
# (this removes the 00:00:00 timestamp from the date).
CPI_UK['name'] = pd.to_datetime(CPI_UK['name']).dt.strftime('%Y-%m')
CPI_UK = CPI_UK.rename(columns={"name": "Date_CPI"})

# Creating the df for exploratory analysis
# Take an explicit copy so that columns can be added to cpi_df later without
# pandas treating it as a slice of CPI_UK (SettingWithCopyWarning).
cpi_df = CPI_UK[['Date_CPI', 'CPI ALL ITEMS']].copy()
Computing Inflation Metrics
In order to retrieve a valuable estimate of inflation, we:
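- Compute the 12-month rolling inflation rate from the Consumer Price Index, which can be expressed with the following equation:
$$\text{CPI\_Annual\_Change}_t = \frac{\text{CPI}_t - \text{CPI}_{t-12}}{\text{CPI}_{t-12} + \epsilon} \times 100$$
To avoid division by zero, we add a small offset (the epsilon value, $\epsilon = 10^{-10}$) to the denominator.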
- In order to ensure stationarity, we apply seasonal differencing to further detrend the data by removing annual seasonality.
We visualize both results and prepare the final dataframe for analysis.
# Small epsilon to avoid division by zero
epsilon = 1e-10

# Work on an independent copy so the column assignments below never target a
# slice of another dataframe (avoids SettingWithCopyWarning).
cpi_df = cpi_df.copy()

# 1. Compute 12-month rolling inflation rate:
#    (CPI_t - CPI_{t-12}) / (CPI_{t-12} + epsilon) * 100
cpi_year_ago = cpi_df['CPI ALL ITEMS'].shift(12)
cpi_df['CPI_Annual_Change'] = (
    (cpi_df['CPI ALL ITEMS'] - cpi_year_ago) / (cpi_year_ago + epsilon) * 100
)

# The 12-step shift leaves NaN in the first 12 months (Jan-Dec 1988); drop them.
cpi_df = cpi_df.dropna().reset_index(drop=True)

# 2. Apply seasonal differencing for stationarity.
# The 12-step difference creates another 12 months of NaN values (Jan-Dec 1989).
cpi_df['Delta_CPI_Annual_Change'] = cpi_df['CPI_Annual_Change'].diff(12)

# Remove the additional NaN rows introduced by the differencing.
cpi_df = cpi_df.dropna().reset_index(drop=True)

print(cpi_df.head())
Date_CPI | CPI ALL ITEMS | CPI_Annual_Change | Delta_CPI_Annual_Change |
---|---|---|---|
1990-01 | 53.637 | 5.657441 | 0.760241 |
1990-02 | 53.954 | 5.877274 | 0.917541 |
1990-03 | 54.217 | 5.979514 | 0.968943 |
1990-04 | 55.211 | 6.439051 | 1.181340 |
1990-05 | 55.735 | 6.837525 | 1.509333 |
Dataset Preparation
# Modeling table: keep only the date column and the seasonally differenced target.
modeling_df = cpi_df.loc[:, ['Date_CPI', 'Delta_CPI_Annual_Change']].copy()
print(modeling_df.head())
Date_CPI | Delta_CPI_Annual_Change |
---|---|
1990-01 | 0.760241 |
1990-02 | 0.917541 |
1990-03 | 0.968943 |
1990-04 | 1.181340 |
1990-05 | 1.509333 |
Exploratory Data Analysis
Here, we visualise the results of our target variable manipulation - we compare the Year-on-year inflation rate with its seasonally differenced counterpart.
# EDA figure: the annual CPI change next to its seasonally differenced counterpart.
fig = go.Figure()

# (column in cpi_df, legend label, line colour) for each plotted series
eda_series = [
    ('CPI_Annual_Change', 'Annual CPI Change (%)', 'blue'),
    ('Delta_CPI_Annual_Change', 'Seasonally Differenced CPI Change', 'red'),
]
for column, label, colour in eda_series:
    fig.add_trace(
        go.Scatter(
            x=cpi_df['Date_CPI'],
            y=cpi_df[column],
            mode='lines+markers',
            name=label,
            line={'color': colour, 'width': 2},
            marker={'size': 4},
        )
    )

fig.update_layout(
    title='CPI Inflation Analysis: Raw vs Seasonally Adjusted',
    xaxis_title='Date',
    yaxis_title='Inflation Rate (%)',
    hovermode='x unified',
    height=400,
    showlegend=True,
)
fig.show()
Model development
Upload Dataset to EvoML
# Create the EvoML dataset from the modeling dataframe, upload it, and wait on
# it before its id is used for the trials.
dataset = ec.Dataset.from_pandas(modeling_df, name="CPI_Inflation_Forecasting")
dataset.put()
dataset.wait()

dataset_url = f"{API_URL}/platform/datasets/view/{dataset.dataset_id}"
print(f"Dataset URL: {dataset_url}")
Dataset URL: https://evoml.ai/platform/datasets/view/686f7b341a120cf0eaef90f9
Trial Configuration Function
We define a reusable trial configuration function that lets us easily execute trials for each of our desired horizons. We have chosen regularised regression models with the aim of achieving better generalisation to the test set. At the end of each trial execution, we fetch the best model and its loss value.
def create_inflation_trial(
    dataset_id,
    horizon,
    trial_name,
    *,
    models=None,
    window_size=6,
    train_percentage=0.8,
    target_col="Delta_CPI_Annual_Change",
):
    """Create and configure a time series trial for inflation forecasting.

    Args:
        dataset_id: Identifier of the uploaded EvoML dataset to train on.
        horizon: Forecast horizon, written to ``timeSeriesHorizon``; must be >= 1.
        trial_name: Name given to the created trial.
        models: Model names to include in the trial. Defaults to the ridge,
            lasso and elastic-net regressors.
        window_size: Value written to ``timeSeriesWindowSize`` (default 6).
        train_percentage: Training fraction of the percentage split; must lie
            strictly between 0 and 1 (default 0.8).
        target_col: Name of the target column (default
            ``"Delta_CPI_Annual_Change"``).

    Returns:
        The trial object returned by ``ec.Trial.from_dataset_id``.

    Raises:
        ValueError: If ``horizon`` is smaller than 1 or ``train_percentage``
            is not strictly between 0 and 1.
    """
    # Validate before any call to the platform so bad input fails fast.
    if horizon < 1:
        raise ValueError(f"horizon must be a positive number of periods, got {horizon!r}")
    if not 0 < train_percentage < 1:
        raise ValueError(
            f"train_percentage must be strictly between 0 and 1, got {train_percentage!r}"
        )
    if models is None:
        models = ["ridge_regressor", "lasso_regressor", "elastic_net_regressor"]

    config = ec.TrialConfig.with_models(
        models=list(models),
        task=ec.MlTask.regression,
        budget_mode=BudgetMode.fast,
        loss_funcs=["Root Mean Squared Error"],
        dataset_id=dataset_id,
        is_timeseries=True,
    )

    # Time series specific configuration
    config.options.timeSeriesWindowSize = window_size
    config.options.timeSeriesHorizon = horizon
    config.options.splittingMethodOptions = SplitMethodOptions(
        method="percentage",
        trainPercentage=train_percentage,
    )
    config.options.enableBudgetTuning = False

    # from_dataset_id returns a pair; only the trial itself is needed here.
    trial, _ = ec.Trial.from_dataset_id(
        dataset_id,
        target_col=target_col,
        trial_name=trial_name,
        config=config,
    )
    return trial


# Initialize results storage (keyed by forecast horizon in months)
trial_results = {}
Model Training Execution
12-Month Horizon Trial
# Train the 12-month-horizon trial and keep its best model.
trial_12 = create_inflation_trial(
    dataset.dataset_id,
    horizon=12,
    trial_name="Inflation_Forecast_12M",
)
trial_12.run(timeout=900)
model_12 = trial_12.get_best()
model_12.build_model()

# Store results
metrics_12 = trial_12.get_metrics_dataframe()
model_dict_12 = model_12.model_rep.__dict__
mse_12 = (
    model_dict_12.get('metrics', {})
    .get('regression-mse', {})
    .get('test', {})
    .get('average')
)
# A missing metric is recorded as NaN rather than as a fake RMSE of 0: a zero
# would later look like a perfect model (zero-width confidence interval), and
# a None MSE cannot be formatted with ':.4f' in the results summary.
if mse_12 is None:
    mse_12 = math.nan
rmse_12 = math.sqrt(mse_12)

trial_results[12] = {
    'trial': trial_12,
    'model': model_12,
    'model_name': model_dict_12.get('name'),
    'mse_test': mse_12,
    'rmse_test': rmse_12,
}
print(f"Best model: {trial_results[12]['model_name']}")
print(f"RMSE: {trial_results[12]['rmse_test']:.4f}")
9-Month Horizon Trial
# Train the 9-month-horizon trial and keep its best model.
trial_9 = create_inflation_trial(
    dataset.dataset_id,
    horizon=9,
    trial_name="Inflation_Forecast_9M",
)
trial_9.run(timeout=900)
model_9 = trial_9.get_best()
model_9.build_model()

# Store results
metrics_9 = trial_9.get_metrics_dataframe()
model_dict_9 = model_9.model_rep.__dict__
mse_9 = (
    model_dict_9.get('metrics', {})
    .get('regression-mse', {})
    .get('test', {})
    .get('average')
)
# A missing metric is recorded as NaN rather than as a fake RMSE of 0: a zero
# would later look like a perfect model (zero-width confidence interval), and
# a None MSE cannot be formatted with ':.4f' in the results summary.
if mse_9 is None:
    mse_9 = math.nan
rmse_9 = math.sqrt(mse_9)

trial_results[9] = {
    'trial': trial_9,
    'model': model_9,
    'model_name': model_dict_9.get('name'),
    'mse_test': mse_9,
    'rmse_test': rmse_9,
}
print(f"Best model: {trial_results[9]['model_name']}")
print(f"RMSE: {trial_results[9]['rmse_test']:.4f}")
6-Month Horizon Trial
# Execute 6-month horizon trial and keep its best model.
trial_6 = create_inflation_trial(
    dataset.dataset_id,
    horizon=6,
    trial_name="Inflation_Forecast_6M",
)
trial_6.run(timeout=900)
model_6 = trial_6.get_best()
model_6.build_model()

# Store results
metrics_6 = trial_6.get_metrics_dataframe()
model_dict_6 = model_6.model_rep.__dict__
mse_6 = (
    model_dict_6.get('metrics', {})
    .get('regression-mse', {})
    .get('test', {})
    .get('average')
)
# A missing metric is recorded as NaN rather than as a fake RMSE of 0: a zero
# would later look like a perfect model (zero-width confidence interval), and
# a None MSE cannot be formatted with ':.4f' in the results summary.
if mse_6 is None:
    mse_6 = math.nan
rmse_6 = math.sqrt(mse_6)

trial_results[6] = {
    'trial': trial_6,
    'model': model_6,
    'model_name': model_dict_6.get('name'),
    'mse_test': mse_6,
    'rmse_test': rmse_6,
}
print(f"Best model: {trial_results[6]['model_name']}")
print(f"RMSE: {trial_results[6]['rmse_test']:.4f}")
3-Month Horizon Trial
# Train the 3-month-horizon trial and keep its best model.
trial_3 = create_inflation_trial(
    dataset.dataset_id,
    horizon=3,
    trial_name="Inflation_Forecast_3M",
)
trial_3.run(timeout=900)
model_3 = trial_3.get_best()
model_3.build_model()

# Store results
metrics_3 = trial_3.get_metrics_dataframe()
model_dict_3 = model_3.model_rep.__dict__
mse_3 = (
    model_dict_3.get('metrics', {})
    .get('regression-mse', {})
    .get('test', {})
    .get('average')
)
# A missing metric is recorded as NaN rather than as a fake RMSE of 0: a zero
# would later look like a perfect model (zero-width confidence interval), and
# a None MSE cannot be formatted with ':.4f' in the results summary.
if mse_3 is None:
    mse_3 = math.nan
rmse_3 = math.sqrt(mse_3)

trial_results[3] = {
    'trial': trial_3,
    'model': model_3,
    'model_name': model_dict_3.get('name'),
    'mse_test': mse_3,
    'rmse_test': rmse_3,
}
print(f"Best model: {trial_results[3]['model_name']}")
print(f"RMSE: {trial_results[3]['rmse_test']:.4f}")
Results Summary
We store the results to later retrieve them for visualisations and further computation.
def _format_metric(value):
    """Format a metric to 4 decimals; a missing (None) metric is shown as 'nan'."""
    # ':.4f' raises TypeError on None, so map a missing metric to NaN first.
    return f"{math.nan if value is None else value:.4f}"


# One row per horizon, in ascending horizon order.
results_summary = [
    {
        'Horizon (Months)': horizon,
        'Best Model': trial_results[horizon]['model_name'],
        'Test RMSE': _format_metric(trial_results[horizon]['rmse_test']),
        'Test MSE': _format_metric(trial_results[horizon]['mse_test']),
    }
    for horizon in sorted(trial_results)
]
results_df = pd.DataFrame(results_summary)
print(results_df)
Horizon (Months) | Best Model | Test RMSE | Test MSE |
---|---|---|---|
3 | ridge_regressor-04a45 | 1.2623 | 1.5935 |
6 | ridge_regressor-04a45 | 1.5780 | 2.4901 |
9 | elastic_net_regressor-d3b5d | 1.9234 | 3.6996 |
12 | elastic_net_regressor-d3b5d | 1.9236 | 3.7001 |
Prediction Generation
Preparing test data for predictions
Here we generate predictions and back-transform them to their original year-on-year inflation scale, for interpretability. For this reason, here, we mirror EvoML's preprocessing split.
# Chronological 80/20 split of the modeling table (first 80% of rows = train).
split_idx = int(len(modeling_df) * 0.8)
train_data = modeling_df.iloc[:split_idx].copy()
test_data = modeling_df.iloc[split_idx:].copy()

# Same 80% cut-off applied to cpi_df, which still carries the original CPI
# columns; the tail is kept for visualization.
viz_split_idx = int(len(cpi_df) * 0.8)
viz_data = cpi_df.iloc[viz_split_idx:].copy()
Generating Predictions for Each Horizon
We fetch the best trained model for each horizon, we extend the dataset with the respective periods necessary for each trial and we generate predictions.
# Generate predictions for all horizons
predictions = {}

# The last observed month is the same for every horizon, so compute it once.
# 'Date_CPI' holds 'YYYY-MM' strings, which parse to the first day of the month.
last_date = pd.to_datetime(test_data['Date_CPI'].max())
first_future_month = last_date + pd.DateOffset(months=1)

for horizon in [3, 6, 9, 12]:
    # Get the trained model
    model = trial_results[horizon]['model']

    # Extend test data with `horizon` future months.
    # 'MS' (month start) replaces the 'M' alias, which is deprecated in
    # pandas >= 2.2; the resulting 'YYYY-MM' labels are the same.
    future_dates = pd.date_range(
        start=first_future_month,
        periods=horizon,
        freq='MS',
    )
    future_entries = pd.DataFrame({
        'Date_CPI': future_dates.strftime('%Y-%m'),
        # Placeholder target for the not-yet-observed months
        'Delta_CPI_Annual_Change': [0] * len(future_dates),
    })
    extended_test_data = pd.concat([test_data, future_entries], ignore_index=True)

    # Generate predictions
    raw_predictions = model.predict(data=extended_test_data)

    # Store results
    predictions[horizon] = {
        'extended_data': extended_test_data,
        'raw_predictions': raw_predictions,
    }
Back-Transforming Predictions to Original Scale
Here, we back-transform our dependent variable to its original scale: we take the last 12 year-on-year values before the start of the testing set and add the predicted 12-month differences back onto them, 12 months at a time. We do this to arrive at an interpretable inflation rate for our predicted-vs-actual comparison.
### Back-transforming Seasonally Differenced Predictions to Original Inflation Rates
# Seed values: the 12 annual-change observations immediately before the test split.
train_cpi_data = cpi_df.iloc[:viz_split_idx]
last_12_values = train_cpi_data.tail(12)['CPI_Annual_Change'].reset_index(drop=True)
if len(last_12_values) < 12:
    raise ValueError(
        "Need at least 12 training observations to reverse the 12-month differencing"
    )

for horizon in [3, 6, 9, 12]:
    raw_preds = predictions[horizon]['raw_predictions']

    # Convert raw predictions to a pandas Series and prepend the 12 seed values
    raw_preds_series = pd.Series(raw_preds)
    extended_series = pd.concat([last_12_values, raw_preds_series], ignore_index=True)

    # Reverse the seasonal differencing: level[t] = diff[t] + level[t - 12].
    # Each level builds on the (already back-transformed) level 12 steps earlier.
    levels = extended_series.tolist()
    for t in range(12, len(levels)):
        levels[t] = levels[t] + levels[t - 12]

    # Drop the 12 seed values so that position i of the stored series matches
    # row i of predictions[horizon]['extended_data']. Keeping them in front
    # shifts every prediction 12 months late when the two are paired by index.
    # NOTE(review): assumes model.predict returns one value per input row, in
    # row order -- confirm against the EvoML client.
    back_transformed = pd.Series(levels[12:], dtype=float)

    # Store back-transformed predictions
    predictions[horizon]['back_transformed'] = back_transformed
Visualizations
First, we visualise a predicted vs actual plot to represent our models' performance on the testing set.
# Assemble one long table: actual values first, then one frame per horizon,
# each tagged with a 'Type' label.
horizon_labels = {
    12: '12-Month Prediction',
    9: '9-Month Prediction',
    6: '6-Month Prediction',
    3: '3-Month Prediction',
}

plot_frames = [
    viz_data[['Date_CPI', 'CPI_Annual_Change']].rename(
        columns={'CPI_Annual_Change': 'Annual_Change'}
    )
]
for months in horizon_labels:
    plot_frames.append(
        pd.DataFrame({
            'Date_CPI': predictions[months]['extended_data']['Date_CPI'],
            'Annual_Change': predictions[months]['back_transformed'],
        })
    )

plot_data = (
    pd.concat(plot_frames, keys=['Actual', *horizon_labels.values()])
    .reset_index(level=0)
    .rename(columns={'level_0': 'Type'})
)

# Build the figure: one solid trace for actuals, semi-transparent traces for predictions.
fig = go.Figure()

actual_data = plot_data[plot_data['Type'] == 'Actual']
fig.add_trace(
    go.Scatter(
        x=actual_data['Date_CPI'],
        y=actual_data['Annual_Change'],
        mode='lines',
        name='Actual',
    )
)

for label in horizon_labels.values():
    label_rows = plot_data[plot_data['Type'] == label]
    fig.add_trace(
        go.Scatter(
            x=label_rows['Date_CPI'],
            y=label_rows['Annual_Change'],
            mode='lines',
            name=label,
            opacity=0.5,
        )
    )

# Dashed vertical markers at the two forecast-period boundaries
for boundary in ("2025-01-01", "2026-01-01"):
    fig.add_shape(
        type="line",
        x0=boundary, y0=0, x1=boundary, y1=1,
        xref='x', yref='paper', opacity=0.5,
        line={'color': "Black", 'width': 1, 'dash': "dash"},
    )

fig.update_layout(
    height=600,
    width=1200,
    title_text="Annual CPI Inflation Rate with Predictions (12-Month, 9-Month, 6-Month, and 3-Month)",
    xaxis_title="Date",
    yaxis_title="Annual CPI Change",
    legend_title="Data Type",
    plot_bgcolor='white',
)
fig.update_xaxes(tickangle=45, showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()
Summary Forecast with Confidence Intervals
The final plot constitutes an output summary of our trials, representing the last prediction point that each of our best selected models generates for its respective time window (12, 9, 6, and 3 months). Here, we calculate the confidence intervals using the loss value from each model's evaluation as a measure of prediction uncertainty, multiplied by a z-score of 1.96 (we assume residuals are normally distributed):
$$\text{CI}_{95\%} = \hat{y} \pm 1.96 \times \text{RMSE}$$
where $\hat{y}$ is the predicted value and RMSE is the loss function we chose.
# Create summary forecast plot with confidence intervals
fig = go.Figure()

# 'Date_CPI' holds 'YYYY-MM' strings, so the forecast origin must be compared
# in that format; comparing against 'YYYY-MM-DD' with == never matches a row.
forecast_origin = '2025-01-01'
origin_month = forecast_origin[:7]

# Add historical data up to Jan 2025
historical_data = viz_data[viz_data['Date_CPI'] <= origin_month]
fig.add_trace(go.Scatter(
    x=historical_data['Date_CPI'],
    y=historical_data['CPI_Annual_Change'],
    mode='lines',
    name='Historical Data',
    line=dict(color='blue')
))

# Extract key prediction points for summary (horizon in months -> target date)
target_dates = {
    3: '2025-04-01',
    6: '2025-07-01',
    9: '2025-10-01',
    12: '2026-01-01'
}

# Get last actual value: the observation at the forecast origin if present,
# otherwise the most recent observation available.
last_actual_df = viz_data[viz_data['Date_CPI'] == origin_month]
if not last_actual_df.empty:
    last_actual = last_actual_df['CPI_Annual_Change'].iloc[0]
else:
    last_actual = viz_data.sort_values('Date_CPI', ascending=False)['CPI_Annual_Change'].iloc[0]

# The origin point has no uncertainty: both bounds equal the actual value.
summary_points = [
    {
        'horizon': 0,
        'date': pd.to_datetime(forecast_origin),
        'prediction': last_actual,
        'ci_lower': last_actual,
        'ci_upper': last_actual
    }
]

# Use 1.96 for 95% confidence interval assuming normally distributed residuals
z_score = 1.96

# Calculate confidence intervals for each horizon using RMSE from model evaluation
for horizon, target_date in target_dates.items():
    extended_data = predictions[horizon]['extended_data']
    back_transformed = predictions[horizon]['back_transformed']

    # Use RMSE from model evaluation as confidence interval measure
    rmse = trial_results[horizon]['rmse_test']

    # Find the row whose 'YYYY-MM' label matches the target month
    date_matches = extended_data[extended_data['Date_CPI'].str.startswith(target_date[:7])]
    if date_matches.empty:
        # No row for this target month: leave the horizon out of the summary.
        continue

    idx = date_matches.index[0]
    prediction = back_transformed[idx]

    # Calculate confidence interval bounds using RMSE
    ci_lower = prediction - (z_score * rmse)
    ci_upper = prediction + (z_score * rmse)

    summary_points.append({
        'horizon': horizon,
        'date': pd.to_datetime(target_date),
        'prediction': prediction,
        'ci_lower': ci_lower,
        'ci_upper': ci_upper
    })

summary_df = pd.DataFrame(summary_points)

# Add upper bound trace (invisible line; acts as the top edge of the shaded band)
fig.add_trace(go.Scatter(
    x=summary_df['date'],
    y=summary_df['ci_upper'],
    mode='lines',
    line=dict(width=0),
    name='Upper Bound',
    showlegend=False,
    fillcolor='rgba(255, 0, 0, 0.2)'
))

# Add lower bound trace; fill='tonexty' shades the area up to the previous trace
fig.add_trace(go.Scatter(
    x=summary_df['date'],
    y=summary_df['ci_lower'],
    mode='lines',
    line=dict(width=0),
    fill='tonexty',
    fillcolor='rgba(255, 0, 0, 0.2)',
    name='95% Confidence Interval',
    showlegend=True
))

# Add prediction points with error bars
fig.add_trace(go.Scatter(
    x=summary_df['date'],
    y=summary_df['prediction'],
    mode='lines+markers+text',
    name='Forecasts',
    line=dict(color='red', dash='dash'),
    marker=dict(size=10, color='red'),
    text=summary_df['date'].dt.strftime('%b'),
    textposition='top center',
    error_y=dict(
        type='data',
        array=summary_df['ci_upper'] - summary_df['prediction'],
        arrayminus=summary_df['prediction'] - summary_df['ci_lower'],
        visible=True,
        color='grey',
        thickness=1.5,
        width=3
    )
))

# Add vertical lines for forecast periods
for boundary in ("2025-01-01", "2026-01-01"):
    fig.add_shape(
        type="line",
        x0=boundary, y0=0, x1=boundary, y1=1,
        xref='x', yref='paper', opacity=0.5,
        line=dict(color="Black", width=1, dash="dash")
    )

fig.update_layout(
    height=600,
    width=1200,
    title_text="Annual CPI Inflation Rate Summary with 95% Confidence Intervals",
    xaxis_title="Date",
    yaxis_title="Annual CPI Change",
    legend_title="Data Type",
    plot_bgcolor='white'
)
fig.update_xaxes(tickangle=45, showgrid=True)
fig.update_yaxes(showgrid=True)
fig.show()