EDA on Numerical Data#

%config InlineBackend.figure_format='retina'
import logging
from ekorpkit import eKonf

# Only warnings and above are emitted by the logging module.
logging.basicConfig(level=logging.WARNING)

# Report the ekorpkit version and the runtime flags it exposes, one per line.
runtime_report = (
    ("version:", eKonf.__version__),
    ("is notebook?", eKonf.is_notebook()),
    ("is colab?", eKonf.is_colab()),
)
for label, value in runtime_report:
    print(label, value)

# Print the environment settings returned by eKonf.env() as a dict.
print("evironment varialbles:")
env_settings = eKonf.env().dict()
eKonf.print(env_settings)
version: 0.1.33+20.g8433774.dirty
is notebook? True
is colab? False
evironment varialbles:
{'EKORPKIT_CONFIG_DIR': '/workspace/projects/ekorpkit-book/config',
 'EKORPKIT_DATA_DIR': None,
 'EKORPKIT_PROJECT': 'ekorpkit-book',
 'EKORPKIT_WORKSPACE_ROOT': '/workspace',
 'NUM_WORKERS': 230}
# Directory argument passed to eKonf.load_data / eKonf.save_data in this notebook.
data_dir = "../data/fomc"

Load preprocessed data#

# Load the preprocessed dataset from data_dir and preview its last rows.
econ_data = eKonf.load_data("econ_data2.parquet", data_dir)
econ_data.tail()
unscheduled forecast confcall speaker rate rate_change rate_decision rate_changed GDP GDP_diff_prev ... Rate Taylor Balanced Inertia Taylor-Rate Balanced-Rate Inertia-Rate Taylor_diff Balanced_diff Inertia_diff
date
2021-11-03 False False False Jerome Powell 0.25 0.00 0.0 0 19478.893 0.570948 ... 0.25 5.747177 4.940210 -0.528532 5.497177 4.690210 -0.778532 0.0 0.0 0.0
2021-12-15 False True False Jerome Powell 0.25 0.00 0.0 0 19478.893 0.570948 ... 0.25 6.472329 5.665362 -0.637304 6.222329 5.415362 -0.887304 0.0 0.0 0.0
2022-01-26 False False False Jerome Powell 0.25 0.00 0.0 0 19478.893 0.570948 ... 0.25 7.222928 6.415961 -0.749894 6.972928 6.165961 -0.999894 0.0 0.0 0.0
2022-03-16 False True False Jerome Powell 0.50 0.25 1.0 1 19806.290 1.680778 ... 0.25 8.499377 8.267766 -1.027665 8.249377 8.017766 -1.277665 0.0 0.0 0.0
2022-05-04 False False False Jerome Powell 1.00 0.50 1.0 1 19735.895 -0.355417 ... 0.50 8.094924 7.420939 -0.688141 7.594924 6.920939 -1.188141 0.0 0.0 0.0

5 rows × 58 columns

EDA on numerical data#

# Derive a three-way text label and a matching -1/0/1 code from the raw
# decision value, then add the previous/next row's code so that the inertia
# effect (dependence on the previous decision) can be examined.
raw_decision = econ_data["rate_decision"]
econ_data["Rate Decision"] = raw_decision.map(
    lambda v: "Hike" if v >= 1 else ("Cut" if v <= -1 else "Hold")
)
econ_data["rate_decision"] = raw_decision.map(
    lambda v: 1 if v >= 1 else (-1 if v <= -1 else 0)
)

# shift(1) takes the value from the row above and shift(-1) from the row
# below, so the first row has no previous and the last row no next decision.
decision_code = econ_data["rate_decision"]
econ_data["prev_decision"] = decision_code.shift(1)
econ_data["next_decision"] = decision_code.shift(-1)

preview_columns = ["Rate Decision", "rate_decision", "prev_decision", "next_decision"]
econ_data[preview_columns].head()
Rate Decision rate_decision prev_decision next_decision
date
1982-10-05 Cut -1 NaN -1.0
1982-11-16 Cut -1 -1.0 0.0
1982-12-21 Hold 0 -1.0 0.0
1983-01-14 Hold 0 0.0 0.0
1983-01-21 Hold 0 0.0 0.0
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
econ_data.describe()
rate rate_change rate_decision rate_changed GDP GDP_diff_prev GDP_diff_year GDPPOT GDPPOT_diff_prev GDPPOT_diff_year ... Balanced Inertia Taylor-Rate Balanced-Rate Inertia-Rate Taylor_diff Balanced_diff Inertia_diff prev_decision next_decision
count 415.000000 415.000000 415.000000 415.000000 415.000000 415.000000 415.000000 415.000000 415.000000 415.000000 ... 415.000000 415.000000 415.000000 415.000000 415.000000 415.000000 415.000000 415.000000 414.000000 414.000000
mean 3.968976 -0.019880 -0.012048 0.334940 12815.408884 0.657814 2.523898 13018.548855 0.643467 2.616803 ... 3.324700 2.893358 0.005563 -0.665962 -1.097304 0.002576 0.005238 -0.002322 -0.014493 -0.009662
std 3.036522 0.228714 0.579313 0.472539 3719.399375 1.085580 2.203739 3776.658124 0.185616 0.756947 ... 2.069550 2.373858 1.899151 2.141286 0.711029 0.058706 0.088255 0.026755 0.577867 0.577968
min 0.000000 -1.000000 -1.000000 0.000000 6804.139000 -8.937251 -9.083737 7271.207419 0.318708 1.303969 ... 0.000000 -1.027665 -4.920215 -8.061836 -2.699376 -0.400621 -0.400621 -0.425000 -1.000000 -1.000000
25% 1.000000 0.000000 0.000000 0.000000 9394.834000 0.426261 1.701703 9597.373675 0.475884 1.915027 ... 1.731034 0.561801 -1.470860 -1.661544 -1.567363 0.000000 0.000000 0.000000 0.000000 0.000000
50% 4.000000 0.000000 0.000000 0.000000 13183.890000 0.678051 2.673107 13014.429940 0.642412 2.615226 ... 3.386536 2.788555 0.052807 -0.293935 -1.130361 0.000000 0.000000 0.000000 0.000000 0.000000
75% 6.000000 0.000000 0.000000 1.000000 15781.342000 0.988107 3.908365 16227.234340 0.777802 3.150223 ... 4.723309 4.507101 1.361705 0.475418 -0.455663 0.000000 0.000000 0.000000 0.000000 0.000000
max 11.500000 1.125000 1.000000 1.000000 19806.290000 7.547535 12.226677 20003.730000 1.058177 4.280368 ... 8.267766 8.901506 8.249377 8.017766 -0.037500 0.616857 1.233715 0.060093 1.000000 1.000000

8 rows × 46 columns

# Number of missing values in each column.
econ_data.isnull().sum()
unscheduled      0
forecast         0
confcall         0
speaker          0
rate             0
                ..
Balanced_diff    0
Inertia_diff     0
Rate Decision    0
prev_decision    1
next_decision    1
Length: 61, dtype: int64

Plot rate decision count#

# Compose the countplot config, point its x variable at the "Rate Decision"
# label column, and instantiate it with econ_data as the data source.
cfg = eKonf.compose("visualize/plot=countplot")
cfg.countplot.x = "Rate Decision"
cfg.figure.figsize = (10, 5)
cfg.ax.title = "Rate Decision Count"
eKonf.instantiate(cfg, data=econ_data)
../../../_images/748abdbdd779809b817381dc5e0e6abf51028340b1c5e596be22f0be85fd1c0d.png

The classes are highly imbalanced toward 0 (hold), which must be taken into account when modeling: a classifier that always predicts 0 (hold) would already achieve an accuracy of more than 60%.

Correlation#

# Columns entering the correlation matrix: the decision codes, the boolean
# flag columns, the economic indicator columns and the rate columns.
corr_columns = [
    "rate_decision",
    "next_decision",
    "prev_decision",
    "unscheduled",
    "forecast",
    "confcall",
    "GDP_diff_prev",
    "GDP_diff_year",
    "GDPPOT_diff_prev",
    "GDPPOT_diff_year",
    "PCE_diff_prev",
    "PCE_diff_year",
    "CPI_diff_prev",
    "CPI_diff_year",
    "UNEMP",
    "UNEMP_diff_prev",
    "UNEMP_diff_year",
    "EMP",
    "EMP_diff_prev",
    "EMP_diff_year",
    "PMI",
    "PMI_diff_prev",
    "PMI_diff_year",
    "RSALES_diff_prev",
    "RSALES_diff_year",
    "HSALES_diff_prev",
    "HSALES_diff_year",
    "Taylor_diff",
    "Balanced_diff",
    "Inertia_diff",
    "rate",
    "rate_change",
    "rate_changed",
]

# Cast every selected column to float (the flag columns hold True/False) and
# compute the pairwise correlation matrix.
corr_data = econ_data[corr_columns].astype(float).corr()
cfg = eKonf.compose("visualize/plot=heatmap")
cfg.figure.figsize = (20, 12)
cfg.heatmap.cmap = "YlGnBu"
# NOTE(review): the colour scale is limited to [0, 1] although correlations
# can be negative; presumably all negative values are drawn with the lowest
# colour. Confirm that this is intended.
cfg.heatmap.vmin = 0
cfg.heatmap.vmax = 1
cfg.heatmap.fmt = ".2f"
cfg.ax.title = "Correlation"
eKonf.instantiate(cfg, data=corr_data)
../../../_images/bef082b6d4da338a56681659d9dd8b6527f412de16c02d3e2de73ab614d2ab29.png

Observation on the correlation:

Higher correlation with Rate Decision:

  • ‘GDP_diff_year’

  • ‘UNEMP_diff_prev’

  • ‘EMP_diff_prev’

  • ‘PMI’

  • ‘RSALES_diff_year’

  • ‘HSALES_diff_year’

  • ‘prev_decision’

We will create two datasets: a full set, and a smaller set restricted to the features with high correlation.

Correlation between Taylor rule and actual rates#

# Two correlation matrices: the "Rate" level against the Taylor / Balanced /
# Inertia levels, and "rate_change" against the corresponding *_diff columns.
corr_columns = ["Rate", "Taylor", "Balanced", "Inertia"]
corr_data1 = econ_data[corr_columns].astype(float).corr()
corr_columns = ["rate_change", "Taylor_diff", "Balanced_diff", "Inertia_diff"]
corr_data2 = econ_data[corr_columns].astype(float).corr()

# One figure with a 1 x 2 subplot grid. The composed heatmap/ax entries are
# bound to axis 0 and data item 0 (corr_data1).
cfg = eKonf.compose("visualize/plot=heatmap")
cfg.figure.figsize = (15, 6)
cfg.subplots.ncols = 2
cfg.subplots.nrows = 1
cfg.heatmap.axno = 0
cfg.heatmap.datano = 0
cfg.heatmap.cmap = "YlGnBu"
cfg.heatmap.vmin = 0
cfg.heatmap.vmax = 1
cfg.heatmap.fmt = ".2f"
cfg.ax.title = "Fed Rates"
cfg.ax.axno = 0

# Second panel: copies of the first heatmap/ax settings, re-targeted at
# axis 1 and data item 1 (corr_data2).
heatmap2 = cfg.heatmap.copy()
heatmap2.axno = 1
heatmap2.datano = 1
ax2 = cfg.ax.copy()
ax2.title = "Rate changes"
ax2.axno = 1
cfg.plots.append(heatmap2)
cfg.axes.append(ax2)
eKonf.instantiate(cfg, data=[corr_data1, corr_data2])
../../../_images/b656c9b603bcb9a31f0995c17f7a7cdbca0def83e8f374309b0107d04fa72c1e.png
# Line plot of the columns currently held in corr_columns. At this point that
# is the list assigned most recently above: "rate_change" and the three
# *_diff columns.
cfg = eKonf.compose("visualize/plot=lineplot")
cfg.figure.figsize = (15, 8)
cfg.lineplot.y = corr_columns

# Additional scatter plot of "rate" over "date", coloured by the decision
# code and flagged for the secondary y axis.
scatter_cfg = eKonf.compose("visualize/plot/scatterplot")
scatter_cfg.x = "date"
scatter_cfg.y = "rate"
scatter_cfg.hue = "rate_decision"
scatter_cfg.secondary_y = True
cfg.plots.append(scatter_cfg)

# Axis settings for the secondary y axis: a copy of the primary ones with the
# grid switched off.
ax2 = cfg.ax.copy()
ax2.grid = False
ax2.secondary_y = True
cfg.axes.append(ax2)

eKonf.instantiate(cfg, data=econ_data)
../../../_images/d1ef48b7623fea4c43cc92284cbac72b6d433c6fa04ea92513c7e6bd4c1ceb64.png

Compare distributions by rate decisions#

def plot_distribution(data, columns):
    """Draw a KDE plot and a histogram facet grid for each given column.

    For every name in ``columns`` two configs are composed and instantiated
    through eKonf: a ``kdeplot`` of the column with hue "Rate Decision", and
    a ``facetgrid`` with ``col="Rate Decision"`` whose ``map_dataframe``
    function is a 50-bin ``histplot`` (with KDE) of the same column.

    Args:
        data: Passed as ``data=`` to ``eKonf.instantiate``; it needs a
            "Rate Decision" column plus every column named in ``columns``.
        columns: Iterable of column names to plot.
    """
    for column in columns:
        # Density of the column, one curve per rate decision.
        kde_cfg = eKonf.compose("visualize/plot=kdeplot")
        kde_cfg.figure.figsize = (8, 4)
        kde_cfg.kdeplot.x = column
        kde_cfg.kdeplot.hue = "Rate Decision"
        kde_cfg.kdeplot.palette = "tab10"
        kde_cfg.ax.title = f"Distribution of {column}"
        kde_cfg.ax.xlabel = column
        kde_cfg.ax.ylabel = "Frequency"
        kde_cfg.ax.legend = ["Hike", "Hold", "Cut"]
        kde_cfg.ax.grid = False
        eKonf.instantiate(kde_cfg, data=data)

        # Histogram of the same column, split by rate decision.
        grid_cfg = eKonf.compose(config_group="visualize/plot=facetgrid")
        grid_cfg.figure.figsize = (8, 4)
        grid_cfg.theme.palette = "pastel"
        grid_cfg.facetgrid.col = "Rate Decision"
        grid_cfg.facetgrid.height = 3
        grid_cfg.facetgrid.map_dataframe._func_ = "histplot"
        grid_cfg.facetgrid.map_dataframe.rcParams = {
            "x": column,
            "bins": 50,
            "kde": True,
        }
        eKonf.instantiate(grid_cfg, data=data)
# Distributions of the GDP and GDPPOT change columns, by rate decision.
plot_distribution(
    econ_data,
    ["GDP_diff_prev", "GDP_diff_year", "GDPPOT_diff_prev", "GDPPOT_diff_year"],
)
../../../_images/6e557488007513482e39ef0f762e46236d79dbc448914cbeb5d3db2225cd70c5.png ../../../_images/a850f4e13def559fd45d738ed560890d48b12aafb324a2917e1a926a61a344c0.png ../../../_images/56d117cfcc23c91bc57fa485ee611c16f88c737acb7228a7bd39e103e6e5e60a.png ../../../_images/02a89aaa91dcab028cf77a3ee6c59a1d366ca41463e4dded6ab12a60b6f7e82a.png ../../../_images/2a9ea3762cd1456e19ff825f6ceaafce7bccfd768d5cae20d0753bfc89b328f7.png ../../../_images/d7ed4af7924977d88a4420395ae3f42ad25026376a062ad315435d83b8b4edc7.png ../../../_images/0468447e3768363fccbd674deb5f468a64111bb24cca38f868e64f558c9e47e2.png ../../../_images/8ef43d2eb7a6ad259c5cd89fde7f6dddc9ed676b3060ac027e06538c390d85ca.png
# Distributions of the PCE and CPI change columns, by rate decision.
plot_distribution(
    econ_data, ["PCE_diff_prev", "PCE_diff_year", "CPI_diff_prev", "CPI_diff_year"]
)
../../../_images/61934b6d91af7e5cd78c0f80578a2c97f10be507d89a82420a817bae9c5bfae2.png ../../../_images/f36b2161030635c538a53fd763029ce45437977315c935a4f247c591753ed9b7.png ../../../_images/7e44522989d7784471fe5c0ac58a96bcd678c2e6072d6bd4e3cc0f272f8cabc5.png ../../../_images/40d611156747214372d7749b1bdca21024ff827943f2d68d6ad71ae95af10601.png ../../../_images/07515dfe4bfd16174866fda4d3dc3a4f252be66b7a586032200f6ef214ae2dc2.png ../../../_images/6bade0e6c3398c1ecbaed40a37a87db075e254dbd0aaef5775b5063f426c8e52.png ../../../_images/3e03f697372dda9b1556d14bf80fd3a3e782de3d034064c365e11276eacd6af6.png ../../../_images/fbc121d1afa6401838eefce8a327ffb05435b054f29022c1e2ad4c5498c41944.png
# Distributions of the UNEMP and EMP level and change columns, by rate decision.
plot_distribution(
    econ_data,
    [
        "UNEMP",
        "UNEMP_diff_prev",
        "UNEMP_diff_year",
        "EMP",
        "EMP_diff_prev",
        "EMP_diff_year",
    ],
)
../../../_images/ccbd7678c5196dc785d97ae858c41e55d49ffd50787464ce71c826fea1768f56.png ../../../_images/f92101f6dee0367af5d7321592c12ce826c992037db9bfa11503e65f2cd33402.png ../../../_images/980c9815af2579793819c6d6d3358ed3ad08317b2fb0a93d3a4e37b3dc2885d2.png ../../../_images/0db12bad1456410c1f617ccc3439918c03fad2e1e9cebbad7b82c6bd1437cad7.png ../../../_images/8a6d81af14956a517ec6a0450b7fff8198c23d819d52fac2b525fad7cf507b25.png ../../../_images/333bb59b093994270be3f80382a04ed5cd86f9eaacf5f0658b3ec6261e82e65a.png ../../../_images/ff45679ff2a7b02ec9027398536bd9809f936da27a859f3af578f3892e969e93.png ../../../_images/720034136997d7c3f371540ade5842c43d3c54766334e34fc6d9dac26a65a2bf.png ../../../_images/1a1c91296544c36757127bdd68d134b430c7cf4967fc459e6f37275ca899ab8d.png ../../../_images/3f6c60b6560c74fa3109d87d5fbfe2951ccbecbe561ee74eab1e57520983bcbe.png ../../../_images/484dffef03eda05d1eca054e7ceb3c8ec0775dfd29e141a6f5b829cb1c1da354.png ../../../_images/3cf99fe0fc7d26c0d5cabd5620151975fb851349479b08787878d57bfa338b69.png
# Distributions of the PMI level and change columns, by rate decision.
plot_distribution(econ_data, ["PMI", "PMI_diff_prev", "PMI_diff_year"])
../../../_images/ca97c4d36f5d408127583689e57fb2b1abaa51dbbc440492c62bdfc678e053cc.png ../../../_images/c128c24cd6d437de8ed5b79fe306b79f5281e95aa07067d4bb40de77906616b7.png ../../../_images/30efe9cbba4ada5bf718a68883b98694f21a698f8fc7e772e2586bef467a0384.png ../../../_images/7ac8930dcab345c97a3465b5f03b7ef0878ec8bbe955a371b070e27c795951d5.png ../../../_images/daea4c4829368f93253ed6bf33f48dc61c1b2531acae747cda568de4997b187f.png ../../../_images/b8235482eb5aabf6df419214a42dc6bb8885d491b9e877a9bf5ff5a032a44f6c.png
# Distributions of the RSALES and HSALES change columns, by rate decision.
plot_distribution(
    econ_data,
    ["RSALES_diff_prev", "RSALES_diff_year", "HSALES_diff_prev", "HSALES_diff_year"],
)
../../../_images/6690167d643d7697d39875d8601cc9d372f2c872fc9d43bc9353d8a4f5189420.png ../../../_images/7d7dbaf4f93f41a03c902e5adb11114ea9931806622c88d15ccde7fd24a0822b.png ../../../_images/3009b838a9a4da06aa255f09f0fb050968abb2a862e1aa24e5924bf6d0878163.png ../../../_images/d48985fafc103649092edb92d840e7cb3b81ba9d6b7669adf6037dc9f42b2907.png ../../../_images/30fdf33aafbecd34a1f080468804a0c6d937f9e16cee8a7b4cb430708d172f8e.png ../../../_images/198656bb58c8b6f255673ad72f75476cf8f613257dd3c17c7bc7ffa21a0cf867.png ../../../_images/844e4aa34c01537b92b43dd39856f4c963493db0523ebbcc8f3fed0586515eb5.png ../../../_images/0bf1e4e9795ee1b601ac11f1aa863f876466b2ff304f630d9151a5bbafc32697.png
# Distributions of the Taylor_diff, Balanced_diff and Inertia_diff columns, by rate decision.
plot_distribution(econ_data, ["Taylor_diff", "Balanced_diff", "Inertia_diff"])
../../../_images/2d16155071fb8618da119c68cf5aac2aeaffa094f5a85ce717fdc749d9e67a64.png ../../../_images/ddce65197b4296ee287f1aab0fde79e99e2b06bd0568ea32bb38a88dabb55a6b.png ../../../_images/a9e9d368976f83bc6e47e4d4d68721cf0184d2d97f87544d5ad90d7b07700e6e.png ../../../_images/d8d9c48213ec268834fd95455ae94586ca375d945ef5f414fcc6b76d0d348bda.png ../../../_images/d4fbf8e3545dcfcf49d8bde67921aa2487be428a5b723f1513475bdaf418d1c3.png ../../../_images/1d202c3e6f01de131c3237b231840be779ba733e8cb559375b6c5bfb5df43ce5.png

Create Training Data Set#

# Small dataset: the "Rate Decision" label plus a reduced set of feature columns.
columns = [
    "Rate Decision",
    "prev_decision",
    "GDP_diff_prev",
    "PMI",
    "EMP_diff_prev",
    "RSALES_diff_year",
    "UNEMP_diff_prev",
    "HSALES_diff_year",
    "Inertia_diff",
    "Balanced_diff",
]

# Select first, then copy: only the needed columns are duplicated, and the
# result is an independent frame that can be modified without touching
# econ_data.
econ_train_small = econ_data[columns].copy()
# The label column becomes the training target.
econ_train_small = econ_train_small.rename(columns={"Rate Decision": "target"})
print(econ_train_small.shape)
econ_train_small.tail()
(415, 10)
target prev_decision GDP_diff_prev PMI EMP_diff_prev RSALES_diff_year UNEMP_diff_prev HSALES_diff_year Inertia_diff Balanced_diff
date
2021-11-03 Hold 0.0 0.570948 60.5 0.288624 8.474656 -9.615385 -26.135217 0.0 0.0
2021-12-15 Hold 0.0 0.570948 60.6 0.437147 10.977142 -8.695652 -11.163337 0.0 0.0
2022-01-26 Hold 0.0 0.570948 58.8 0.395555 9.101289 -7.142857 -3.673938 0.0 0.0
2022-03-16 Hike 0.0 1.680778 58.6 0.476814 9.076698 -5.000000 3.125000 0.0 0.0
2022-05-04 Hike 1.0 -0.355417 57.1 0.283658 -0.034915 0.000000 -26.946848 0.0 0.0
# Large dataset: the "Rate Decision" label plus the full set of feature columns.
columns = [
    "Rate Decision",
    "prev_decision",
    "GDP_diff_prev",
    "GDP_diff_year",
    "GDPPOT_diff_prev",
    "GDPPOT_diff_year",
    "PCE_diff_prev",
    "PCE_diff_year",
    "CPI_diff_prev",
    "CPI_diff_year",
    "UNEMP",
    "UNEMP_diff_prev",
    "UNEMP_diff_year",
    "EMP",
    "EMP_diff_prev",
    "EMP_diff_year",
    "PMI",
    "PMI_diff_prev",
    "PMI_diff_year",
    "RSALES_diff_prev",
    "RSALES_diff_year",
    "HSALES_diff_prev",
    "HSALES_diff_year",
    "Taylor-Rate",
    "Balanced-Rate",
    "Inertia-Rate",
    "Taylor_diff",
    "Balanced_diff",
    "Inertia_diff",
]

# Select first, then copy: only the needed columns are duplicated, and the
# result is an independent frame that can be modified without touching
# econ_data.
econ_train_large = econ_data[columns].copy()
# The label column becomes the training target.
econ_train_large = econ_train_large.rename(columns={"Rate Decision": "target"})
print(econ_train_large.shape)
(415, 29)

Missing Values#

# The first row has no previous decision (shift(1) leaves it as NaN). As the
# decision is most likely 0 (hold), fill that gap with 0.
# The filled column is assigned back rather than calling
# fillna(inplace=True) on the column selection: the chained in-place form
# operates on an intermediate object and does not update the frame under
# pandas Copy-on-Write (and warns in pandas 2.x).
econ_train_small["prev_decision"] = econ_train_small["prev_decision"].fillna(0)
econ_train_large["prev_decision"] = econ_train_large["prev_decision"].fillna(0)
# Line plot of HSALES_diff_year over "date" from the small training set ...
cfg = eKonf.compose("visualize/plot=lineplot")
cfg.figure.figsize = (15, 8)
cfg.lineplot.x = "date"
cfg.lineplot.y = "HSALES_diff_year"

# ... with a second line for RSALES_diff_year, built from a copy of the first
# line's settings and appended to the list of plots.
lineplot = cfg.lineplot.copy()
lineplot.x = 'date'
lineplot.y = 'RSALES_diff_year'
cfg.plots.append(lineplot)

eKonf.instantiate(cfg, data=econ_train_small)
../../../_images/c163d2564de6c56e08959789b76ea1b9f31687f2735c54365a2ddc080ff6fe8d.png
# Retail sales growth ratio is difficult to estimate. Though it is not ideal,
# missing values are simply replaced by the column average; the same mean
# imputation is applied to the Taylor_diff / Balanced_diff / Inertia_diff
# columns present in each frame.
mean_filled = (
    (econ_train_small, ["RSALES_diff_year", "Inertia_diff", "Balanced_diff"]),
    (
        econ_train_large,
        [
            "RSALES_diff_prev",
            "RSALES_diff_year",
            "Inertia_diff",
            "Balanced_diff",
            "Taylor_diff",
        ],
    ),
)
for frame, cols in mean_filled:
    # frame[cols].mean() is a Series keyed by column name, so fillna replaces
    # the gaps of each column with that column's own mean. The result is
    # assigned back instead of using fillna(inplace=True) on a column
    # selection, which does not update the frame under pandas Copy-on-Write.
    frame[cols] = frame[cols].fillna(frame[cols].mean())

Save Data#

# Write both training sets as parquet files into data_dir.
eKonf.save_data(econ_train_small, "econ_train_small.parquet", data_dir)
eKonf.save_data(econ_train_large, "econ_train_large.parquet", data_dir)