{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Predicting Sentiments of FOMC Corpus\n", "\n", "Analyse statements with the Loughran and McDonald dictionary, FinBERT, and a T5 model" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "version: 0.1.33+28.g90d1dea\n", "is notebook? True\n", "is colab? False\n", "environment variables:\n", "{'EKORPKIT_CONFIG_DIR': '/workspace/projects/ekorpkit-book/config',\n", " 'EKORPKIT_DATA_DIR': None,\n", " 'EKORPKIT_PROJECT': 'ekorpkit-book',\n", " 'EKORPKIT_WORKSPACE_ROOT': '/workspace',\n", " 'NUM_WORKERS': 230}\n" ] } ], "source": [ "%config InlineBackend.figure_format='retina'\n", "import logging\n", "\n", "from ekorpkit import eKonf\n", "\n", "# `logging` is imported above so this cell also runs on a fresh kernel.\n", "logging.basicConfig(level=logging.WARNING)\n", "print(\"version:\", eKonf.__version__)\n", "print(\"is notebook?\", eKonf.is_notebook())\n", "print(\"is colab?\", eKonf.is_colab())\n", "print(\"environment variables:\")\n", "eKonf.print(eKonf.env().dict())" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "pydantic.types.SecretStr" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "start_year = 2000\n", "data_dir = \"../data/fomc\"\n", "eKonf.env().FRED_API_KEY" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Predict sentiments with the LM sentiment analyser" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load FOMC Corpus" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtextsplittimestampcontent_typedatespeakertitledecisionraterecent_meetingrecent_decisionrecent_ratenext_meetingnext_decisionnext_ratetext_num_wordssection_idsent_id
6534632854It will not have the word “somewhat” on line 3.train2014-12-17fomc_meeting_script2014-12-17MR. LUECKEFOMC Meeting Transcript0.00.252014-12-170.00.252015-01-280.00.25102872
6534642854Chair Yellen Yes Vice Chairman Dudley ...train2014-12-17fomc_meeting_script2014-12-17MR. LUECKEFOMC Meeting Transcript0.00.252014-12-170.00.252015-01-280.00.25312874
6534652854And let me confirm that the next meeting will ...train2014-12-17fomc_meeting_script2014-12-17CHAIR YELLENFOMC Meeting Transcript0.00.252014-12-170.00.252015-01-280.00.25192883
6534662854I believe box lunches are now available for pe...train2014-12-17fomc_meeting_script2014-12-17CHAIR YELLENFOMC Meeting Transcript0.00.252014-12-170.00.252015-01-280.00.25332884
6534672854I will do my best, and I will consider at the ...train2014-12-17fomc_meeting_script2014-12-17CHAIR YELLENFOMC Meeting Transcript0.00.252014-12-170.00.252015-01-280.00.25182885
\n", "
" ], "text/plain": [ " id text split \\\n", "653463 2854 It will not have the word “somewhat” on line 3. train \n", "653464 2854 Chair Yellen Yes Vice Chairman Dudley ... train \n", "653465 2854 And let me confirm that the next meeting will ... train \n", "653466 2854 I believe box lunches are now available for pe... train \n", "653467 2854 I will do my best, and I will consider at the ... train \n", "\n", " timestamp content_type date speaker \\\n", "653463 2014-12-17 fomc_meeting_script 2014-12-17 MR. LUECKE \n", "653464 2014-12-17 fomc_meeting_script 2014-12-17 MR. LUECKE \n", "653465 2014-12-17 fomc_meeting_script 2014-12-17 CHAIR YELLEN \n", "653466 2014-12-17 fomc_meeting_script 2014-12-17 CHAIR YELLEN \n", "653467 2014-12-17 fomc_meeting_script 2014-12-17 CHAIR YELLEN \n", "\n", " title decision rate recent_meeting \\\n", "653463 FOMC Meeting Transcript 0.0 0.25 2014-12-17 \n", "653464 FOMC Meeting Transcript 0.0 0.25 2014-12-17 \n", "653465 FOMC Meeting Transcript 0.0 0.25 2014-12-17 \n", "653466 FOMC Meeting Transcript 0.0 0.25 2014-12-17 \n", "653467 FOMC Meeting Transcript 0.0 0.25 2014-12-17 \n", "\n", " recent_decision recent_rate next_meeting next_decision next_rate \\\n", "653463 0.0 0.25 2015-01-28 0.0 0.25 \n", "653464 0.0 0.25 2015-01-28 0.0 0.25 \n", "653465 0.0 0.25 2015-01-28 0.0 0.25 \n", "653466 0.0 0.25 2015-01-28 0.0 0.25 \n", "653467 0.0 0.25 2015-01-28 0.0 0.25 \n", "\n", " text_num_words section_id sent_id \n", "653463 10 287 2 \n", "653464 31 287 4 \n", "653465 19 288 3 \n", "653466 33 288 4 \n", "653467 18 288 5 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fomc_sents = eKonf.load_data(\"fomc_sents.parquet\", data_dir)\n", "fomc_sents.tail()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Predict sentiments of sentences" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "model_cfg = eKonf.compose('model/sentiment=lm')\n", 
"model_cfg.num_workers = 100\n", "lmsa = eKonf.instantiate(model_cfg)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "{'num_tokens': 156,\n", " 'polarity': -0.9999990000010001,\n", " 'polarity_label': 'negative',\n", " 'uncertainty': 1e-06}" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "article = fomc_sents.text[10]\n", "lmsa.predict_sentence(article)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:ekorpkit.models.sentiment.lbsa:Predicting sentiments of the column [text] using predict_sentence\n", "INFO:ekorpkit.base:Using batcher with minibatch size: 1000\n", "INFO:ekorpkit.utils.batch.batcher: backend: joblib minibatch_size: 1000 procs: 100 input_split: False merge_output: True len(data): 653468 len(args): 5\n", "Predicting [text]: 100%|██████████| 654/654 [02:50<00:00, 3.83it/s]\n", "INFO:ekorpkit.models.sentiment.lbsa: >> elapsed time to predict: 0:02:51.902053\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtextsplittimestampcontent_typedatespeakertitledecisionrate...next_meetingnext_decisionnext_ratetext_num_wordssection_idsent_idnum_tokenspolaritypolarity_labeluncertainty
00The Secretary reported that advices of the ele...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.00...1993-02-180.03.0047290520.000000neutral0.000001
10By unanimous vote, the Committee elected the f...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.00...1993-02-180.03.007335078-1.000000negative0.000001
20By unanimous vote, William J. McDonough, Marga...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.00...1993-02-180.03.0074370831.000000positive0.000001
30On January 15, 1993, the continuing rules, reg...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.00...1993-02-180.03.005939068-0.333333negative0.014707
40Members were asked to indicate if they wished ...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.00...1993-02-180.03.002539126-0.999999negative0.000001
..................................................................
6534632854It will not have the word “somewhat” on line 3.train2014-12-17fomc_meeting_script2014-12-17MR. LUECKEFOMC Meeting Transcript0.00.25...2015-01-280.00.25102872130.000000neutral0.076924
6534642854Chair Yellen Yes Vice Chairman Dudley ...train2014-12-17fomc_meeting_script2014-12-17MR. LUECKEFOMC Meeting Transcript0.00.25...2015-01-280.00.25312874310.000000neutral0.000001
6534652854And let me confirm that the next meeting will ...train2014-12-17fomc_meeting_script2014-12-17CHAIR YELLENFOMC Meeting Transcript0.00.25...2015-01-280.00.25192883210.000000neutral0.000001
6534662854I believe box lunches are now available for pe...train2014-12-17fomc_meeting_script2014-12-17CHAIR YELLENFOMC Meeting Transcript0.00.25...2015-01-280.00.25332884390.000000neutral0.025642
6534672854I will do my best, and I will consider at the ...train2014-12-17fomc_meeting_script2014-12-17CHAIR YELLENFOMC Meeting Transcript0.00.25...2015-01-280.00.25182885220.000000neutral0.000001
\n", "

653468 rows × 23 columns

\n", "
" ], "text/plain": [ " id text split \\\n", "0 0 The Secretary reported that advices of the ele... train \n", "1 0 By unanimous vote, the Committee elected the f... train \n", "2 0 By unanimous vote, William J. McDonough, Marga... train \n", "3 0 On January 15, 1993, the continuing rules, reg... train \n", "4 0 Members were asked to indicate if they wished ... train \n", "... ... ... ... \n", "653463 2854 It will not have the word “somewhat” on line 3. train \n", "653464 2854 Chair Yellen Yes Vice Chairman Dudley ... train \n", "653465 2854 And let me confirm that the next meeting will ... train \n", "653466 2854 I believe box lunches are now available for pe... train \n", "653467 2854 I will do my best, and I will consider at the ... train \n", "\n", " timestamp content_type date speaker \\\n", "0 1993-02-03 fomc_minutes 1993-02-03 Alan Greenspan \n", "1 1993-02-03 fomc_minutes 1993-02-03 Alan Greenspan \n", "2 1993-02-03 fomc_minutes 1993-02-03 Alan Greenspan \n", "3 1993-02-03 fomc_minutes 1993-02-03 Alan Greenspan \n", "4 1993-02-03 fomc_minutes 1993-02-03 Alan Greenspan \n", "... ... ... ... ... \n", "653463 2014-12-17 fomc_meeting_script 2014-12-17 MR. LUECKE \n", "653464 2014-12-17 fomc_meeting_script 2014-12-17 MR. LUECKE \n", "653465 2014-12-17 fomc_meeting_script 2014-12-17 CHAIR YELLEN \n", "653466 2014-12-17 fomc_meeting_script 2014-12-17 CHAIR YELLEN \n", "653467 2014-12-17 fomc_meeting_script 2014-12-17 CHAIR YELLEN \n", "\n", " title decision rate ... next_meeting \\\n", "0 FOMC Meeting Minutes 0.0 3.00 ... 1993-02-18 \n", "1 FOMC Meeting Minutes 0.0 3.00 ... 1993-02-18 \n", "2 FOMC Meeting Minutes 0.0 3.00 ... 1993-02-18 \n", "3 FOMC Meeting Minutes 0.0 3.00 ... 1993-02-18 \n", "4 FOMC Meeting Minutes 0.0 3.00 ... 1993-02-18 \n", "... ... ... ... ... ... \n", "653463 FOMC Meeting Transcript 0.0 0.25 ... 2015-01-28 \n", "653464 FOMC Meeting Transcript 0.0 0.25 ... 2015-01-28 \n", "653465 FOMC Meeting Transcript 0.0 0.25 ... 
2015-01-28 \n", "653466 FOMC Meeting Transcript 0.0 0.25 ... 2015-01-28 \n", "653467 FOMC Meeting Transcript 0.0 0.25 ... 2015-01-28 \n", "\n", " next_decision next_rate text_num_words section_id sent_id \\\n", "0 0.0 3.00 47 29 0 \n", "1 0.0 3.00 73 35 0 \n", "2 0.0 3.00 74 37 0 \n", "3 0.0 3.00 59 39 0 \n", "4 0.0 3.00 25 39 1 \n", "... ... ... ... ... ... \n", "653463 0.0 0.25 10 287 2 \n", "653464 0.0 0.25 31 287 4 \n", "653465 0.0 0.25 19 288 3 \n", "653466 0.0 0.25 33 288 4 \n", "653467 0.0 0.25 18 288 5 \n", "\n", " num_tokens polarity polarity_label uncertainty \n", "0 52 0.000000 neutral 0.000001 \n", "1 78 -1.000000 negative 0.000001 \n", "2 83 1.000000 positive 0.000001 \n", "3 68 -0.333333 negative 0.014707 \n", "4 26 -0.999999 negative 0.000001 \n", "... ... ... ... ... \n", "653463 13 0.000000 neutral 0.076924 \n", "653464 31 0.000000 neutral 0.000001 \n", "653465 21 0.000000 neutral 0.000001 \n", "653466 39 0.000000 neutral 0.025642 \n", "653467 22 0.000000 neutral 0.000001 \n", "\n", "[653468 rows x 23 columns]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fomc_sent_sentiments = lmsa.predict(fomc_sents)\n", "eKonf.save_data(fomc_sent_sentiments, \"fomc_sent_sentiments.parquet\", data_dir)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtextsplittimestampcontent_typedatespeakertitledecisionrate...next_meetingnext_decisionnext_ratetext_num_wordssection_idsent_idnum_tokenspolaritypolarity_labeluncertainty
00The Secretary reported that advices of the ele...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.00...1993-02-180.03.0047290520.000000neutral0.000001
10By unanimous vote, the Committee elected the f...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.00...1993-02-180.03.007335078-1.000000negative0.000001
20By unanimous vote, William J. McDonough, Marga...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.00...1993-02-180.03.0074370831.000000positive0.000001
30On January 15, 1993, the continuing rules, reg...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.00...1993-02-180.03.005939068-0.333333negative0.014707
40Members were asked to indicate if they wished ...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.00...1993-02-180.03.002539126-0.999999negative0.000001
..................................................................
6534632854It will not have the word “somewhat” on line 3.train2014-12-17fomc_meeting_script2014-12-17MR. LUECKEFOMC Meeting Transcript0.00.25...2015-01-280.00.25102872130.000000neutral0.076924
6534642854Chair Yellen Yes Vice Chairman Dudley ...train2014-12-17fomc_meeting_script2014-12-17MR. LUECKEFOMC Meeting Transcript0.00.25...2015-01-280.00.25312874310.000000neutral0.000001
6534652854And let me confirm that the next meeting will ...train2014-12-17fomc_meeting_script2014-12-17CHAIR YELLENFOMC Meeting Transcript0.00.25...2015-01-280.00.25192883210.000000neutral0.000001
6534662854I believe box lunches are now available for pe...train2014-12-17fomc_meeting_script2014-12-17CHAIR YELLENFOMC Meeting Transcript0.00.25...2015-01-280.00.25332884390.000000neutral0.025642
6534672854I will do my best, and I will consider at the ...train2014-12-17fomc_meeting_script2014-12-17CHAIR YELLENFOMC Meeting Transcript0.00.25...2015-01-280.00.25182885220.000000neutral0.000001
\n", "

653468 rows × 23 columns

\n", "
" ], "text/plain": [ " id text split \\\n", "0 0 The Secretary reported that advices of the ele... train \n", "1 0 By unanimous vote, the Committee elected the f... train \n", "2 0 By unanimous vote, William J. McDonough, Marga... train \n", "3 0 On January 15, 1993, the continuing rules, reg... train \n", "4 0 Members were asked to indicate if they wished ... train \n", "... ... ... ... \n", "653463 2854 It will not have the word “somewhat” on line 3. train \n", "653464 2854 Chair Yellen Yes Vice Chairman Dudley ... train \n", "653465 2854 And let me confirm that the next meeting will ... train \n", "653466 2854 I believe box lunches are now available for pe... train \n", "653467 2854 I will do my best, and I will consider at the ... train \n", "\n", " timestamp content_type date speaker \\\n", "0 1993-02-03 fomc_minutes 1993-02-03 Alan Greenspan \n", "1 1993-02-03 fomc_minutes 1993-02-03 Alan Greenspan \n", "2 1993-02-03 fomc_minutes 1993-02-03 Alan Greenspan \n", "3 1993-02-03 fomc_minutes 1993-02-03 Alan Greenspan \n", "4 1993-02-03 fomc_minutes 1993-02-03 Alan Greenspan \n", "... ... ... ... ... \n", "653463 2014-12-17 fomc_meeting_script 2014-12-17 MR. LUECKE \n", "653464 2014-12-17 fomc_meeting_script 2014-12-17 MR. LUECKE \n", "653465 2014-12-17 fomc_meeting_script 2014-12-17 CHAIR YELLEN \n", "653466 2014-12-17 fomc_meeting_script 2014-12-17 CHAIR YELLEN \n", "653467 2014-12-17 fomc_meeting_script 2014-12-17 CHAIR YELLEN \n", "\n", " title decision rate ... next_meeting \\\n", "0 FOMC Meeting Minutes 0.0 3.00 ... 1993-02-18 \n", "1 FOMC Meeting Minutes 0.0 3.00 ... 1993-02-18 \n", "2 FOMC Meeting Minutes 0.0 3.00 ... 1993-02-18 \n", "3 FOMC Meeting Minutes 0.0 3.00 ... 1993-02-18 \n", "4 FOMC Meeting Minutes 0.0 3.00 ... 1993-02-18 \n", "... ... ... ... ... ... \n", "653463 FOMC Meeting Transcript 0.0 0.25 ... 2015-01-28 \n", "653464 FOMC Meeting Transcript 0.0 0.25 ... 2015-01-28 \n", "653465 FOMC Meeting Transcript 0.0 0.25 ... 
2015-01-28 \n", "653466 FOMC Meeting Transcript 0.0 0.25 ... 2015-01-28 \n", "653467 FOMC Meeting Transcript 0.0 0.25 ... 2015-01-28 \n", "\n", " next_decision next_rate text_num_words section_id sent_id \\\n", "0 0.0 3.00 47 29 0 \n", "1 0.0 3.00 73 35 0 \n", "2 0.0 3.00 74 37 0 \n", "3 0.0 3.00 59 39 0 \n", "4 0.0 3.00 25 39 1 \n", "... ... ... ... ... ... \n", "653463 0.0 0.25 10 287 2 \n", "653464 0.0 0.25 31 287 4 \n", "653465 0.0 0.25 19 288 3 \n", "653466 0.0 0.25 33 288 4 \n", "653467 0.0 0.25 18 288 5 \n", "\n", " num_tokens polarity polarity_label uncertainty \n", "0 52 0.000000 neutral 0.000001 \n", "1 78 -1.000000 negative 0.000001 \n", "2 83 1.000000 positive 0.000001 \n", "3 68 -0.333333 negative 0.014707 \n", "4 26 -0.999999 negative 0.000001 \n", "... ... ... ... ... \n", "653463 13 0.000000 neutral 0.076924 \n", "653464 31 0.000000 neutral 0.000001 \n", "653465 21 0.000000 neutral 0.000001 \n", "653466 39 0.000000 neutral 0.025642 \n", "653467 22 0.000000 neutral 0.000001 \n", "\n", "[653468 rows x 23 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fomc_sent_sentiments = eKonf.load_data(\"fomc_sent_sentiments.parquet\", data_dir)\n", "fomc_sent_sentiments" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Aggregate sentiment scores" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "fomc_tones_lm = lmsa.aggregate_scores(fomc_sent_sentiments, groupby=['content_type', 'date'])\n", "fomc_tones_lm.content_type = fomc_tones_lm.content_type.str.replace('fomc_', '')\n", "eKonf.save_data(fomc_tones_lm, 'fomc_tones_lm.parquet', data_dir)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
content_typedatepolarity_meanpolarity_diffusionpositivenegativenum_tokens_sumnum_tokens_meannum_tokens_mediannum_examplespolarity_mean_labelpolarity_diffusion_label
0beigebook2021-01-13-0.068247-0.0710533972522939785522.48911921.017691neutralneutral
1beigebook2021-03-03-0.032683-0.0373843887451838130022.59020121.016879neutralneutral
2beigebook2021-04-14-0.030535-0.0350853568410034012122.43098321.015163neutralneutral
3beigebook2021-06-02-0.039873-0.0447603837456135774522.11715621.016175neutralneutral
4beigebook2021-07-140.0117430.0121951911821580321.41327921.0738neutralneutral
.......................................
2390testimony2021-05-19-0.171642-0.1641791526259038.65671629.067negativenegative
2391testimony2021-06-220.2201260.207547209145627.47169824.053positivepositive
2392testimony2021-07-140.2592590.305556143106629.61111127.036positivepositive
2393testimony2021-09-280.0317460.00000099102624.42857122.042neutralneutral
2394testimony2021-11-30-0.120000-0.2000003755627.80000026.020neutralnegative
\n", "

2395 rows × 12 columns

\n", "
" ], "text/plain": [ " content_type date polarity_mean polarity_diffusion positive \\\n", "0 beigebook 2021-01-13 -0.068247 -0.071053 3972 \n", "1 beigebook 2021-03-03 -0.032683 -0.037384 3887 \n", "2 beigebook 2021-04-14 -0.030535 -0.035085 3568 \n", "3 beigebook 2021-06-02 -0.039873 -0.044760 3837 \n", "4 beigebook 2021-07-14 0.011743 0.012195 191 \n", "... ... ... ... ... ... \n", "2390 testimony 2021-05-19 -0.171642 -0.164179 15 \n", "2391 testimony 2021-06-22 0.220126 0.207547 20 \n", "2392 testimony 2021-07-14 0.259259 0.305556 14 \n", "2393 testimony 2021-09-28 0.031746 0.000000 9 \n", "2394 testimony 2021-11-30 -0.120000 -0.200000 3 \n", "\n", " negative num_tokens_sum num_tokens_mean num_tokens_median \\\n", "0 5229 397855 22.489119 21.0 \n", "1 4518 381300 22.590201 21.0 \n", "2 4100 340121 22.430983 21.0 \n", "3 4561 357745 22.117156 21.0 \n", "4 182 15803 21.413279 21.0 \n", "... ... ... ... ... \n", "2390 26 2590 38.656716 29.0 \n", "2391 9 1456 27.471698 24.0 \n", "2392 3 1066 29.611111 27.0 \n", "2393 9 1026 24.428571 22.0 \n", "2394 7 556 27.800000 26.0 \n", "\n", " num_examples polarity_mean_label polarity_diffusion_label \n", "0 17691 neutral neutral \n", "1 16879 neutral neutral \n", "2 15163 neutral neutral \n", "3 16175 neutral neutral \n", "4 738 neutral neutral \n", "... ... ... ... 
\n", "2390 67 negative negative \n", "2391 53 positive positive \n", "2392 36 positive positive \n", "2393 42 neutral neutral \n", "2394 20 neutral negative \n", "\n", "[2395 rows x 12 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fomc_tones_lm = eKonf.load_data('fomc_tones_lm.parquet', data_dir)\n", "fomc_tones_lm" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "cfg = eKonf.compose('pipeline/pivot')\n", "cfg.index = 'date'\n", "cfg.columns = 'content_type'\n", "cfg.values = ['polarity_mean', 'polarity_diffusion', 'num_examples', 'num_tokens_sum', 'num_tokens_mean']\n", "tone_data_lm = eKonf.pipe(fomc_tones_lm, cfg)\n", "tone_data_lm = eKonf.to_datetime(tone_data_lm, _columns='date')\n", "tone_data_lm = tone_data_lm.set_index('date')\n", "eKonf.save_data(tone_data_lm, 'fomc_tone_data_lm.parquet', data_dir)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
polarity_mean_beigebookpolarity_mean_meeting_scriptpolarity_mean_minutespolarity_mean_press_confpolarity_mean_speechpolarity_mean_statementpolarity_mean_testimonypolarity_diffusion_beigebookpolarity_diffusion_meeting_scriptpolarity_diffusion_minutes...num_tokens_sum_speechnum_tokens_sum_statementnum_tokens_sum_testimonynum_tokens_mean_beigebooknum_tokens_mean_meeting_scriptnum_tokens_mean_minutesnum_tokens_mean_press_confnum_tokens_mean_speechnum_tokens_mean_statementnum_tokens_mean_testimony
date
1990-02-07NaN-0.087583NaNNaNNaNNaNNaNNaN-0.095663NaN...NaNNaNNaNNaN30.213010NaNNaNNaNNaNNaN
1990-03-27NaN-0.171992NaNNaNNaNNaNNaNNaN-0.179702NaN...NaNNaNNaNNaN29.846369NaNNaNNaNNaNNaN
1990-05-15NaN-0.116052NaNNaNNaNNaNNaNNaN-0.125461NaN...NaNNaNNaNNaN29.749077NaNNaNNaNNaNNaN
1990-07-03NaN-0.114829NaNNaNNaNNaNNaNNaN-0.117794NaN...NaNNaNNaNNaN29.667920NaNNaNNaNNaNNaN
1990-08-21NaN-0.209552NaNNaNNaNNaNNaNNaN-0.219403NaN...NaNNaNNaNNaN31.032836NaNNaNNaNNaNNaN
..................................................................
2021-11-30NaNNaNNaNNaN-0.167014NaN-0.12NaNNaNNaN...3066.0NaN556.0NaNNaNNaNNaN31.937500NaN27.8
2021-12-01-0.046022NaNNaNNaNNaNNaNNaN-0.048109NaNNaN...NaNNaNNaN22.539497NaNNaNNaNNaNNaNNaN
2021-12-02NaNNaNNaNNaN-0.077381NaNNaNNaNNaNNaN...6514.0NaNNaNNaNNaNNaNNaN36.188889NaNNaN
2021-12-15NaNNaN-0.043929-0.075441NaN0.166667NaNNaNNaN-0.064286...NaN489.0NaNNaNNaN30.52142937.587413NaN27.166667NaN
2021-12-17NaNNaNNaNNaN-0.356613NaNNaNNaNNaNNaN...3694.0NaNNaNNaNNaNNaNNaN29.317460NaNNaN
\n", "

1876 rows × 35 columns

\n", "
" ], "text/plain": [ " polarity_mean_beigebook polarity_mean_meeting_script \\\n", "date \n", "1990-02-07 NaN -0.087583 \n", "1990-03-27 NaN -0.171992 \n", "1990-05-15 NaN -0.116052 \n", "1990-07-03 NaN -0.114829 \n", "1990-08-21 NaN -0.209552 \n", "... ... ... \n", "2021-11-30 NaN NaN \n", "2021-12-01 -0.046022 NaN \n", "2021-12-02 NaN NaN \n", "2021-12-15 NaN NaN \n", "2021-12-17 NaN NaN \n", "\n", " polarity_mean_minutes polarity_mean_press_conf \\\n", "date \n", "1990-02-07 NaN NaN \n", "1990-03-27 NaN NaN \n", "1990-05-15 NaN NaN \n", "1990-07-03 NaN NaN \n", "1990-08-21 NaN NaN \n", "... ... ... \n", "2021-11-30 NaN NaN \n", "2021-12-01 NaN NaN \n", "2021-12-02 NaN NaN \n", "2021-12-15 -0.043929 -0.075441 \n", "2021-12-17 NaN NaN \n", "\n", " polarity_mean_speech polarity_mean_statement \\\n", "date \n", "1990-02-07 NaN NaN \n", "1990-03-27 NaN NaN \n", "1990-05-15 NaN NaN \n", "1990-07-03 NaN NaN \n", "1990-08-21 NaN NaN \n", "... ... ... \n", "2021-11-30 -0.167014 NaN \n", "2021-12-01 NaN NaN \n", "2021-12-02 -0.077381 NaN \n", "2021-12-15 NaN 0.166667 \n", "2021-12-17 -0.356613 NaN \n", "\n", " polarity_mean_testimony polarity_diffusion_beigebook \\\n", "date \n", "1990-02-07 NaN NaN \n", "1990-03-27 NaN NaN \n", "1990-05-15 NaN NaN \n", "1990-07-03 NaN NaN \n", "1990-08-21 NaN NaN \n", "... ... ... \n", "2021-11-30 -0.12 NaN \n", "2021-12-01 NaN -0.048109 \n", "2021-12-02 NaN NaN \n", "2021-12-15 NaN NaN \n", "2021-12-17 NaN NaN \n", "\n", " polarity_diffusion_meeting_script polarity_diffusion_minutes \\\n", "date \n", "1990-02-07 -0.095663 NaN \n", "1990-03-27 -0.179702 NaN \n", "1990-05-15 -0.125461 NaN \n", "1990-07-03 -0.117794 NaN \n", "1990-08-21 -0.219403 NaN \n", "... ... ... \n", "2021-11-30 NaN NaN \n", "2021-12-01 NaN NaN \n", "2021-12-02 NaN NaN \n", "2021-12-15 NaN -0.064286 \n", "2021-12-17 NaN NaN \n", "\n", " ... num_tokens_sum_speech num_tokens_sum_statement \\\n", "date ... \n", "1990-02-07 ... NaN NaN \n", "1990-03-27 ... 
NaN NaN \n", "1990-05-15 ... NaN NaN \n", "1990-07-03 ... NaN NaN \n", "1990-08-21 ... NaN NaN \n", "... ... ... ... \n", "2021-11-30 ... 3066.0 NaN \n", "2021-12-01 ... NaN NaN \n", "2021-12-02 ... 6514.0 NaN \n", "2021-12-15 ... NaN 489.0 \n", "2021-12-17 ... 3694.0 NaN \n", "\n", " num_tokens_sum_testimony num_tokens_mean_beigebook \\\n", "date \n", "1990-02-07 NaN NaN \n", "1990-03-27 NaN NaN \n", "1990-05-15 NaN NaN \n", "1990-07-03 NaN NaN \n", "1990-08-21 NaN NaN \n", "... ... ... \n", "2021-11-30 556.0 NaN \n", "2021-12-01 NaN 22.539497 \n", "2021-12-02 NaN NaN \n", "2021-12-15 NaN NaN \n", "2021-12-17 NaN NaN \n", "\n", " num_tokens_mean_meeting_script num_tokens_mean_minutes \\\n", "date \n", "1990-02-07 30.213010 NaN \n", "1990-03-27 29.846369 NaN \n", "1990-05-15 29.749077 NaN \n", "1990-07-03 29.667920 NaN \n", "1990-08-21 31.032836 NaN \n", "... ... ... \n", "2021-11-30 NaN NaN \n", "2021-12-01 NaN NaN \n", "2021-12-02 NaN NaN \n", "2021-12-15 NaN 30.521429 \n", "2021-12-17 NaN NaN \n", "\n", " num_tokens_mean_press_conf num_tokens_mean_speech \\\n", "date \n", "1990-02-07 NaN NaN \n", "1990-03-27 NaN NaN \n", "1990-05-15 NaN NaN \n", "1990-07-03 NaN NaN \n", "1990-08-21 NaN NaN \n", "... ... ... \n", "2021-11-30 NaN 31.937500 \n", "2021-12-01 NaN NaN \n", "2021-12-02 NaN 36.188889 \n", "2021-12-15 37.587413 NaN \n", "2021-12-17 NaN 29.317460 \n", "\n", " num_tokens_mean_statement num_tokens_mean_testimony \n", "date \n", "1990-02-07 NaN NaN \n", "1990-03-27 NaN NaN \n", "1990-05-15 NaN NaN \n", "1990-07-03 NaN NaN \n", "1990-08-21 NaN NaN \n", "... ... ... 
\n", "2021-11-30 NaN 27.8 \n", "2021-12-01 NaN NaN \n", "2021-12-02 NaN NaN \n", "2021-12-15 27.166667 NaN \n", "2021-12-17 NaN NaN \n", "\n", "[1876 rows x 35 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tone_data_lm = eKonf.load_data('fomc_tone_data_lm.parquet', data_dir)\n", "tone_data_lm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Predict sentiments and aggregate scores with a pipeline" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:ekorpkit.io.file:Processing [1] files from ['fomc_sents.parquet']\n", "INFO:ekorpkit.io.file:Loading 1 dataframes from ['../data/fomc/fomc_sents.parquet']\n", "INFO:ekorpkit.io.file:Loading data from ../data/fomc/fomc_sents.parquet\n", "INFO:ekorpkit.pipelines.pipe:Applying pipeline: OrderedDict([('predict', 'predict'), ('aggregate_scores', 'aggregate_scores'), ('replace', 'replace'), ('pivot', 'pivot'), ('save_dataframe', 'save_dataframe')])\n", "INFO:ekorpkit.base:Applying pipe: functools.partial()\n", "INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...\n", "INFO:ekorpkit.base:Calling load_candidates\n", "INFO:ekorpkit.io.file:Processing [1] files from ['/workspace/projects/ekorpkit/ekorpkit/resources/lexicons/LM.parquet']\n", "INFO:ekorpkit.io.file:Loading 1 dataframes from ['/workspace/projects/ekorpkit/ekorpkit/resources/lexicons/LM.parquet']\n", "INFO:ekorpkit.io.file:Loading data from /workspace/projects/ekorpkit/ekorpkit/resources/lexicons/LM.parquet\n", "INFO:ekorpkit.models.ngram.ngram:loaded 58142 candidates\n", "INFO:ekorpkit.models.sentiment.lbsa:Predicting sentiments of the column [text] using predict_sentence\n", "INFO:ekorpkit.base:Using batcher with minibatch size: 1000\n", "INFO:ekorpkit.utils.batch.batcher: backend: joblib minibatch_size: 1000 procs: 100 input_split: False 
merge_output: True len(data): 653468 len(args): 5\n", "Predicting [text]: 100%|██████████| 654/654 [03:06<00:00, 3.51it/s]\n", "INFO:ekorpkit.models.sentiment.lbsa: >> elapsed time to predict: 0:03:07.279688\n", "INFO:ekorpkit.pipelines.pipe:Saving data to: {'file': None, 'filename': 'fomc_sent_sentiments.parquet', 'base_dir': '../data/fomc', 'filetype': '', 'columns': None, 'suffix': None, 'filepath': '../data/fomc/fomc_sent_sentiments.parquet'}\n", "INFO:ekorpkit.io.file:Saving dataframe to ../data/fomc/fomc_sent_sentiments.parquet\n", "INFO:ekorpkit.base:Applying pipe: functools.partial()\n", "INFO:ekorpkit.base:instantiating ekorpkit.models.sentiment.base.BaseSentimentAnalyser...\n", "INFO:ekorpkit.pipelines.pipe:filename not specified\n", "INFO:ekorpkit.base:Applying pipe: functools.partial()\n", "INFO:ekorpkit.pipelines.pipe:processing column: content_type\n", "INFO:ekorpkit.pipelines.pipe: >> elapsed time to replace: 0:00:00.003644\n", "INFO:ekorpkit.base:Applying pipe: functools.partial()\n", "INFO:ekorpkit.base:Applying pipe: functools.partial()\n", "INFO:ekorpkit.io.file:Saving dataframe to ../data/fomc/fomc_sentiment_data.parquet\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
polarity_mean_beigebookpolarity_mean_meeting_scriptpolarity_mean_minutespolarity_mean_press_confpolarity_mean_speechpolarity_mean_statementpolarity_mean_testimonypolarity_diffusion_beigebookpolarity_diffusion_meeting_scriptpolarity_diffusion_minutes...num_tokens_sum_speechnum_tokens_sum_statementnum_tokens_sum_testimonynum_tokens_mean_beigebooknum_tokens_mean_meeting_scriptnum_tokens_mean_minutesnum_tokens_mean_press_confnum_tokens_mean_speechnum_tokens_mean_statementnum_tokens_mean_testimony
recent_meeting
1990-02-07NaN-0.087583NaNNaNNaNNaNNaNNaN-0.095663NaN...NaNNaNNaNNaN30.213010NaNNaNNaNNaNNaN
1990-03-27NaN-0.171992NaNNaNNaNNaNNaNNaN-0.179702NaN...NaNNaNNaNNaN29.846369NaNNaNNaNNaNNaN
1990-05-15NaN-0.116052NaNNaNNaNNaNNaNNaN-0.125461NaN...NaNNaNNaNNaN29.749077NaNNaNNaNNaNNaN
1990-07-03NaN-0.114829NaNNaNNaNNaNNaNNaN-0.117794NaN...NaNNaNNaNNaN29.667920NaNNaNNaNNaNNaN
1990-08-21NaN-0.209552NaNNaNNaNNaNNaNNaN-0.219403NaN...NaNNaNNaNNaN31.032836NaNNaNNaNNaNNaN
..................................................................
2021-06-160.011743NaN0.041638-0.017544-0.0317860.4358970.2359550.012195NaN0.031142...6894.0384.02522.021.413279NaN30.61591727.10000029.08860829.53846228.337079
2021-07-28-0.120547NaN-0.0439690.021318-0.0429410.461538NaN-0.134921NaN-0.069079...13170.0399.0NaN20.980159NaN32.88815827.26162829.07284830.692308NaN
2021-09-22-0.074328NaN-0.079199-0.087292-0.1338370.4761900.031746-0.075712NaN-0.112403...31138.0419.01026.022.957808NaN31.34883725.80938429.43100229.92857124.428571
2021-11-03-0.046022NaN-0.064255-0.089881-0.0303450.215686-0.120000-0.048109NaN-0.080851...26096.0538.0556.022.539497NaN31.88085127.72023831.98039231.64705927.800000
2021-12-15NaNNaN-0.043929-0.075441-0.3566130.166667NaNNaNNaN-0.064286...3694.0489.0NaNNaNNaN30.52142937.58741329.31746027.166667NaN
\n", "

286 rows × 35 columns

\n", "
" ], "text/plain": [ " polarity_mean_beigebook polarity_mean_meeting_script \\\n", "recent_meeting \n", "1990-02-07 NaN -0.087583 \n", "1990-03-27 NaN -0.171992 \n", "1990-05-15 NaN -0.116052 \n", "1990-07-03 NaN -0.114829 \n", "1990-08-21 NaN -0.209552 \n", "... ... ... \n", "2021-06-16 0.011743 NaN \n", "2021-07-28 -0.120547 NaN \n", "2021-09-22 -0.074328 NaN \n", "2021-11-03 -0.046022 NaN \n", "2021-12-15 NaN NaN \n", "\n", " polarity_mean_minutes polarity_mean_press_conf \\\n", "recent_meeting \n", "1990-02-07 NaN NaN \n", "1990-03-27 NaN NaN \n", "1990-05-15 NaN NaN \n", "1990-07-03 NaN NaN \n", "1990-08-21 NaN NaN \n", "... ... ... \n", "2021-06-16 0.041638 -0.017544 \n", "2021-07-28 -0.043969 0.021318 \n", "2021-09-22 -0.079199 -0.087292 \n", "2021-11-03 -0.064255 -0.089881 \n", "2021-12-15 -0.043929 -0.075441 \n", "\n", " polarity_mean_speech polarity_mean_statement \\\n", "recent_meeting \n", "1990-02-07 NaN NaN \n", "1990-03-27 NaN NaN \n", "1990-05-15 NaN NaN \n", "1990-07-03 NaN NaN \n", "1990-08-21 NaN NaN \n", "... ... ... \n", "2021-06-16 -0.031786 0.435897 \n", "2021-07-28 -0.042941 0.461538 \n", "2021-09-22 -0.133837 0.476190 \n", "2021-11-03 -0.030345 0.215686 \n", "2021-12-15 -0.356613 0.166667 \n", "\n", " polarity_mean_testimony polarity_diffusion_beigebook \\\n", "recent_meeting \n", "1990-02-07 NaN NaN \n", "1990-03-27 NaN NaN \n", "1990-05-15 NaN NaN \n", "1990-07-03 NaN NaN \n", "1990-08-21 NaN NaN \n", "... ... ... \n", "2021-06-16 0.235955 0.012195 \n", "2021-07-28 NaN -0.134921 \n", "2021-09-22 0.031746 -0.075712 \n", "2021-11-03 -0.120000 -0.048109 \n", "2021-12-15 NaN NaN \n", "\n", " polarity_diffusion_meeting_script polarity_diffusion_minutes \\\n", "recent_meeting \n", "1990-02-07 -0.095663 NaN \n", "1990-03-27 -0.179702 NaN \n", "1990-05-15 -0.125461 NaN \n", "1990-07-03 -0.117794 NaN \n", "1990-08-21 -0.219403 NaN \n", "... ... ... 
\n", "2021-06-16 NaN 0.031142 \n", "2021-07-28 NaN -0.069079 \n", "2021-09-22 NaN -0.112403 \n", "2021-11-03 NaN -0.080851 \n", "2021-12-15 NaN -0.064286 \n", "\n", " ... num_tokens_sum_speech num_tokens_sum_statement \\\n", "recent_meeting ... \n", "1990-02-07 ... NaN NaN \n", "1990-03-27 ... NaN NaN \n", "1990-05-15 ... NaN NaN \n", "1990-07-03 ... NaN NaN \n", "1990-08-21 ... NaN NaN \n", "... ... ... ... \n", "2021-06-16 ... 6894.0 384.0 \n", "2021-07-28 ... 13170.0 399.0 \n", "2021-09-22 ... 31138.0 419.0 \n", "2021-11-03 ... 26096.0 538.0 \n", "2021-12-15 ... 3694.0 489.0 \n", "\n", " num_tokens_sum_testimony num_tokens_mean_beigebook \\\n", "recent_meeting \n", "1990-02-07 NaN NaN \n", "1990-03-27 NaN NaN \n", "1990-05-15 NaN NaN \n", "1990-07-03 NaN NaN \n", "1990-08-21 NaN NaN \n", "... ... ... \n", "2021-06-16 2522.0 21.413279 \n", "2021-07-28 NaN 20.980159 \n", "2021-09-22 1026.0 22.957808 \n", "2021-11-03 556.0 22.539497 \n", "2021-12-15 NaN NaN \n", "\n", " num_tokens_mean_meeting_script num_tokens_mean_minutes \\\n", "recent_meeting \n", "1990-02-07 30.213010 NaN \n", "1990-03-27 29.846369 NaN \n", "1990-05-15 29.749077 NaN \n", "1990-07-03 29.667920 NaN \n", "1990-08-21 31.032836 NaN \n", "... ... ... \n", "2021-06-16 NaN 30.615917 \n", "2021-07-28 NaN 32.888158 \n", "2021-09-22 NaN 31.348837 \n", "2021-11-03 NaN 31.880851 \n", "2021-12-15 NaN 30.521429 \n", "\n", " num_tokens_mean_press_conf num_tokens_mean_speech \\\n", "recent_meeting \n", "1990-02-07 NaN NaN \n", "1990-03-27 NaN NaN \n", "1990-05-15 NaN NaN \n", "1990-07-03 NaN NaN \n", "1990-08-21 NaN NaN \n", "... ... ... 
\n", "2021-06-16 27.100000 29.088608 \n", "2021-07-28 27.261628 29.072848 \n", "2021-09-22 25.809384 29.431002 \n", "2021-11-03 27.720238 31.980392 \n", "2021-12-15 37.587413 29.317460 \n", "\n", " num_tokens_mean_statement num_tokens_mean_testimony \n", "recent_meeting \n", "1990-02-07 NaN NaN \n", "1990-03-27 NaN NaN \n", "1990-05-15 NaN NaN \n", "1990-07-03 NaN NaN \n", "1990-08-21 NaN NaN \n", "... ... ... \n", "2021-06-16 29.538462 28.337079 \n", "2021-07-28 30.692308 NaN \n", "2021-09-22 29.928571 24.428571 \n", "2021-11-03 31.647059 27.800000 \n", "2021-12-15 27.166667 NaN \n", "\n", "[286 rows x 35 columns]" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model_cfg = eKonf.compose('model/sentiment=lm')\n", "\n", "cfg = eKonf.compose('pipeline')\n", "cfg._pipeline_ = ['predict', 'aggregate_scores', 'replace', 'pivot', 'save_dataframe']\n", "cfg.num_workers = 100\n", "cfg.data.data_file = \"fomc_sents.parquet\"\n", "cfg.data.data_dir = data_dir\n", "cfg.predict.model = model_cfg\n", "cfg.predict.path.output.base_dir = data_dir\n", "cfg.predict.path.output.filename = \"fomc_sent_sentiments.parquet\"\n", "cfg.aggregate_scores.groupby = ['content_type', 'date']\n", "cfg.replace.apply_to = 'content_type'\n", "cfg.replace.rcParams.to_replace = {'fomc_': ''}\n", "cfg.replace.rcParams.regex = True\n", "cfg.pivot.index = 'date'\n", "cfg.pivot.columns = 'content_type'\n", "cfg.pivot.values = ['polarity_mean', 'polarity_diffusion', 'num_examples', 'num_tokens_sum', 'num_tokens_mean']\n", "cfg.save_dataframe.output_dir = data_dir\n", "cfg.save_dataframe.output_file = 'fomc_tone_data_lm.parquet'\n", "tone_data_lm = eKonf.instantiate(cfg)\n", "tone_data_lm = eKonf.to_datetime(tone_data_lm, _columns='date')\n", "tone_data_lm = tone_data_lm.set_index('date')\n", "eKonf.save_data(tone_data_lm, 'fomc_tone_data_lm.parquet', data_dir)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "cfg 
= eKonf.compose('pipeline')\n", "cfg._pipeline_ = ['aggregate_scores', 'replace', 'pivot', 'save_dataframe']\n", "cfg.num_workers = 100\n", "cfg.data.data_file = \"fomc_sent_sentiments.parquet\"\n", "cfg.data.data_dir = data_dir\n", "cfg.aggregate_scores.groupby = ['content_type', 'date']\n", "cfg.replace.apply_to = 'content_type'\n", "cfg.replace.rcParams.to_replace = {'fomc_': ''}\n", "cfg.replace.rcParams.regex = True\n", "cfg.pivot.index = 'date'\n", "cfg.pivot.columns = 'content_type'\n", "cfg.pivot.values = ['polarity_mean', 'polarity_diffusion', 'num_examples', 'num_tokens_sum', 'num_tokens_mean']\n", "cfg.save_dataframe.output_dir = data_dir\n", "cfg.save_dataframe.output_file = 'fomc_tone_data_lm.parquet'\n", "tone_data_lm = eKonf.instantiate(cfg)\n", "tone_data_lm = eKonf.to_datetime(tone_data_lm, _columns='date')\n", "tone_data_lm = tone_data_lm.set_index('date')\n", "eKonf.save_data(tone_data_lm, 'fomc_tone_data_lm.parquet', data_dir)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Predict sentiments with the finbert" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": false }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "3e7f3be465f449c7a3387b3a8411c554", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/452 [00:00" ] }, "metadata": { "image/png": { "height": 331, "width": 637 } }, "output_type": "display_data" } ], "source": [ "ds_cfg = eKonf.compose('dataset')\n", "ds_cfg.name = 'financial_phrasebank'\n", "ds_cfg.path.cache.uri = 'https://github.com/entelecheia/ekorpkit-book/raw/main/assets/data/financial_phrasebank.zip'\n", "ds_cfg.data_dir = ds_cfg.path.cached_path\n", "ds_cfg.verbose = False\n", "\n", "overrides=[\n", " '+model/transformer=classification',\n", " '+model/transformer/pretrained=finbert',\n", "]\n", "model_cfg = eKonf.compose('model/transformer=classification', overrides)\n", "model_cfg.name = 'fomc_finbert'\n", "model_cfg.dataset = 
ds_cfg\n", "model_cfg.verbose = False\n", "model_cfg.config.num_train_epochs = 2\n", "model_cfg.config.max_seq_length = 256\n", "model_cfg.config.train_batch_size = 32\n", "model_cfg.config.eval_batch_size = 32\n", "model_cfg._method_ = ['eval']\n", "# model_cfg._method_ = ['train', 'eval']\n", "finbert_model = eKonf.instantiate(model_cfg)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": false }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1dd484982db14ccfb46faac97c75c2ed", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/653468 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtextsplittimestampcontent_typedatespeakertitledecisionrate...recent_ratenext_meetingnext_decisionnext_ratetext_num_wordssection_idsent_idpred_labelsraw_predspred_probs
00The Secretary reported that advices of the ele...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.0...3.01993-02-180.03.047290neutral[2.2915966510772705, -0.9586986899375916, -2.4...0.955035
10By unanimous vote, the Committee elected the f...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.0...3.01993-02-180.03.073350neutral[1.692587971687317, -0.2049560397863388, -2.53...0.858684
20By unanimous vote, William J. McDonough, Marga...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.0...3.01993-02-180.03.074370neutral[1.8892569541931152, -0.3972317576408386, -2.6...0.898655
30On January 15, 1993, the continuing rules, reg...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.0...3.01993-02-180.03.059390neutral[2.335063934326172, -1.0927255153656006, -2.45...0.960805
40Members were asked to indicate if they wished ...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.0...3.01993-02-180.03.025391neutral[2.3842966556549072, -1.327033519744873, -2.36...0.967973
\n", "

5 rows × 22 columns

\n", "" ], "text/plain": [ " id text split timestamp \\\n", "0 0 The Secretary reported that advices of the ele... train 1993-02-03 \n", "1 0 By unanimous vote, the Committee elected the f... train 1993-02-03 \n", "2 0 By unanimous vote, William J. McDonough, Marga... train 1993-02-03 \n", "3 0 On January 15, 1993, the continuing rules, reg... train 1993-02-03 \n", "4 0 Members were asked to indicate if they wished ... train 1993-02-03 \n", "\n", " content_type date speaker title decision \\\n", "0 fomc_minutes 1993-02-03 Alan Greenspan FOMC Meeting Minutes 0.0 \n", "1 fomc_minutes 1993-02-03 Alan Greenspan FOMC Meeting Minutes 0.0 \n", "2 fomc_minutes 1993-02-03 Alan Greenspan FOMC Meeting Minutes 0.0 \n", "3 fomc_minutes 1993-02-03 Alan Greenspan FOMC Meeting Minutes 0.0 \n", "4 fomc_minutes 1993-02-03 Alan Greenspan FOMC Meeting Minutes 0.0 \n", "\n", " rate ... recent_rate next_meeting next_decision next_rate \\\n", "0 3.0 ... 3.0 1993-02-18 0.0 3.0 \n", "1 3.0 ... 3.0 1993-02-18 0.0 3.0 \n", "2 3.0 ... 3.0 1993-02-18 0.0 3.0 \n", "3 3.0 ... 3.0 1993-02-18 0.0 3.0 \n", "4 3.0 ... 3.0 1993-02-18 0.0 3.0 \n", "\n", " text_num_words section_id sent_id pred_labels \\\n", "0 47 29 0 neutral \n", "1 73 35 0 neutral \n", "2 74 37 0 neutral \n", "3 59 39 0 neutral \n", "4 25 39 1 neutral \n", "\n", " raw_preds pred_probs \n", "0 [2.2915966510772705, -0.9586986899375916, -2.4... 0.955035 \n", "1 [1.692587971687317, -0.2049560397863388, -2.53... 0.858684 \n", "2 [1.8892569541931152, -0.3972317576408386, -2.6... 0.898655 \n", "3 [2.335063934326172, -1.0927255153656006, -2.45... 0.960805 \n", "4 [2.3842966556549072, -1.327033519744873, -2.36... 
0.967973 \n", "\n", "[5 rows x 22 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model_cfg._method_ = []\n", "cfg = eKonf.compose(config_group='pipeline')\n", "cfg.name = 'fomc_sent_sentiments'\n", "cfg.data_dir = data_dir\n", "cfg.data_file = \"fomc_sents.parquet\"\n", "cfg._pipeline_ = ['predict']\n", "cfg.predict.model = model_cfg\n", "cfg.predict.output_dir = data_dir\n", "cfg.predict.output_file = f'{cfg.name}_finbert.parquet'\n", "fomc_sent_sentiments_finbert = eKonf.instantiate(cfg)\n", "fomc_sent_sentiments_finbert.head()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtextsplittimestampcontent_typedatespeakertitledecisionrate...recent_ratenext_meetingnext_decisionnext_ratetext_num_wordssection_idsent_idpred_labelsraw_predspred_probs
00The Secretary reported that advices of the ele...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.00...3.001993-02-180.03.0047290neutral[2.2915966510772705, -0.9586986899375916, -2.4...0.955035
10By unanimous vote, the Committee elected the f...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.00...3.001993-02-180.03.0073350neutral[1.692587971687317, -0.2049560397863388, -2.53...0.858684
20By unanimous vote, William J. McDonough, Marga...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.00...3.001993-02-180.03.0074370neutral[1.8892569541931152, -0.3972317576408386, -2.6...0.898655
30On January 15, 1993, the continuing rules, reg...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.00...3.001993-02-180.03.0059390neutral[2.335063934326172, -1.0927255153656006, -2.45...0.960805
40Members were asked to indicate if they wished ...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.00...3.001993-02-180.03.0025391neutral[2.3842966556549072, -1.327033519744873, -2.36...0.967973
..................................................................
6534632854It will not have the word “somewhat” on line 3.train2014-12-17fomc_meeting_script2014-12-17MR. LUECKEFOMC Meeting Transcript0.00.25...0.252015-01-280.00.25102872neutral[2.3733553886413574, -1.3893874883651733, -2.2...0.967620
6534642854Chair Yellen Yes Vice Chairman Dudley ...train2014-12-17fomc_meeting_script2014-12-17MR. LUECKEFOMC Meeting Transcript0.00.25...0.252015-01-280.00.25312874neutral[2.224238872528076, -0.893900990486145, -2.421...0.948905
6534652854And let me confirm that the next meeting will ...train2014-12-17fomc_meeting_script2014-12-17CHAIR YELLENFOMC Meeting Transcript0.00.25...0.252015-01-280.00.25192883neutral[2.349479913711548, -1.391977071762085, -2.298...0.967770
6534662854I believe box lunches are now available for pe...train2014-12-17fomc_meeting_script2014-12-17CHAIR YELLENFOMC Meeting Transcript0.00.25...0.252015-01-280.00.25332884neutral[2.311434030532837, -1.1415460109710693, -2.37...0.960746
6534672854I will do my best, and I will consider at the ...train2014-12-17fomc_meeting_script2014-12-17CHAIR YELLENFOMC Meeting Transcript0.00.25...0.252015-01-280.00.25182885neutral[2.3932976722717285, -1.2607381343841553, -2.4...0.967106
\n", "

653468 rows × 22 columns

\n", "
" ], "text/plain": [ " id text split \\\n", "0 0 The Secretary reported that advices of the ele... train \n", "1 0 By unanimous vote, the Committee elected the f... train \n", "2 0 By unanimous vote, William J. McDonough, Marga... train \n", "3 0 On January 15, 1993, the continuing rules, reg... train \n", "4 0 Members were asked to indicate if they wished ... train \n", "... ... ... ... \n", "653463 2854 It will not have the word “somewhat” on line 3. train \n", "653464 2854 Chair Yellen Yes Vice Chairman Dudley ... train \n", "653465 2854 And let me confirm that the next meeting will ... train \n", "653466 2854 I believe box lunches are now available for pe... train \n", "653467 2854 I will do my best, and I will consider at the ... train \n", "\n", " timestamp content_type date speaker \\\n", "0 1993-02-03 fomc_minutes 1993-02-03 Alan Greenspan \n", "1 1993-02-03 fomc_minutes 1993-02-03 Alan Greenspan \n", "2 1993-02-03 fomc_minutes 1993-02-03 Alan Greenspan \n", "3 1993-02-03 fomc_minutes 1993-02-03 Alan Greenspan \n", "4 1993-02-03 fomc_minutes 1993-02-03 Alan Greenspan \n", "... ... ... ... ... \n", "653463 2014-12-17 fomc_meeting_script 2014-12-17 MR. LUECKE \n", "653464 2014-12-17 fomc_meeting_script 2014-12-17 MR. LUECKE \n", "653465 2014-12-17 fomc_meeting_script 2014-12-17 CHAIR YELLEN \n", "653466 2014-12-17 fomc_meeting_script 2014-12-17 CHAIR YELLEN \n", "653467 2014-12-17 fomc_meeting_script 2014-12-17 CHAIR YELLEN \n", "\n", " title decision rate ... recent_rate \\\n", "0 FOMC Meeting Minutes 0.0 3.00 ... 3.00 \n", "1 FOMC Meeting Minutes 0.0 3.00 ... 3.00 \n", "2 FOMC Meeting Minutes 0.0 3.00 ... 3.00 \n", "3 FOMC Meeting Minutes 0.0 3.00 ... 3.00 \n", "4 FOMC Meeting Minutes 0.0 3.00 ... 3.00 \n", "... ... ... ... ... ... \n", "653463 FOMC Meeting Transcript 0.0 0.25 ... 0.25 \n", "653464 FOMC Meeting Transcript 0.0 0.25 ... 0.25 \n", "653465 FOMC Meeting Transcript 0.0 0.25 ... 0.25 \n", "653466 FOMC Meeting Transcript 0.0 0.25 ... 
0.25 \n", "653467 FOMC Meeting Transcript 0.0 0.25 ... 0.25 \n", "\n", " next_meeting next_decision next_rate text_num_words section_id \\\n", "0 1993-02-18 0.0 3.00 47 29 \n", "1 1993-02-18 0.0 3.00 73 35 \n", "2 1993-02-18 0.0 3.00 74 37 \n", "3 1993-02-18 0.0 3.00 59 39 \n", "4 1993-02-18 0.0 3.00 25 39 \n", "... ... ... ... ... ... \n", "653463 2015-01-28 0.0 0.25 10 287 \n", "653464 2015-01-28 0.0 0.25 31 287 \n", "653465 2015-01-28 0.0 0.25 19 288 \n", "653466 2015-01-28 0.0 0.25 33 288 \n", "653467 2015-01-28 0.0 0.25 18 288 \n", "\n", " sent_id pred_labels \\\n", "0 0 neutral \n", "1 0 neutral \n", "2 0 neutral \n", "3 0 neutral \n", "4 1 neutral \n", "... ... ... \n", "653463 2 neutral \n", "653464 4 neutral \n", "653465 3 neutral \n", "653466 4 neutral \n", "653467 5 neutral \n", "\n", " raw_preds pred_probs \n", "0 [2.2915966510772705, -0.9586986899375916, -2.4... 0.955035 \n", "1 [1.692587971687317, -0.2049560397863388, -2.53... 0.858684 \n", "2 [1.8892569541931152, -0.3972317576408386, -2.6... 0.898655 \n", "3 [2.335063934326172, -1.0927255153656006, -2.45... 0.960805 \n", "4 [2.3842966556549072, -1.327033519744873, -2.36... 0.967973 \n", "... ... ... \n", "653463 [2.3733553886413574, -1.3893874883651733, -2.2... 0.967620 \n", "653464 [2.224238872528076, -0.893900990486145, -2.421... 0.948905 \n", "653465 [2.349479913711548, -1.391977071762085, -2.298... 0.967770 \n", "653466 [2.311434030532837, -1.1415460109710693, -2.37... 0.960746 \n", "653467 [2.3932976722717285, -1.2607381343841553, -2.4... 
0.967106 \n", "\n", "[653468 rows x 22 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fomc_sent_sentiments_finbert = eKonf.load_data('fomc_sent_sentiments_finbert.parquet', data_dir)\n", "fomc_sent_sentiments_finbert" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "cfg = eKonf.compose('pipeline')\n", "cfg._pipeline_ = ['aggregate_scores', 'replace', 'pivot', 'save_dataframe']\n", "cfg.num_workers = 100\n", "cfg.data.data_file = \"fomc_sent_sentiments_finbert.parquet\"\n", "cfg.data.data_dir = data_dir\n", "cfg.aggregate_scores.groupby = ['content_type', 'date']\n", "cfg.aggregate_scores._method_ = 'classification'\n", "cfg.replace.apply_to = 'content_type'\n", "cfg.replace.rcParams.to_replace = {'fomc_': ''}\n", "cfg.replace.rcParams.regex = True\n", "cfg.pivot.index = 'date'\n", "cfg.pivot.columns = 'content_type'\n", "cfg.pivot.values = ['polarity_mean', 'polarity_diffusion', 'num_examples']\n", "cfg.save_dataframe.output_dir = data_dir\n", "cfg.save_dataframe.output_file = 'fomc_sentiment_finbert_next.parquet'\n", "tone_data_finbert = eKonf.instantiate(cfg)\n", "tone_data_finbert = eKonf.to_datetime(tone_data_finbert, _columns='date')\n", "tone_data_finbert = tone_data_finbert.set_index('date')\n", "eKonf.save_data(tone_data_finbert, 'fomc_tone_data_finbert.parquet', data_dir)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
finbert_mean_minutesfinbert_mean_press_conffinbert_mean_speechfinbert_mean_statementfinbert_diffusion_minutesfinbert_diffusion_press_conffinbert_diffusion_speechfinbert_diffusion_statement
date
1990-02-07NaNNaNNaNNaNNaNNaNNaNNaN
1990-03-27NaNNaNNaNNaNNaNNaNNaNNaN
1990-05-15NaNNaNNaNNaNNaNNaNNaNNaN
1990-07-03NaNNaNNaNNaNNaNNaNNaNNaN
1990-08-21NaNNaNNaNNaNNaNNaNNaNNaN
...........................
2021-11-30NaNNaN0.182338NaNNaNNaN0.239583NaN
2021-12-01NaNNaNNaNNaNNaNNaNNaNNaN
2021-12-02NaNNaN0.262141NaNNaNNaN0.338889NaN
2021-12-150.5098060.280516NaN0.4129470.6750.377622NaN0.555556
2021-12-17NaNNaN0.408242NaNNaNNaN0.547619NaN
\n", "

1876 rows × 8 columns

\n", "
" ], "text/plain": [ " finbert_mean_minutes finbert_mean_press_conf \\\n", "date \n", "1990-02-07 NaN NaN \n", "1990-03-27 NaN NaN \n", "1990-05-15 NaN NaN \n", "1990-07-03 NaN NaN \n", "1990-08-21 NaN NaN \n", "... ... ... \n", "2021-11-30 NaN NaN \n", "2021-12-01 NaN NaN \n", "2021-12-02 NaN NaN \n", "2021-12-15 0.509806 0.280516 \n", "2021-12-17 NaN NaN \n", "\n", " finbert_mean_speech finbert_mean_statement \\\n", "date \n", "1990-02-07 NaN NaN \n", "1990-03-27 NaN NaN \n", "1990-05-15 NaN NaN \n", "1990-07-03 NaN NaN \n", "1990-08-21 NaN NaN \n", "... ... ... \n", "2021-11-30 0.182338 NaN \n", "2021-12-01 NaN NaN \n", "2021-12-02 0.262141 NaN \n", "2021-12-15 NaN 0.412947 \n", "2021-12-17 0.408242 NaN \n", "\n", " finbert_diffusion_minutes finbert_diffusion_press_conf \\\n", "date \n", "1990-02-07 NaN NaN \n", "1990-03-27 NaN NaN \n", "1990-05-15 NaN NaN \n", "1990-07-03 NaN NaN \n", "1990-08-21 NaN NaN \n", "... ... ... \n", "2021-11-30 NaN NaN \n", "2021-12-01 NaN NaN \n", "2021-12-02 NaN NaN \n", "2021-12-15 0.675 0.377622 \n", "2021-12-17 NaN NaN \n", "\n", " finbert_diffusion_speech finbert_diffusion_statement \n", "date \n", "1990-02-07 NaN NaN \n", "1990-03-27 NaN NaN \n", "1990-05-15 NaN NaN \n", "1990-07-03 NaN NaN \n", "1990-08-21 NaN NaN \n", "... ... ... 
\n", "2021-11-30 0.239583 NaN \n", "2021-12-01 NaN NaN \n", "2021-12-02 0.338889 NaN \n", "2021-12-15 NaN 0.555556 \n", "2021-12-17 0.547619 NaN \n", "\n", "[1876 rows x 8 columns]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tone_data_finbert = eKonf.load_data('fomc_tone_data_finbert.parquet', data_dir)\n", "\n", "cols = [\n", " 'polarity_mean_minutes', 'polarity_mean_press_conf', 'polarity_mean_speech', 'polarity_mean_statement',\n", " 'polarity_diffusion_minutes', 'polarity_diffusion_press_conf', 'polarity_diffusion_speech', 'polarity_diffusion_statement',\n", "]\n", "\n", "tone_data_finbert = tone_data_finbert[cols].copy()\n", "tone_data_finbert.columns = tone_data_finbert.columns.str.replace('polarity', 'finbert')\n", "tone_data_finbert" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Predict sentiments with the T5 model" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/conda/lib/python3.8/site-packages/transformers/models/t5/tokenization_t5.py:164: FutureWarning: This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.\n", "For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.\n", "- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.\n", "- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.\n", "- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.\n", " warnings.warn(\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "424c711520fb441eb0aee942cdeaaf77", "version_major": 2, "version_minor": 0 }, "text/plain": [ 
" 0%| | 0/1445 [00:00" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Tracking run with wandb version 0.12.19" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Run data is saved locally in /workspace/projects/ekorpkit-book/outputs/fomc_t5/t5-base/wandb/run-20220701_020155-3fyvrvgf" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Syncing run wandering-moon-1 to Weights & Biases (docs)
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4c7725f3f17c4bd8ab5ae5ca28621841", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Running Epoch 0 of 2: 0%| | 0/181 [00:00" ] }, "metadata": { "image/png": { "height": 331, "width": 637 } }, "output_type": "display_data" } ], "source": [ "ds_cfg = eKonf.compose('dataset')\n", "ds_cfg.name = 'financial_phrasebank'\n", "ds_cfg.path.cache.uri = 'https://github.com/entelecheia/ekorpkit-book/raw/main/assets/data/financial_phrasebank.zip'\n", "ds_cfg.data_dir = ds_cfg.path.cached_path\n", "ds_cfg.verbose = False\n", "\n", "overrides=[\n", " '+model/transformer=t5_classification_with_simple',\n", " '+model/transformer/pretrained=t5-base',\n", "]\n", "model_cfg = eKonf.compose('model/transformer=t5_classification_with_simple', overrides)\n", "model_cfg.name = 'fomc_t5'\n", "model_cfg.dataset = ds_cfg\n", "model_cfg.verbose = False\n", "model_cfg.config.num_train_epochs = 2\n", "model_cfg.config.max_seq_length = 256\n", "model_cfg.config.train_batch_size = 8\n", "model_cfg.config.eval_batch_size = 8\n", "model_cfg._method_ = ['train', 'eval']\n", "# model_cfg._method_ = ['eval']\n", "t5_model = eKonf.instantiate(model_cfg)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "scrolled": false }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8e778d63986e4e3f97ac43b2cd397a55", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating outputs: 0%| | 0/81684 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtextsplittimestampcontent_typedatespeakertitledecisionrate...recent_decisionrecent_ratenext_meetingnext_decisionnext_ratetext_num_wordssection_idsent_idprefixpred_labels
00The Secretary reported that advices of the ele...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.0...0.03.01993-02-180.03.047290classificationneutral
10By unanimous vote, the Committee elected the f...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.0...0.03.01993-02-180.03.073350classificationneutral
20By unanimous vote, William J. McDonough, Marga...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.0...0.03.01993-02-180.03.074370classificationneutral
30On January 15, 1993, the continuing rules, reg...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.0...0.03.01993-02-180.03.059390classificationneutral
40Members were asked to indicate if they wished ...train1993-02-03fomc_minutes1993-02-03Alan GreenspanFOMC Meeting Minutes0.03.0...0.03.01993-02-180.03.025391classificationneutral
\n", "

5 rows × 21 columns

\n", "" ], "text/plain": [ " id text split timestamp \\\n", "0 0 The Secretary reported that advices of the ele... train 1993-02-03 \n", "1 0 By unanimous vote, the Committee elected the f... train 1993-02-03 \n", "2 0 By unanimous vote, William J. McDonough, Marga... train 1993-02-03 \n", "3 0 On January 15, 1993, the continuing rules, reg... train 1993-02-03 \n", "4 0 Members were asked to indicate if they wished ... train 1993-02-03 \n", "\n", " content_type date speaker title decision \\\n", "0 fomc_minutes 1993-02-03 Alan Greenspan FOMC Meeting Minutes 0.0 \n", "1 fomc_minutes 1993-02-03 Alan Greenspan FOMC Meeting Minutes 0.0 \n", "2 fomc_minutes 1993-02-03 Alan Greenspan FOMC Meeting Minutes 0.0 \n", "3 fomc_minutes 1993-02-03 Alan Greenspan FOMC Meeting Minutes 0.0 \n", "4 fomc_minutes 1993-02-03 Alan Greenspan FOMC Meeting Minutes 0.0 \n", "\n", " rate ... recent_decision recent_rate next_meeting next_decision \\\n", "0 3.0 ... 0.0 3.0 1993-02-18 0.0 \n", "1 3.0 ... 0.0 3.0 1993-02-18 0.0 \n", "2 3.0 ... 0.0 3.0 1993-02-18 0.0 \n", "3 3.0 ... 0.0 3.0 1993-02-18 0.0 \n", "4 3.0 ... 
0.0 3.0 1993-02-18 0.0 \n", "\n", " next_rate text_num_words section_id sent_id prefix pred_labels \n", "0 3.0 47 29 0 classification neutral \n", "1 3.0 73 35 0 classification neutral \n", "2 3.0 74 37 0 classification neutral \n", "3 3.0 59 39 0 classification neutral \n", "4 3.0 25 39 1 classification neutral \n", "\n", "[5 rows x 21 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model_cfg._method_ = []\n", "cfg = eKonf.compose(config_group='pipeline')\n", "cfg.name = 'fomc_sent_sentiments'\n", "cfg.data_dir = data_dir\n", "cfg.data_file = \"fomc_sents.parquet\"\n", "cfg._pipeline_ = ['predict']\n", "cfg.predict.model = model_cfg\n", "cfg.predict.output_dir = data_dir\n", "cfg.predict.output_file = f'{cfg.name}_t5.parquet'\n", "fomc_sent_sentiments_t5 = eKonf.instantiate(cfg)\n", "fomc_sent_sentiments_t5.head()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "cfg = eKonf.compose('pipeline')\n", "cfg._pipeline_ = ['aggregate_scores', 'replace', 'pivot', 'save_dataframe']\n", "cfg.num_workers = 100\n", "cfg.data.data_file = \"fomc_sent_sentiments_t5.parquet\"\n", "cfg.data.data_dir = data_dir\n", "cfg.aggregate_scores.groupby = ['content_type', 'date']\n", "cfg.aggregate_scores._method_ = 'classification_t5'\n", "cfg.replace.apply_to = 'content_type'\n", "cfg.replace.rcParams.to_replace = {'fomc_': ''}\n", "cfg.replace.rcParams.regex = True\n", "cfg.pivot.index = 'date'\n", "cfg.pivot.columns = 'content_type'\n", "cfg.pivot.values = ['polarity_diffusion', 'num_examples']\n", "cfg.save_dataframe.output_dir = data_dir\n", "cfg.save_dataframe.output_file = 'fomc_sentiment_t5_next.parquet'\n", "tone_data_t5 = eKonf.instantiate(cfg)\n", "tone_data_t5 = eKonf.to_datetime(tone_data_t5, _columns='date')\n", "tone_data_t5 = tone_data_t5.set_index('date')\n", "eKonf.save_data(tone_data_t5, 'fomc_tone_data_t5.parquet', data_dir)" ] }, { "cell_type": "code", 
"execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
t5_diffusion_minutest5_diffusion_press_conft5_diffusion_speecht5_diffusion_statement
date
1990-02-07NaNNaNNaNNaN
1990-03-27NaNNaNNaNNaN
1990-05-15NaNNaNNaNNaN
1990-07-03NaNNaNNaNNaN
1990-08-21NaNNaNNaNNaN
...............
2021-11-30NaNNaN0.239583NaN
2021-12-01NaNNaNNaNNaN
2021-12-02NaNNaN0.250000NaN
2021-12-150.4035710.216783NaN0.444444
2021-12-17NaNNaN0.174603NaN
\n", "

1876 rows × 4 columns

\n", "
" ], "text/plain": [ " t5_diffusion_minutes t5_diffusion_press_conf \\\n", "date \n", "1990-02-07 NaN NaN \n", "1990-03-27 NaN NaN \n", "1990-05-15 NaN NaN \n", "1990-07-03 NaN NaN \n", "1990-08-21 NaN NaN \n", "... ... ... \n", "2021-11-30 NaN NaN \n", "2021-12-01 NaN NaN \n", "2021-12-02 NaN NaN \n", "2021-12-15 0.403571 0.216783 \n", "2021-12-17 NaN NaN \n", "\n", " t5_diffusion_speech t5_diffusion_statement \n", "date \n", "1990-02-07 NaN NaN \n", "1990-03-27 NaN NaN \n", "1990-05-15 NaN NaN \n", "1990-07-03 NaN NaN \n", "1990-08-21 NaN NaN \n", "... ... ... \n", "2021-11-30 0.239583 NaN \n", "2021-12-01 NaN NaN \n", "2021-12-02 0.250000 NaN \n", "2021-12-15 NaN 0.444444 \n", "2021-12-17 0.174603 NaN \n", "\n", "[1876 rows x 4 columns]" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tone_data_t5 = eKonf.load_data('fomc_tone_data_t5.parquet', data_dir)\n", "\n", "cols = [\n", " 'polarity_diffusion_minutes', 'polarity_diffusion_press_conf', 'polarity_diffusion_speech', 'polarity_diffusion_statement',\n", "]\n", "\n", "tone_data_t5 = tone_data_t5[cols].copy()\n", "tone_data_t5.columns = tone_data_t5.columns.str.replace('polarity', 't5')\n", "tone_data_t5\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "interpreter": { "hash": "f869af7787e6a1c49e09e367fc6e1b81d93d1c6583b43249c80edc047bd13cb2" }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 2 }