{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "cjgz1WTcPWee" }, "source": [ "# Instantiating pipeline" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:ekorpkit.base:IPython version: (6, 9, 0), client: jupyter_client\n", "INFO:ekorpkit.base:Google Colab not detected.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "version: 0.1.33+21.gf33ff55.dirty\n", "is notebook? True\n", "is colab? False\n", "environment variables:\n", "{'EKORPKIT_CONFIG_DIR': '/workspace/projects/ekorpkit-book/config',\n", " 'EKORPKIT_DATA_DIR': None,\n", " 'EKORPKIT_PROJECT': 'ekorpkit-book',\n", " 'EKORPKIT_WORKSPACE_ROOT': '/workspace',\n", " 'NUM_WORKERS': 230}\n" ] } ], "source": [ "import logging\n", "from ekorpkit import eKonf\n", "\n", "logging.basicConfig(level=logging.INFO)\n", "print(\"version:\", eKonf.__version__)\n", "print(\"is notebook?\", eKonf.is_notebook())\n", "print(\"is colab?\", eKonf.is_colab())\n", "print(\"environment variables:\")\n", "eKonf.print(eKonf.env().dict())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load data" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:cached_path:cache of https://github.com/entelecheia/ekorpkit-book/raw/main/assets/data/edgar.zip is up-to-date\n", "INFO:ekorpkit.io.cached_path:cached path: /workspace/.cache/cached_path/8c227c4424ceaa42eb7e3b82c158ea7c8ca6c27910f5f4b29c52d7376c610708.e471900481c811e176bb6ea493388979a9def601f2f456f660131000c602f18b-extracted\n", "INFO:cached_path:cache of https://github.com/entelecheia/ekorpkit-book/raw/main/assets/data/edgar.zip is up-to-date\n", "INFO:ekorpkit.io.cached_path:cached path: /workspace/.cache/cached_path/8c227c4424ceaa42eb7e3b82c158ea7c8ca6c27910f5f4b29c52d7376c610708.e471900481c811e176bb6ea493388979a9def601f2f456f660131000c602f18b-extracted\n", 
"INFO:cached_path:cache of https://github.com/entelecheia/ekorpkit-book/raw/main/assets/data/edgar.zip is up-to-date\n", "INFO:ekorpkit.io.cached_path:cached path: /workspace/.cache/cached_path/8c227c4424ceaa42eb7e3b82c158ea7c8ca6c27910f5f4b29c52d7376c610708.e471900481c811e176bb6ea493388979a9def601f2f456f660131000c602f18b-extracted\n", "INFO:ekorpkit.io.file:Processing [1] files from ['edgar.parquet']\n", "INFO:ekorpkit.io.file:Loading 1 dataframes from ['/workspace/.cache/cached_path/8c227c4424ceaa42eb7e3b82c158ea7c8ca6c27910f5f4b29c52d7376c610708.e471900481c811e176bb6ea493388979a9def601f2f456f660131000c602f18b-extracted/edgar/edgar.parquet']\n", "INFO:ekorpkit.io.file:Loading data from /workspace/.cache/cached_path/8c227c4424ceaa42eb7e3b82c158ea7c8ca6c27910f5f4b29c52d7376c610708.e471900481c811e176bb6ea493388979a9def601f2f456f660131000c602f18b-extracted/edgar/edgar.parquet\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Data: edgar.parquet\n" ] } ], "source": [ "cfg = eKonf.compose(\"data\")\n", "cfg.path.cache.uri = \"https://github.com/entelecheia/ekorpkit-book/raw/main/assets/data/edgar.zip\"\n", "cfg.data_dir = cfg.path.cached_path\n", "cfg.data_dir += \"/edgar\"\n", "cfg.data_file = \"edgar.parquet\"\n", "data = eKonf.instantiate(cfg)\n", "print(data)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING:ekorpkit.pipelines.pipe:No pipeline specified\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idfilenameitemtextcikcompanyfiling_typefiling_dateperiod_of_reportsicstate_of_incstate_locationfiscal_year_endfiling_html_indexhtm_filing_linkcomplete_text_filing_link
141015341999/320193_10K_1999_0000912057-99-010244.jsonitem_1ITEM 1. \\nBUSINESS GENERAL Apple Computer, Inc...320193APPLE COMPUTER INC10-K1999-12-221999-09-253571CACA0930https://www.sec.gov/Archives/edgar/data/320193...Nonehttps://www.sec.gov/Archives/edgar/data/320193...
156016971999/21344_10K_1999_0000021344-00-000009.jsonitem_1ITEM 1. \\nBUSINESS The Coca-Cola Company (toge...21344COCA COLA CO10-K2000-03-091999-12-312080DEGA1231https://www.sec.gov/Archives/edgar/data/21344/...Nonehttps://www.sec.gov/Archives/edgar/data/21344/...
274629771999/70858_10K_1999_0000950168-00-000621.jsonitem_1Item 1. \\nBUSINESS General Bank of America Cor...70858BANK OF AMERICA CORP /DE/10-K2000-03-201999-12-316021DENC1231https://www.sec.gov/Archives/edgar/data/70858/...Nonehttps://www.sec.gov/Archives/edgar/data/70858/...
376240881999/80424_10K_1999_0000080424-99-000027.jsonitem_1Item 1. \\nBusiness. \\n--------- General Develo...80424PROCTER & GAMBLE CO10-K1999-09-151999-06-302840OHOH0630https://www.sec.gov/Archives/edgar/data/80424/...Nonehttps://www.sec.gov/Archives/edgar/data/80424/...
480652111999/1018724_10K_1999_0000891020-00-000622.jsonitem_1ITEM 1. \\nBUSINESS This Annual Report on Form ...1018724AMAZON COM INC10-K2000-03-291999-12-315961DEWA1231https://www.sec.gov/Archives/edgar/data/101872...Nonehttps://www.sec.gov/Archives/edgar/data/101872...
\n", "
" ], "text/plain": [ " id filename item \\\n", "1410 1534 1999/320193_10K_1999_0000912057-99-010244.json item_1 \n", "1560 1697 1999/21344_10K_1999_0000021344-00-000009.json item_1 \n", "2746 2977 1999/70858_10K_1999_0000950168-00-000621.json item_1 \n", "3762 4088 1999/80424_10K_1999_0000080424-99-000027.json item_1 \n", "4806 5211 1999/1018724_10K_1999_0000891020-00-000622.json item_1 \n", "\n", " text cik \\\n", "1410 ITEM 1. \\nBUSINESS GENERAL Apple Computer, Inc... 320193 \n", "1560 ITEM 1. \\nBUSINESS The Coca-Cola Company (toge... 21344 \n", "2746 Item 1. \\nBUSINESS General Bank of America Cor... 70858 \n", "3762 Item 1. \\nBusiness. \\n--------- General Develo... 80424 \n", "4806 ITEM 1. \\nBUSINESS This Annual Report on Form ... 1018724 \n", "\n", " company filing_type filing_date period_of_report \\\n", "1410 APPLE COMPUTER INC 10-K 1999-12-22 1999-09-25 \n", "1560 COCA COLA CO 10-K 2000-03-09 1999-12-31 \n", "2746 BANK OF AMERICA CORP /DE/ 10-K 2000-03-20 1999-12-31 \n", "3762 PROCTER & GAMBLE CO 10-K 1999-09-15 1999-06-30 \n", "4806 AMAZON COM INC 10-K 2000-03-29 1999-12-31 \n", "\n", " sic state_of_inc state_location fiscal_year_end \\\n", "1410 3571 CA CA 0930 \n", "1560 2080 DE GA 1231 \n", "2746 6021 DE NC 1231 \n", "3762 2840 OH OH 0630 \n", "4806 5961 DE WA 1231 \n", "\n", " filing_html_index htm_filing_link \\\n", "1410 https://www.sec.gov/Archives/edgar/data/320193... None \n", "1560 https://www.sec.gov/Archives/edgar/data/21344/... None \n", "2746 https://www.sec.gov/Archives/edgar/data/70858/... None \n", "3762 https://www.sec.gov/Archives/edgar/data/80424/... None \n", "4806 https://www.sec.gov/Archives/edgar/data/101872... None \n", "\n", " complete_text_filing_link \n", "1410 https://www.sec.gov/Archives/edgar/data/320193... \n", "1560 https://www.sec.gov/Archives/edgar/data/21344/... \n", "2746 https://www.sec.gov/Archives/edgar/data/70858/... \n", "3762 https://www.sec.gov/Archives/edgar/data/80424/... 
\n", "4806 https://www.sec.gov/Archives/edgar/data/101872... " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_cfg = eKonf.compose(\"pipeline=blank\")\n", "df_cfg.name = \"edgar_sample\"\n", "df = eKonf.instantiate(df_cfg, data=data)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 898, "referenced_widgets": [ "54b9b04fc6a841b7bef4b6ff4de498db", "7ffb1a5a885a41fa87799861ce118719", "988fe402a83e4cb9ad6407ac00e1ad2d", "983abdf194ad47b5a1f8cee91953ebd9", "84d225df20434a15aea82e0c643591dd", "b89289a931914645adbfcff9a4d9e090", "12e4abc29d824a2598b168f27f6fa332", "72c74a02cd444d7ab5c9f889bc673530", "a76915f2193d4390ad0ec294bbdafac4", "1f58e284d19344448d5263c985563774", "cefa6c092c65415f87b8cf12ab7a8bf9", "2dcef62b95a442f0a4a8d2ce3ea45f40", "e57b71cafa4148cfb0d0ffd96f2ea6f0", "9b5b401f1d1641e49d0b0a35219ca373", "8e323d8366384155b0b47759cab4037b", "7f76f528ac4b444280ac7748e27527b0", "4a0d392123ea4532b35c2a596ff10cd5", "197d83957b8548718680766d669a9466", "3418c9bc2934429c891985b9a681b0c2", "dc9489cd9d824313856bee37a3fd748f", "271ec6c646de43bbbecfac2612762eac", "49d28c060d2b40cdaaae9bb499056f12", "7bd36908e80a4b6d82c2aedac7c86f21", "195b1bdfebfa4a92b4c1e6fbf35f6400", "eefc785b2aef4bb9bd3fc285132da1b2", "5b7f68a9050a411d80eef18e93932343", "86b251e10ca94822a86618a8f8cb47d9", "70df0d2cc0dc4f9585420ce982603a08", "95282f4c0d894c10aea0570db9a8677f", "788dcca72e56413ca73baa4e9c15faec", "4f9e681ef74048ebbfe43a736db57abb", "01c439f9da5743bcb45a69fa0481b0ff", "e23b80cb79a7419aa8d6e6c71bf838b7", "7f0fb0a74aa64f19a7d376d0f1a73a65", "63ae8f532b324e47b99f745c0bc72b80", "0e38b0916cb748e78c07e798af3f4a8a", "36bf876821b1424699297ec472e53f93", "fa24f243079d42638b7c2e217bd10f98", "eddcb8531db34faa99769eb9feeb8b26", "6d2bb6073faa49aea1683b32a97e35a6", "f071b75a512a444fa5fe2f38e27cf1bb", "0ca29e7c8af04bf294fccceb40cf3919", "3266f9f2edaa41a89b63b3999f72f100", 
"fdbacdf8605a4f5e9a5a6945b0014089", "3e1d610e24c043f384fa0f01b7d7ce82", "1e073bc0233f41f7ba7108505ea2757f", "305831734cbb42899f25c2e5b8c602df", "d50b283ff0f04ae6aecab1cd6b19c25d", "52a1a9a2bd4f4b0b82f86b5d05aeb914", "a90fd20b44a448798cc59768c4ff3e91", "788df7d70b66485b8f806fa0bfaf86df", "619355415717491588f8e2d39f303d7c", "d8619afe86614ec993c502a9bcf538d1", "49b89f9ba5c24ce3baa9e9ae25e63528", "52911a9b75684abb9f94d9ec8a070f89", "cd8ef93310224845bdcf8068c3abff56", "ca7983e74d614a5a8bf0ff2adcdf1318", "c0c6731f579c467cb8d1480025457b44", "225293ef6c994750972cad7a6632c438", "731cb2937015420ead9d9702256469a1", "1f56dd05bfd74607a8e243728b928b40", "441848a59fb2492fb3506481f4aed501", "f403eeda7ae24971a70f6faa62f0ae15", "6e8d25287ce747d1a085a01b753091d8", "64c54a1a251649d2baf8565d4881bb3f", "1c4357bbf9974a8fb51ce82bd9367622", "efea188cc0e444e78ee2fcf713a5e82c", "096d6ea9835d4be68ae4c4370b7907a0", "99152acbf2c1437ebcbecd3ec11b4043", "af179eb9b7c14e8799da05633ffb13b3", "8d372455ce584eac917e71de3c3aa686", "e19e880a67df40b4ae57c80dc1c0fe16", "264c896ca80e43d6972d477143e9f374", "349a521daae047789a9f863f01aca055", "80593bd9a5394da7bcbb8d8c720503da", "b4d8e388cf334093bd9b4ea0fda65060", "2687199a44db4f22b10cefd361eee5a6", "1d3b0d741ddb47f6832c4eecd8a7d276", "97c936b172634780823c7978cc9374af", "5f66d8ce192d4ecdb3da061e3b66e20c", "8bed667c5c234057ab578dcc0d9d0c28", "aae5fb1ff0b4456ab7530c204d215028", "c5d82feb6fa343c2a6a29ad40fb96d52", "2a2e7ad98976426dae9aaa146f6fe056", "de2f81d030b14e24a5a5fa23112ef2d0", "6e94654f47254f5ba13fcc0f10313f38", "75de53ad7cd84562875b36f0df877467", "e6301f07063a46d2a7032241de3d9a28" ] }, "id": "k9RZXQ0QcKOc", "outputId": "6d6446c9-c34a-4b42-be5a-ef0949bb2878" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:cached_path:cache of https://github.com/entelecheia/ekorpkit-book/raw/main/assets/data/edgar.zip is up-to-date\n", "INFO:ekorpkit.io.cached_path:cached path: 
/workspace/.cache/cached_path/8c227c4424ceaa42eb7e3b82c158ea7c8ca6c27910f5f4b29c52d7376c610708.e471900481c811e176bb6ea493388979a9def601f2f456f660131000c602f18b-extracted\n", "INFO:cached_path:cache of https://github.com/entelecheia/ekorpkit-book/raw/main/assets/data/edgar.zip is up-to-date\n", "INFO:ekorpkit.io.cached_path:cached path: /workspace/.cache/cached_path/8c227c4424ceaa42eb7e3b82c158ea7c8ca6c27910f5f4b29c52d7376c610708.e471900481c811e176bb6ea493388979a9def601f2f456f660131000c602f18b-extracted\n", "INFO:cached_path:cache of https://github.com/entelecheia/ekorpkit-book/raw/main/assets/data/edgar.zip is up-to-date\n", "INFO:ekorpkit.io.cached_path:cached path: /workspace/.cache/cached_path/8c227c4424ceaa42eb7e3b82c158ea7c8ca6c27910f5f4b29c52d7376c610708.e471900481c811e176bb6ea493388979a9def601f2f456f660131000c602f18b-extracted\n", "INFO:ekorpkit.io.file:Processing [1] files from ['edgar.parquet']\n", "INFO:ekorpkit.io.file:Loading 1 dataframes from ['/workspace/.cache/cached_path/8c227c4424ceaa42eb7e3b82c158ea7c8ca6c27910f5f4b29c52d7376c610708.e471900481c811e176bb6ea493388979a9def601f2f456f660131000c602f18b-extracted/edgar/edgar.parquet']\n", "INFO:ekorpkit.io.file:Loading data from /workspace/.cache/cached_path/8c227c4424ceaa42eb7e3b82c158ea7c8ca6c27910f5f4b29c52d7376c610708.e471900481c811e176bb6ea493388979a9def601f2f456f660131000c602f18b-extracted/edgar/edgar.parquet\n", "WARNING:ekorpkit.pipelines.pipe:No pipeline specified\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idfilenameitemtextcikcompanyfiling_typefiling_dateperiod_of_reportsicstate_of_incstate_locationfiscal_year_endfiling_html_indexhtm_filing_linkcomplete_text_filing_link
141015341999/320193_10K_1999_0000912057-99-010244.jsonitem_1ITEM 1. \\nBUSINESS GENERAL Apple Computer, Inc...320193APPLE COMPUTER INC10-K1999-12-221999-09-253571CACA0930https://www.sec.gov/Archives/edgar/data/320193...Nonehttps://www.sec.gov/Archives/edgar/data/320193...
156016971999/21344_10K_1999_0000021344-00-000009.jsonitem_1ITEM 1. \\nBUSINESS The Coca-Cola Company (toge...21344COCA COLA CO10-K2000-03-091999-12-312080DEGA1231https://www.sec.gov/Archives/edgar/data/21344/...Nonehttps://www.sec.gov/Archives/edgar/data/21344/...
274629771999/70858_10K_1999_0000950168-00-000621.jsonitem_1Item 1. \\nBUSINESS General Bank of America Cor...70858BANK OF AMERICA CORP /DE/10-K2000-03-201999-12-316021DENC1231https://www.sec.gov/Archives/edgar/data/70858/...Nonehttps://www.sec.gov/Archives/edgar/data/70858/...
376240881999/80424_10K_1999_0000080424-99-000027.jsonitem_1Item 1. \\nBusiness. \\n--------- General Develo...80424PROCTER & GAMBLE CO10-K1999-09-151999-06-302840OHOH0630https://www.sec.gov/Archives/edgar/data/80424/...Nonehttps://www.sec.gov/Archives/edgar/data/80424/...
480652111999/1018724_10K_1999_0000891020-00-000622.jsonitem_1ITEM 1. \\nBUSINESS This Annual Report on Form ...1018724AMAZON COM INC10-K2000-03-291999-12-315961DEWA1231https://www.sec.gov/Archives/edgar/data/101872...Nonehttps://www.sec.gov/Archives/edgar/data/101872...
\n", "
" ], "text/plain": [ " id filename item \\\n", "1410 1534 1999/320193_10K_1999_0000912057-99-010244.json item_1 \n", "1560 1697 1999/21344_10K_1999_0000021344-00-000009.json item_1 \n", "2746 2977 1999/70858_10K_1999_0000950168-00-000621.json item_1 \n", "3762 4088 1999/80424_10K_1999_0000080424-99-000027.json item_1 \n", "4806 5211 1999/1018724_10K_1999_0000891020-00-000622.json item_1 \n", "\n", " text cik \\\n", "1410 ITEM 1. \\nBUSINESS GENERAL Apple Computer, Inc... 320193 \n", "1560 ITEM 1. \\nBUSINESS The Coca-Cola Company (toge... 21344 \n", "2746 Item 1. \\nBUSINESS General Bank of America Cor... 70858 \n", "3762 Item 1. \\nBusiness. \\n--------- General Develo... 80424 \n", "4806 ITEM 1. \\nBUSINESS This Annual Report on Form ... 1018724 \n", "\n", " company filing_type filing_date period_of_report \\\n", "1410 APPLE COMPUTER INC 10-K 1999-12-22 1999-09-25 \n", "1560 COCA COLA CO 10-K 2000-03-09 1999-12-31 \n", "2746 BANK OF AMERICA CORP /DE/ 10-K 2000-03-20 1999-12-31 \n", "3762 PROCTER & GAMBLE CO 10-K 1999-09-15 1999-06-30 \n", "4806 AMAZON COM INC 10-K 2000-03-29 1999-12-31 \n", "\n", " sic state_of_inc state_location fiscal_year_end \\\n", "1410 3571 CA CA 0930 \n", "1560 2080 DE GA 1231 \n", "2746 6021 DE NC 1231 \n", "3762 2840 OH OH 0630 \n", "4806 5961 DE WA 1231 \n", "\n", " filing_html_index htm_filing_link \\\n", "1410 https://www.sec.gov/Archives/edgar/data/320193... None \n", "1560 https://www.sec.gov/Archives/edgar/data/21344/... None \n", "2746 https://www.sec.gov/Archives/edgar/data/70858/... None \n", "3762 https://www.sec.gov/Archives/edgar/data/80424/... None \n", "4806 https://www.sec.gov/Archives/edgar/data/101872... None \n", "\n", " complete_text_filing_link \n", "1410 https://www.sec.gov/Archives/edgar/data/320193... \n", "1560 https://www.sec.gov/Archives/edgar/data/21344/... \n", "2746 https://www.sec.gov/Archives/edgar/data/70858/... \n", "3762 https://www.sec.gov/Archives/edgar/data/80424/... 
\n", "4806 https://www.sec.gov/Archives/edgar/data/101872... " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_cfg = eKonf.compose(\"pipeline=blank\")\n", "df_cfg.name = \"edgar_sample\"\n", "df_cfg.path.cache.uri = \"https://github.com/entelecheia/ekorpkit-book/raw/main/assets/data/edgar.zip\"\n", "df_cfg.data_dir = df_cfg.path.cached_path\n", "df_cfg.data_dir += \"/edgar\"\n", "df_cfg.data_file = \"edgar.parquet\"\n", "df_cfg.data_columns = [\"id\", \"filename\", \"item\", \"cik\", \"company\", \"text\"]\n", "df = eKonf.instantiate(df_cfg)\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "it1Y0MVPXRsJ" }, "source": [ "## Process a pipeline with the ekorpkit configs" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'_target_': 'ekorpkit.datasets.corpus.Corpus',\n", " 'auto': {'load': True, 'merge': False},\n", " 'column_info': {'_target_': 'ekorpkit.info.column.CorpusInfo',\n", " 'columns': {'id': 'id',\n", " 'merge_meta_on': 'id',\n", " 'text': 'text',\n", " 'timestamp': None},\n", " 'data': {'id': 'int', 'text': 'str'},\n", " 'datetime': {'columns': None,\n", " 'format': None,\n", " 'rcParams': None},\n", " 'meta': None,\n", " 'segment_separator': '\\\\n\\\\n',\n", " 'sentence_separator': '\\\\n',\n", " 'timestamp': {'format': None, 'key': None, 'rcParams': None}},\n", " 'data_dir': '../data',\n", " 'filetype': None,\n", " 'force': {'build': False},\n", " 'metadata_dir': None,\n", " 'name': 'bok_minutes',\n", " 'path': {'cache': {'cache_dir': '/workspace/.cache',\n", " 'extract_archive': True,\n", " 'force_extract': False,\n", " 'path': None,\n", " 'return_parent_dir': True,\n", " 'uri': None,\n", " 'verbose': False},\n", " 'cached_path': None,\n", " 'columns': None,\n", " 'concat_data': False,\n", " 'data_columns': None,\n", " 'data_dir': '../data',\n", " 'data_file': None,\n", " 'filetype': None,\n", " 
'name': 'bok_minutes',\n", " 'output_dir': None,\n", " 'output_file': None,\n", " 'root': '/workspace/data/bok_minutes',\n", " 'suffix': None,\n", " 'verbose': False},\n", " 'use_name_as_subdir': True,\n", " 'verbose': False}\n" ] } ], "source": [ "corpus_cfg = eKonf.compose(\"corpus\")\n", "corpus_cfg.name = \"bok_minutes\"\n", "corpus_cfg.data_dir = \"../data\"\n", "eKonf.print(corpus_cfg)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 622, "referenced_widgets": [ "950992891ab246b2923e08086d181019", "4a76fba0cb3b490495b78de41e68f6ca", "05dc93f1c30a4bd7a3be3747f9e66c46", "19e2a84812d8481b8195a03a42948c26", "7d4d4b8bb5104627bf1bcbef48480060", "3b485abaef4c489bb2e3aa0205f58ab2", "2543a92c5b7746c4a00dfb771fc58a79", "fdbceec533a64825bec504e6670885bf", "1f319dceda14417bb85bda53a598ac2d", "ea3204c2fb734226830ce8ead05e1c85", "337ac49550a64265b075ce810e8fe90c" ] }, "id": "efZ77OvmPWeh", "outputId": "c75e6828-ca29-49c3-9b40-de15ea64a7d6", "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:ekorpkit.datasets.base:Loaded info file: ../data/bok_minutes/info-bok_minutes.yaml\n", "INFO:ekorpkit.io.file:Processing [1] files from ['bok_minutes-train.parquet']\n", "INFO:ekorpkit.io.file:Loading 1 dataframes from ['../data/bok_minutes/bok_minutes-train.parquet']\n", "INFO:ekorpkit.io.file:Loading data from ../data/bok_minutes/bok_minutes-train.parquet\n", "INFO:ekorpkit.info.column:index: index, index of data: None, columns: ['id', 'text'], id: ['id']\n", "INFO:ekorpkit.info.column:Adding id [split] to ['id']\n", "INFO:ekorpkit.info.column:Added id [split], now ['id', 'split']\n", "INFO:ekorpkit.info.column:Added a column [split] with value [train]\n", "INFO:ekorpkit.io.file:Processing [1] files from ['meta-bok_minutes-train.parquet']\n", "INFO:ekorpkit.io.file:Loading 1 dataframes from ['../data/bok_minutes/meta-bok_minutes-train.parquet']\n", 
"INFO:ekorpkit.io.file:Loading data from ../data/bok_minutes/meta-bok_minutes-train.parquet\n", "INFO:ekorpkit.info.column:Added a column [split] with value [train]\n", "INFO:ekorpkit.info.column:No timestamp key found\n", "INFO:ekorpkit.io.file:Concatenating 1 dataframes\n", "INFO:ekorpkit.pipelines.pipe:Applying pipeline: OrderedDict([('summary_stats', 'summary_stats')])\n", "INFO:ekorpkit.base:Applying pipe: functools.partial()\n", "INFO:ekorpkit.base:Using batcher with minibatch size: 1\n", "INFO:ekorpkit.utils.batch.batcher: backend: joblib minibatch_size: 1 procs: 230 input_split: False merge_output: True len(data): 1 len(args): 5\n", "apply len_bytes to num_bytes: 100%|██████████| 1/1 [00:08<00:00, 8.51s/it]\n", "INFO:ekorpkit.pipelines.pipe:Saving summary stats: ../data/output/bok_minutes_stats.yaml\n" ] } ], "source": [ "cfg = eKonf.compose(\"pipeline\")\n", "cfg.data.corpus = corpus_cfg\n", "cfg._pipeline_ = [\"summary_stats\"]\n", "cfg.summary_stats.output_dir = \"../data/output\"\n", "cfg.summary_stats.output_file = \"bok_minutes_stats.yaml\"\n", "df = eKonf.instantiate(cfg)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "I7lQRCRJ_r2S", "outputId": "9ee27fd8-eaa0-46ac-ec67-699205f6fed7" }, "outputs": [ { "data": { "text/plain": [ "{'num_examples': 1, 'num_bytes': 88934, 'num_bytes_median': 88934.0, 'num_bytes_max': 88934, 'num_bytes_min': 88934, 'human_bytes': '86.85 KiB'}" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os\n", "\n", "filepath = os.path.join(cfg.summary_stats.output_dir, cfg.summary_stats.output_file)\n", "eKonf.load(filepath)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Melt" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pivot" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "data_dir = \"../data/fomc\"\n", "fomc_sentiments = eKonf.load_data('fomc_sentiments.parquet', data_dir)\n", "\n", "cfg = eKonf.compose(\"pipeline/pivot\")\n", "cfg.index = \"recent_meeting\"\n", "cfg.columns = \"content_type\"\n", "cfg.values = [\"polarity_mean\", \"polarity_diffusion\", \"num_examples\"]\n", "data = eKonf.pipe(fomc_sentiments, cfg)\n", "data.tail()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] } ], "metadata": { "colab": { "name": "corpus.ipynb", "provenance": [] }, "interpreter": { "hash": "f869af7787e6a1c49e09e367fc6e1b81d93d1c6583b43249c80edc047bd13cb2" }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 1 }