# Corpus task pipelines
import logging
from ekorpkit import eKonf
# Only show log records at WARNING level or above.
logging.basicConfig(level=logging.WARNING)
# Print the installed ekorpkit version string for reference.
print(eKonf.__version__)
0.1.32+2.g8216333.dirty
# Apply a pipeline to a Corpus
# Compose the "corpus" config group and point it at the "bok_minutes"
# corpus located under ../data.
corpus_cfg = eKonf.compose(config_group="corpus")
corpus_cfg.verbose = False
corpus_cfg.name = "bok_minutes"
# NOTE(review): presumably merges the corpus parts into a single frame on load — confirm.
corpus_cfg.automerge = True
corpus_cfg.data_dir = "../data"
# Compose the "pipeline" config group and attach the corpus config as its data source.
cfg = eKonf.compose(config_group="pipeline")
cfg.verbose = False
cfg.data.corpus = corpus_cfg
# Names of the pipeline steps; each step is configured through the
# same-named key on cfg below.
# NOTE(review): presumably the steps run in list order — confirm.
cfg._pipeline_ = ["filter_query", "save_dataframe"]
# Query selecting a single document by filename (the output below shows one matching row).
cfg.filter_query.query = "filename in ['BOK_20181130_20181218']"
# Destination directory and file name for the save_dataframe step.
cfg.save_dataframe.output_dir = "../data/bok_minutes"
cfg.save_dataframe.output_file = "corpus_filtered.parquet"
# Instantiate the configured pipeline and show the last rows of the result.
data = eKonf.instantiate(cfg)
data.tail()
| | id | text | mdate | rdate | filename |
---|---|---|---|---|---|
0 | 0 | Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ... | 2018-11-30 10:00:00 | 2018-12-18 16:00:00 | BOK_20181130_20181218 |
# Apply a pipeline to Corpora
# Compose the "corpora" variant of the corpus config group so that several
# corpora can be named at once.
corpus_cfg = eKonf.compose(config_group="corpus=corpora")
corpus_cfg.verbose = False
# A list of corpus names instead of a single name.
corpus_cfg.name = ["bok_minutes", "fomc_minutes"]
# NOTE(review): presumably merges the listed corpora into a single frame on load — confirm.
corpus_cfg.automerge = True
corpus_cfg.data_dir = "../data"
# Compose the "pipeline" config group and attach the corpora config as its data source.
cfg = eKonf.compose(config_group="pipeline")
cfg.verbose = False
cfg.data.corpus = corpus_cfg
# Names of the pipeline steps; each step is configured through the
# same-named key on cfg below.
# NOTE(review): presumably the steps run in list order — confirm.
cfg._pipeline_ = ["filter_query", "save_dataframe"]
# Query on the id column; the output below shows one id == 0 row per corpus.
cfg.filter_query.query = "id == 0"
# Destination directory and file name for the save_dataframe step.
cfg.save_dataframe.output_dir = "../data/tmp"
cfg.save_dataframe.output_file = "corpora_filtered.parquet"
# Instantiate the configured pipeline and show the last rows of the result.
data = eKonf.instantiate(cfg)
data.tail()
| | id | text | corpus |
---|---|---|---|
0 | 0 | Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ... | bok_minutes |
1 | 0 | A meeting of the Federal Open Market Committee... | fomc_minutes |