# Build and Load Corpora
import logging
from ekorpkit import eKonf
logging.basicConfig(level=logging.INFO)
print(eKonf.__version__)
0.1.32+22.g1490fd9.dirty
## Build corpora with the ekorpkit configs
cfg = eKonf.compose('corpus/builtin=_dummy_bok_minutes')
cfg.data_dir = '../data/bok_minutes'
cfg.verbose = True
# eKonf.print(cfg)
db = eKonf.instantiate(cfg)
WARNING:ekorpkit.io.file:No files found for bok_minutes-train.parquet
WARNING:ekorpkit.datasets.build:No datasets found
id filename mdate \
0 BOK_20181130_20181218_S1 BOK_20181130_20181218 2018-11-30 10:00:00
1 BOK_20181130_20181218_S2 BOK_20181130_20181218 2018-11-30 10:00:00
2 BOK_20181130_20181218_S3 BOK_20181130_20181218 2018-11-30 10:00:00
3 BOK_20181130_20181218_S4 BOK_20181130_20181218 2018-11-30 10:00:00
4 BOK_20181130_20181218_S5 BOK_20181130_20181218 2018-11-30 10:00:00
rdate section \
0 2018-12-18 16:00:00 Economic Situation
1 2018-12-18 16:00:00 Foreign Currency
2 2018-12-18 16:00:00 Financial Markets
3 2018-12-18 16:00:00 Monetary Policy
4 2018-12-18 16:00:00 Participants’ Views
text
0 일부 위원은 관련부서에서 지난 3/4분기 중 유로지역 경제성장 부진을 자동차 관련 ...
1 일부 위원은 그동안 글로벌펀드와 패시브펀드의 규모가 크게 확대되어 우리나라 자본유출...
2 일부 위원은 현재 대기업들이 전반적으로는 문제가 없지만, 건설 조선업 등에 속하는 ...
3 일부 위원은 최근 경기상황과 금융불균형 등을 고려할 때 확장적 재정정책의 필요성에는...
4 일부 위원은 최근 실물경제 성장경로의 하방위험이 다소 커진 것으로 보이고 물가도 상...
{'category': 'formal',
'column_info': {'_keys_': {'dataset': 'dataset',
'id': 'id',
'split': 'split',
'text': 'text',
'timestamp': 'timestamp'},
'columns': {'id': 'id',
'merge_meta_on': 'id',
'text': 'text',
'timestamp': None},
'data': {'id': 'int', 'text': 'str'},
'datetime': {'columns': None,
'format': None,
'rcParams': None},
'meta': {'filename': 'str',
'id': 'int',
'mdate': 'str',
'rdate': 'str'},
'segment_separator': '\\n\\n',
'sentence_separator': '\\n',
'timestamp': {'format': None, 'key': None, 'rcParams': None}},
'description': 'BOK Minutes Corpus',
'fullname': 'BOK MPB Minutes',
'homepage': 'https://www.bok.or.kr',
'info_updated': '2022-06-15 08:49:00',
'lang': 'ko',
'license': 'Bank of Korea',
'name': 'bok_minutes',
'version': '1.0.0'}
{'category': 'formal',
'column_info': {'_keys_': {'dataset': 'dataset',
'id': 'id',
'split': 'split',
'text': 'text',
'timestamp': 'timestamp'},
'columns': {'id': 'id',
'merge_meta_on': 'id',
'text': 'text',
'timestamp': None},
'data': {'id': 'int', 'text': 'str'},
'datetime': {'columns': None,
'format': None,
'rcParams': None},
'meta': {'filename': 'str',
'id': 'int',
'mdate': 'str',
'rdate': 'str'},
'segment_separator': '\\n\\n',
'sentence_separator': '\\n',
'timestamp': {'format': None, 'key': None, 'rcParams': None}},
'description': 'BOK Minutes Corpus',
'fullname': 'BOK MPB Minutes',
'homepage': 'https://www.bok.or.kr',
'info_updated': '2022-06-15 09:00:59',
'lang': 'ko',
'license': 'Bank of Korea',
'name': 'bok_minutes',
'version': '1.0.0'}
cfg = eKonf.compose("corpus/builtin=_dummy_fomc_minutes")
cfg.data_dir = "../data/fomc_minutes"
db = eKonf.instantiate(cfg)
db.build()
## Instantiating corpora
cfg = eKonf.compose('corpus=corpora')
cfg.name = ['bok_minutes', 'fomc_minutes']
cfg.data_dir = '../data'
cfg.auto.load = True
crps = eKonf.instantiate(cfg)
print(crps)
Corpora
----------
bok_minutes
fomc_minutes
crps['bok_minutes'].data
| index | id | text | split |
|---|---|---|---|
| 0 | 0 | Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ... | train |
crps['fomc_minutes'].data
| index | id | text | content_type | split |
|---|---|---|---|---|
| 0 | 0 | A meeting of the Federal Open Market Committee... | fomc_minutes | train |
| 1 | 1 | A meeting of the Federal Open Market Committee... | fomc_minutes | train |
| 2 | 2 | A meeting of the Federal Open Market Committee... | fomc_minutes | train |
| 3 | 3 | A meeting of the Federal Open Market Committee... | fomc_minutes | train |
| 4 | 4 | A meeting of the Federal Open Market Committee... | fomc_minutes | train |
crps.concat_corpora()
crps.data
|  | id | text | split | corpus | content_type |
|---|---|---|---|---|---|
| 0 | 0 | Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ... | train | bok_minutes | NaN |
| 1 | 0 | A meeting of the Federal Open Market Committee... | train | fomc_minutes | fomc_minutes |
| 2 | 1 | A meeting of the Federal Open Market Committee... | train | fomc_minutes | fomc_minutes |
| 3 | 2 | A meeting of the Federal Open Market Committee... | train | fomc_minutes | fomc_minutes |
| 4 | 3 | A meeting of the Federal Open Market Committee... | train | fomc_minutes | fomc_minutes |
| 5 | 4 | A meeting of the Federal Open Market Committee... | train | fomc_minutes | fomc_minutes |
crps.metadata
|  | id | mdate | rdate | filename | split | corpus | date | speaker | title |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2018-11-30 10:00:00 | 2018-12-18 16:00:00 | BOK_20181130_20181218 | train | bok_minutes | NaN | NaN | NaN |
| 1 | 0 | NaN | NaN | NaN | train | fomc_minutes | 1993-02-03 | Alan Greenspan | FOMC Meeting Minutes |
| 2 | 1 | NaN | NaN | NaN | train | fomc_minutes | 1993-03-23 | Alan Greenspan | FOMC Meeting Minutes |
| 3 | 2 | NaN | NaN | NaN | train | fomc_minutes | 1993-05-18 | Alan Greenspan | FOMC Meeting Minutes |
| 4 | 3 | NaN | NaN | NaN | train | fomc_minutes | 1993-07-07 | Alan Greenspan | FOMC Meeting Minutes |
| 5 | 4 | NaN | NaN | NaN | train | fomc_minutes | 1993-08-17 | Alan Greenspan | FOMC Meeting Minutes |
## Instantiating a corpus
cfg = eKonf.compose('corpus')
cfg.name = 'bok_minutes'
cfg.data_dir = '../data'
cfg.column_info.timestamp.key = 'mdate'
crps = eKonf.instantiate(cfg)
print(crps)
Corpus : bok_minutes
crps.data
| index | id | text | split |
|---|---|---|---|
| 0 | 0 | Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ... | train |
crps.metadata
|  | id | mdate | rdate | filename | split |
|---|---|---|---|---|---|
| 0 | 0 | 2018-11-30 10:00:00 | 2018-12-18 16:00:00 | BOK_20181130_20181218 | train |
print(crps.ID, crps.IDs, crps.TEXT, crps.DATA, crps.METADATA)
id ['id', 'split'] text ['id', 'text', 'split'] ['id', 'mdate', 'rdate', 'filename', 'split']
crps.merge_metadata()
crps.data
|  | id | text | split | mdate | rdate | filename |
|---|---|---|---|---|---|---|
| 0 | 0 | Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ... | train | 2018-11-30 10:00:00 | 2018-12-18 16:00:00 | BOK_20181130_20181218 |
crps.COLUMN.TIMESTAMP_INFO.key = 'mdate'
crps.load_timestamp()
crps.data
|  | id | text | split | mdate | rdate | filename | timestamp |
|---|---|---|---|---|---|---|---|
| 0 | 0 | Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ... | train | 2018-11-30 10:00:00 | 2018-12-18 16:00:00 | BOK_20181130_20181218 | 2018-11-30 10:00:00 |
eKonf.pprint(crps.INFO)
{'category': 'formal',
'column_info': {'_keys_': {'dataset': 'dataset',
'id': 'id',
'split': 'split',
'text': 'text',
'timestamp': 'timestamp'},
'columns': {'id': 'id',
'merge_meta_on': 'id',
'text': 'text',
'timestamp': None},
'data': {'id': 'int', 'text': 'str'},
'datetime': {'columns': None,
'format': None,
'rcParams': None},
'meta': {'filename': 'str',
'id': 'int',
'mdate': 'str',
'rdate': 'str'},
'segment_separator': '\\n\\n',
'sentence_separator': '\\n',
'timestamp': {'format': None, 'key': None, 'rcParams': None}},
'data_files': {'train': 'bok_minutes-train.parquet'},
'data_files_modified': '2022-06-14 02:24:20',
'description': 'BOK Minutes Corpus',
'fullname': 'BOK MPB Minutes',
'homepage': 'https://www.bok.or.kr',
'info_updated': '2022-06-14 02:24:21',
'lang': 'ko',
'license': 'Bank of Korea',
'meta_files': {'train': 'meta-bok_minutes-train.parquet'},
'meta_files_modified': '2022-06-14 02:24:20',
'name': 'bok_minutes',
'num_bytes_before_processing': 88948,
'num_docs': 1,
'num_docs_before_processing': 1,
'num_segments': 5,
'num_sents': 346,
'num_words': 8171,
'size_in_bytes': 88925,
'size_in_human_bytes': '86.84 KiB',
'splits': {'train': {'data_file': 'bok_minutes-train.parquet',
'dataset_name': 'bok_minutes',
'human_bytes': '86.84 KiB',
'human_bytes_wospc': '78.86 KiB',
'meta_file': 'meta-bok_minutes-train.parquet',
'name': 'train',
'num_bytes': 88925,
'num_bytes_before_processing': 88948,
'num_bytes_max': 88925,
'num_bytes_median': 88925.0,
'num_bytes_min': 88925,
'num_bytes_wospc': 80751,
'num_docs': 1,
'num_docs_before_processing': 1,
'num_segments': 5,
'num_segments_median': 5.0,
'num_sents': 346,
'num_sents_median': 346.0,
'num_words': 8171,
'num_words_max': 8171,
'num_words_median': 8171.0,
'num_words_min': 8171}},
'version': '1.0.0'}