Build and Load Corpora

import logging
from ekorpkit import eKonf

# Show INFO-level (and higher) log messages in the cells below.
logging.basicConfig(level=logging.INFO)
# Print the version string exposed by eKonf (recorded output follows).
print(eKonf.__version__)
0.1.32+22.g1490fd9.dirty

Build corpora with the ekorpkit configs

# Compose the built-in `_dummy_bok_minutes` corpus config.
cfg = eKonf.compose('corpus/builtin=_dummy_bok_minutes')
# Set the data directory for this corpus.
cfg.data_dir = '../data/bok_minutes'
cfg.verbose = True
# Uncomment to inspect the composed config:
# eKonf.print(cfg)
# NOTE(review): there is no explicit db.build() in this cell, yet the output
# below shows build log messages and a data preview -- presumably instantiating
# this config triggers the build. Confirm, since the FOMC cell further down
# calls db.build() explicitly.
db = eKonf.instantiate(cfg)
WARNING:ekorpkit.io.file:No files found for bok_minutes-train.parquet
WARNING:ekorpkit.datasets.build:No datasets found
                         id               filename                mdate  \
0  BOK_20181130_20181218_S1  BOK_20181130_20181218  2018-11-30 10:00:00   
1  BOK_20181130_20181218_S2  BOK_20181130_20181218  2018-11-30 10:00:00   
2  BOK_20181130_20181218_S3  BOK_20181130_20181218  2018-11-30 10:00:00   
3  BOK_20181130_20181218_S4  BOK_20181130_20181218  2018-11-30 10:00:00   
4  BOK_20181130_20181218_S5  BOK_20181130_20181218  2018-11-30 10:00:00   

                 rdate              section  \
0  2018-12-18 16:00:00   Economic Situation   
1  2018-12-18 16:00:00     Foreign Currency   
2  2018-12-18 16:00:00    Financial Markets   
3  2018-12-18 16:00:00      Monetary Policy   
4  2018-12-18 16:00:00  Participants’ Views   

                                                text  
0  일부 위원은 관련부서에서 지난 3/4분기 중 유로지역 경제성장 부진을 자동차 관련 ...  
1  일부 위원은 그동안 글로벌펀드와 패시브펀드의 규모가 크게 확대되어 우리나라 자본유출...  
2  일부 위원은 현재 대기업들이 전반적으로는 문제가 없지만, 건설 조선업 등에 속하는 ...  
3  일부 위원은 최근 경기상황과 금융불균형 등을 고려할 때 확장적 재정정책의 필요성에는...  
4  일부 위원은 최근 실물경제 성장경로의 하방위험이 다소 커진 것으로 보이고 물가도 상...  
{'category': 'formal',
 'column_info': {'_keys_': {'dataset': 'dataset',
                            'id': 'id',
                            'split': 'split',
                            'text': 'text',
                            'timestamp': 'timestamp'},
                 'columns': {'id': 'id',
                             'merge_meta_on': 'id',
                             'text': 'text',
                             'timestamp': None},
                 'data': {'id': 'int', 'text': 'str'},
                 'datetime': {'columns': None,
                              'format': None,
                              'rcParams': None},
                 'meta': {'filename': 'str',
                          'id': 'int',
                          'mdate': 'str',
                          'rdate': 'str'},
                 'segment_separator': '\\n\\n',
                 'sentence_separator': '\\n',
                 'timestamp': {'format': None, 'key': None, 'rcParams': None}},
 'description': 'BOK Minutes Corpus',
 'fullname': 'BOK MPB Minutes',
 'homepage': 'https://www.bok.or.kr',
 'info_updated': '2022-06-15 08:49:00',
 'lang': 'ko',
 'license': 'Bank of Korea',
 'name': 'bok_minutes',
 'version': '1.0.0'}
{'category': 'formal',
 'column_info': {'_keys_': {'dataset': 'dataset',
                            'id': 'id',
                            'split': 'split',
                            'text': 'text',
                            'timestamp': 'timestamp'},
                 'columns': {'id': 'id',
                             'merge_meta_on': 'id',
                             'text': 'text',
                             'timestamp': None},
                 'data': {'id': 'int', 'text': 'str'},
                 'datetime': {'columns': None,
                              'format': None,
                              'rcParams': None},
                 'meta': {'filename': 'str',
                          'id': 'int',
                          'mdate': 'str',
                          'rdate': 'str'},
                 'segment_separator': '\\n\\n',
                 'sentence_separator': '\\n',
                 'timestamp': {'format': None, 'key': None, 'rcParams': None}},
 'description': 'BOK Minutes Corpus',
 'fullname': 'BOK MPB Minutes',
 'homepage': 'https://www.bok.or.kr',
 'info_updated': '2022-06-15 09:00:59',
 'lang': 'ko',
 'license': 'Bank of Korea',
 'name': 'bok_minutes',
 'version': '1.0.0'}
# Compose the built-in `_dummy_fomc_minutes` corpus config and set its data directory.
cfg = eKonf.compose("corpus/builtin=_dummy_fomc_minutes")
cfg.data_dir = "../data/fomc_minutes"
db = eKonf.instantiate(cfg)
# Explicitly run the build for this corpus.
db.build()

Instantiating corpora

# Compose the multi-corpus (`corpus=corpora`) config.
cfg = eKonf.compose('corpus=corpora')
# Names of the corpora to include.
cfg.name = ['bok_minutes', 'fomc_minutes']
# Parent of the per-corpus directories used in the build cells
# ('../data/bok_minutes', '../data/fomc_minutes').
cfg.data_dir = '../data'
# NOTE(review): presumably makes instantiate() load the corpora immediately -- confirm.
cfg.auto.load = True
crps = eKonf.instantiate(cfg)
# Printing the object lists the member corpus names (see output below).
print(crps)
Corpora
----------
bok_minutes
fomc_minutes
# Look up a member corpus by name and display its data table.
crps['bok_minutes'].data
id text split
index
0 0 Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ... train
# Same lookup for the FOMC corpus; its table also has a `content_type` column.
crps['fomc_minutes'].data
id text content_type split
index
0 0 A meeting of the Federal Open Market Committee... fomc_minutes train
1 1 A meeting of the Federal Open Market Committee... fomc_minutes train
2 2 A meeting of the Federal Open Market Committee... fomc_minutes train
3 3 A meeting of the Federal Open Market Committee... fomc_minutes train
4 4 A meeting of the Federal Open Market Committee... fomc_minutes train
# Concatenate the member corpora. In the combined table shown below, a
# `corpus` column names the source corpus and `content_type` is NaN for the
# bok_minutes row.
crps.concat_corpora()
crps.data
id text split corpus content_type
0 0 Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ... train bok_minutes NaN
1 0 A meeting of the Federal Open Market Committee... train fomc_minutes fomc_minutes
2 1 A meeting of the Federal Open Market Committee... train fomc_minutes fomc_minutes
3 2 A meeting of the Federal Open Market Committee... train fomc_minutes fomc_minutes
4 3 A meeting of the Federal Open Market Committee... train fomc_minutes fomc_minutes
5 4 A meeting of the Federal Open Market Committee... train fomc_minutes fomc_minutes
# Combined metadata table; columns that a corpus does not have appear as NaN
# in the output below.
crps.metadata
id mdate rdate filename split corpus date speaker title
0 0 2018-11-30 10:00:00 2018-12-18 16:00:00 BOK_20181130_20181218 train bok_minutes NaN NaN NaN
1 0 NaN NaN NaN train fomc_minutes 1993-02-03 Alan Greenspan FOMC Meeting Minutes
2 1 NaN NaN NaN train fomc_minutes 1993-03-23 Alan Greenspan FOMC Meeting Minutes
3 2 NaN NaN NaN train fomc_minutes 1993-05-18 Alan Greenspan FOMC Meeting Minutes
4 3 NaN NaN NaN train fomc_minutes 1993-07-07 Alan Greenspan FOMC Meeting Minutes
5 4 NaN NaN NaN train fomc_minutes 1993-08-17 Alan Greenspan FOMC Meeting Minutes

Instantiating a corpus

# Compose the single-corpus config and select one corpus by name.
cfg = eKonf.compose('corpus')
cfg.name = 'bok_minutes'
cfg.data_dir = '../data'
# Use `mdate` (a column of the metadata table shown further down) as the timestamp key.
cfg.column_info.timestamp.key = 'mdate'
crps = eKonf.instantiate(cfg)
print(crps)
Corpus : bok_minutes
# Data table of the corpus (columns shown below: id, text, split).
crps.data
id text split
index
0 0 Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ... train
# Metadata table of the corpus (columns shown below: id, mdate, rdate, filename, split).
crps.metadata
id mdate rdate filename split
0 0 2018-11-30 10:00:00 2018-12-18 16:00:00 BOK_20181130_20181218 train
# Column-name attributes of the corpus; per the output below these are the id
# column, the id columns, the text column, the data columns and the metadata columns.
print(crps.ID, crps.IDs, crps.TEXT, crps.DATA, crps.METADATA)
id ['id', 'split'] text ['id', 'text', 'split'] ['id', 'mdate', 'rdate', 'filename', 'split']
# Merge the metadata into the data table; the output below gains the
# mdate, rdate and filename columns.
crps.merge_metadata()
crps.data
id text split mdate rdate filename
0 0 Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ... train 2018-11-30 10:00:00 2018-12-18 16:00:00 BOK_20181130_20181218
# NOTE(review): the config already set column_info.timestamp.key = 'mdate'
# before instantiation; setting the key again on the instance may be
# redundant -- confirm.
crps.COLUMN.TIMESTAMP_INFO.key = 'mdate'
# After this call the data table shown below has a `timestamp` column whose
# value matches `mdate`.
crps.load_timestamp()
crps.data
id text split mdate rdate filename timestamp
0 0 Economic Situation\n일부 위원은 관련부서에서 지난 3\/4분기 중 ... train 2018-11-30 10:00:00 2018-12-18 16:00:00 BOK_20181130_20181218 2018-11-30 10:00:00
# Pretty-print the corpus info: descriptive fields, column info, data/meta
# file names and size statistics (see output below).
eKonf.pprint(crps.INFO)
{'category': 'formal',
 'column_info': {'_keys_': {'dataset': 'dataset',
                            'id': 'id',
                            'split': 'split',
                            'text': 'text',
                            'timestamp': 'timestamp'},
                 'columns': {'id': 'id',
                             'merge_meta_on': 'id',
                             'text': 'text',
                             'timestamp': None},
                 'data': {'id': 'int', 'text': 'str'},
                 'datetime': {'columns': None,
                              'format': None,
                              'rcParams': None},
                 'meta': {'filename': 'str',
                          'id': 'int',
                          'mdate': 'str',
                          'rdate': 'str'},
                 'segment_separator': '\\n\\n',
                 'sentence_separator': '\\n',
                 'timestamp': {'format': None, 'key': None, 'rcParams': None}},
 'data_files': {'train': 'bok_minutes-train.parquet'},
 'data_files_modified': '2022-06-14 02:24:20',
 'description': 'BOK Minutes Corpus',
 'fullname': 'BOK MPB Minutes',
 'homepage': 'https://www.bok.or.kr',
 'info_updated': '2022-06-14 02:24:21',
 'lang': 'ko',
 'license': 'Bank of Korea',
 'meta_files': {'train': 'meta-bok_minutes-train.parquet'},
 'meta_files_modified': '2022-06-14 02:24:20',
 'name': 'bok_minutes',
 'num_bytes_before_processing': 88948,
 'num_docs': 1,
 'num_docs_before_processing': 1,
 'num_segments': 5,
 'num_sents': 346,
 'num_words': 8171,
 'size_in_bytes': 88925,
 'size_in_human_bytes': '86.84 KiB',
 'splits': {'train': {'data_file': 'bok_minutes-train.parquet',
                      'dataset_name': 'bok_minutes',
                      'human_bytes': '86.84 KiB',
                      'human_bytes_wospc': '78.86 KiB',
                      'meta_file': 'meta-bok_minutes-train.parquet',
                      'name': 'train',
                      'num_bytes': 88925,
                      'num_bytes_before_processing': 88948,
                      'num_bytes_max': 88925,
                      'num_bytes_median': 88925.0,
                      'num_bytes_min': 88925,
                      'num_bytes_wospc': 80751,
                      'num_docs': 1,
                      'num_docs_before_processing': 1,
                      'num_segments': 5,
                      'num_segments_median': 5.0,
                      'num_sents': 346,
                      'num_sents_median': 346.0,
                      'num_words': 8171,
                      'num_words_max': 8171,
                      'num_words_median': 8171.0,
                      'num_words_min': 8171}},
 'version': '1.0.0'}