# Build and Load Datasets

In [1]:
import logging
from ekorpkit import eKonf

# Set the root logger threshold to WARNING so lower-severity records are not shown.
logging.basicConfig(level=logging.WARNING)
# Print the installed ekorpkit version so the outputs below can be tied to it.
print(eKonf.__version__)

0.1.32+14.g05da54e.dirty


## Build a dataset with the ekorpkit configs

In [2]:
# Build the "sst2" dataset from its `dataset/simple` config.
ds_name = "sst2"
cfg = eKonf.compose(f"dataset/simple={ds_name}")

# Keep the top-level and io-level data directories pointing at the same place.
cfg.data_dir = f"../data/{ds_name}"
cfg.io.data_dir = cfg.data_dir

# NOTE(review): presumably `overwrite` forces a rebuild over existing files and
# `calculate_stats` triggers the per-split statistics pass — confirm in ekorpkit.
cfg.io.calculate_stats = True
cfg.io.overwrite = True

# Instantiating the composed config runs the build.
db = eKonf.instantiate(cfg)



Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})


apply len_bytes to num_bytes:   0%|          | 0/67349 [00:00<?, ?it/s]



Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 1821
})


apply len_bytes to num_bytes:   0%|          | 0/1821 [00:00<?, ?it/s]



Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 872
})


apply len_bytes to num_bytes:   0%|          | 0/872 [00:00<?, ?it/s]

In [3]:
# Build the "nsmc" dataset from its `dataset/simple` config.
ds_name = "nsmc"
cfg = eKonf.compose(f"dataset/simple={ds_name}")

# Keep the top-level and io-level data directories pointing at the same place.
cfg.data_dir = f"../data/{ds_name}"
cfg.io.data_dir = cfg.data_dir

# NOTE(review): presumably `overwrite` forces a rebuild over existing files and
# `calculate_stats` triggers the per-split statistics pass — confirm in ekorpkit.
cfg.io.calculate_stats = True
cfg.io.overwrite = True

# Instantiating the composed config runs the build.
db = eKonf.instantiate(cfg)



Dataset({
    features: ['id', 'document', 'label'],
    num_rows: 150000
})


apply len_bytes to num_bytes:   0%|          | 0/150000 [00:00<?, ?it/s]



Dataset({
    features: ['id', 'document', 'label'],
    num_rows: 50000
})


apply len_bytes to num_bytes:   0%|          | 0/50000 [00:00<?, ?it/s]

## Instantiating datasets

In [2]:
# Compose the multi-dataset config and select the two datasets by name.
cfg = eKonf.compose("dataset=datasets")
cfg.verbose = False
cfg.data_dir = "../data"
cfg.datasets = ["nsmc", "sst2"]

# Instantiate the collection and print its summary (the member dataset names).
ds = eKonf.instantiate(cfg)
print(ds)

Datasets
----------
nsmc
sst2



In [3]:
# Show the column metadata of the dataset collection before `persist()` is called.
print(ds.COLUMN)

DatasetInfo :
{'_keys_': {'id': 'id', 'text': 'text', 'split': 'split', 'dataset': 'dataset'}, 'columns': {'id': 'id', 'text': 'text'}, 'datetime': {'columns': None, 'format': None, 'rcParams': None}, 'data': {'id': 'int', 'text': 'str'}}


In [4]:
# Left disabled by the author; only `persist()` is executed in this cell.
# ds.concat_datasets()
# NOTE(review): presumably this writes the combined dataset under `data_dir`
# so it can be loaded by name later — confirm. The 'data' dtypes reported by
# `ds.COLUMN` differ before and after this call.
ds.persist()



In [5]:
# Print the column metadata again to compare with the output from before `persist()`.
print(ds.COLUMN)

DatasetInfo :
{'_keys_': {'id': 'id', 'text': 'text', 'split': 'split', 'dataset': 'dataset'}, 'columns': {'id': 'id', 'text': 'text'}, 'datetime': {'columns': None, 'format': None, 'rcParams': None}, 'data': {'subset': 'object', 'labels': 'object', 'text': 'object', 'id': 'int64', 'split': 'object'}}


In [6]:
# Pretty-print the nested INFO dict; a plain print() emits it as one very long,
# hard-to-read line. Uses the same helper as the single-dataset INFO cell.
eKonf.print(ds.INFO)

{'column_info': {'_keys_': {'id': 'id', 'text': 'text', 'split': 'split', 'dataset': 'dataset'}, 'columns': {'id': 'id', 'text': 'text'}, 'datetime': {'columns': None, 'format': None, 'rcParams': None}, 'data': {'id': 'int', 'text': 'str'}, '_target_': 'ekorpkit.info.column.DatasetInfo'}, 'path': {'root': '/workspace/data/None', 'name': None, 'cached_path': None, 'filetype': '.parquet', 'verbose': False, 'data_dir': '../data', 'data_file': None, 'concat_data': False, 'data_columns': None, 'columns': None, 'output_dir': None, 'output_file': None, 'suffix': None, 'cache': {'uri': None, 'extract_archive': True, 'force_extract': False, 'return_parent_dir': True, 'cache_dir': '/workspace/.cache', 'verbose': False, 'path': None}}, 'auto': {'load': True, 'build': False}, 'force': {'rebuild': False}, 'info': {'stats': {'_func_': {'len_bytes': {'_partial_': True, '_target_': 'ekorpkit.utils.func.len_bytes'}}, '_target_': 'ekorpkit.info.stat.summary_stats', '_partial_': True, 'num_workers': 1, '

In [7]:
# The collection exposes a `name`; it is reused below to load the dataset by name.
print(f"Name of a new dataset: {ds.name}")

Name of a new dataset: nsmc-sst2


## Instantiating a dataset

In [8]:
# Compose the single-dataset config and select the dataset by name.
cfg = eKonf.compose("dataset")
cfg.verbose = False
cfg.data_dir = "../data"
cfg.name = "nsmc-sst2"

# Instantiate the dataset and print its summary line.
ds = eKonf.instantiate(cfg)
print(ds)

Dataset : nsmc-sst2


In [9]:
# Show the column metadata of the dataset that was just instantiated by name.
print(ds.COLUMN)

DatasetInfo :
{'_keys_': {'id': 'id', 'text': 'text', 'split': 'split', 'dataset': 'dataset'}, 'columns': {'id': ['id', 'split'], 'text': 'text'}, 'datetime': {'columns': None, 'format': None, 'rcParams': None}, 'data': {'id': 'int64', 'split': 'object', 'labels': 'object', 'subset': 'object', 'text': 'object', '_id_': 'int64', 'dataset': 'object'}}


In [10]:
# Column dtypes of the "train" split (bare expression, shown as the cell result).
ds.splits["train"].dtypes

id          int64
split      object
labels     object
subset     object
text       object
_id_        int64
dataset    object
dtype: object

In [11]:
# Pretty-print the dataset's INFO dict (column info, data files, sizes, splits).
eKonf.print(ds.INFO)

{'column_info': {'_keys_': {'dataset': 'dataset',
                            'id': 'id',
                            'split': 'split',
                            'text': 'text'},
                 'columns': {'id': 'id', 'text': 'text'},
                 'data': {'id': 'int64',
                          'labels': 'object',
                          'split': 'object',
                          'subset': 'object',
                          'text': 'object'},
                 'datetime': {'columns': None,
                              'format': None,
                              'rcParams': None}},
 'data_files': {'test': 'nsmc-sst2-test.parquet',
                'train': 'nsmc-sst2-train.parquet'},
 'data_files_modified': '2022-06-13 10:01:36',
 'info_updated': '2022-06-14 03:10:52',
 'meta_files': {},
 'name': 'nsmc-sst2',
 'num_examples': 269170,
 'size_in_bytes': 21104544,
 'size_in_human_bytes': '20.13 MiB',
 'splits': {'test': {'data_file': 'nsmc-sst2-test.parquet',
              

In [12]:
# Display the mapping of split name -> DataFrame held by the dataset.
# NOTE(review): this renders every split's frame in one output; consider showing
# `ds.splits["train"].head()` instead to keep the notebook output small.
ds.splits

{'train':             id  split    labels subset  \
 index                                    
 0            0  train  negative   None   
 1            1  train  positive   None   
 2            2  train  negative   None   
 3            3  train  negative   None   
 4            4  train  positive   None   
 ...        ...    ...       ...    ...   
 217344  217344  train  positive   sst2   
 217345  217345  train  negative   sst2   
 217346  217346  train  positive   sst2   
 217347  217347  train  positive   sst2   
 217348  217348  train  negative   sst2   
 
                                                      text   _id_ dataset  
 index                                                                     
 0                                     아 더빙.. 진짜 짜증나네요 목소리      0    nsmc  
 1                       흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나      1    nsmc  
 2                                       너무재밓었다그래서보는것을추천한다      2    nsmc  
 3                           교도소 이야기구먼 ..솔직히 재미는 없