Improving classification datasets
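
This example walks through a complete workflow for improving a classification dataset: remapping noisy label categories, generating out-of-fold predictions with five-fold cross-validation, and using Rubrix to surface and correct likely label errors before saving the improved dataset.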

from ekorpkit import eKonf

eKonf.setLogger("INFO")
print("version:", eKonf.__version__)

is_colab = eKonf.is_colab()
print("is colab?", is_colab)
if is_colab:
    eKonf.mount_google_drive()
workspace_dir = "/workspace"
project_name = "ekorpkit-book/examples/esg"
ws = eKonf.set_workspace(workspace=workspace_dir, project=project_name)
print("project_dir:", ws.project_dir)
ws.envs.dict()
INFO:ekorpkit.utils.notebook:Google Colab not detected.
INFO:ekorpkit.base:Set environment variable EKORPKIT_PROJECT=ekorpkit-book/examples/esg
INFO:ekorpkit.base:Set environment variable EKORPKIT_PROJECT_DIR=/workspace/projects/ekorpkit-book/examples/esg
version: 0.1.40.post0.dev50
is colab? False
INFO:root:compose config with overrides: ['project=default']
INFO:ekorpkit.base:There are no arguments to initilize a config, using default config.
project_dir: /workspace/projects/ekorpkit-book/examples/esg
{'EKORPKIT_CONFIG_DIR': '/workspace/projects/ekorpkit-book/config',
 'EKORPKIT_WORKSPACE_ROOT': '/workspace',
 'EKORPKIT_PROJECT': 'ekorpkit-book/examples/esg',
 'EKORPKIT_PROJECT_DIR': '/workspace/projects/ekorpkit-book/examples/esg',
 'EKORPKIT_DATA_DIR': None,
 'EKORPKIT_LOG_LEVEL': 'INFO',
 'NUM_WORKERS': 230,
 'KMP_DUPLICATE_LIB_OK': 'TRUE',
 'CUDA_DEVICE_ORDER': None,
 'CUDA_VISIBLE_DEVICES': None,
 'WANDB_PROJECT': None,
 'WANDB_DISABLED': None,
 'LABEL_STUDIO_SERVER': 'http://ekorpkit-labelstudio:8080',
 'CACHED_PATH_CACHE_ROOT': None}
time: 1.1 s (started: 2022-12-12 10:24:03 +00:00)

Preparing the esg_topics dataset

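First, compose a dataset configuration, point it at the esg_topics data directory, and print the composed config to see exactly what eKonf.instantiate will use.
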
ds_cfg = eKonf.compose('dataset')
ds_cfg.name = 'esg_topics'
ds_cfg.data_dir = '/workspace/data/datasets/simple'
eKonf.print(ds_cfg)
ds = eKonf.instantiate(ds_cfg)
# labels = list(ds.splits['train'].labels.unique())
# print(labels)
INFO:root:compose config with overrides: ['dataset=default']
{'_target_': 'ekorpkit.datasets.dataset.Dataset',
 'auto': {'build': False, 'load': True},
 'column_info': {'_target_': 'ekorpkit.info.column.DatasetInfo',
                 'columns': {'id': 'id', 'text': 'text'},
                 'data': {'id': 'int', 'text': 'str'},
                 'datetime': {'columns': None,
                              'format': None,
                              'rcParams': None}},
 'data_dir': '/workspace/data/datasets/simple',
 'filetype': '.parquet',
 'force': {'build': False},
 'info': {'_target_': 'ekorpkit.info.stat.SummaryInfo',
          'aggregate_info': {'num_examples': 'num_examples',
                             'size_in_bytes': 'num_bytes'},
          'data_dir': '/workspace/data/datasets/simple',
          'info_file': 'info-esg_topics.yaml',
          'info_list': ['name',
                        'fullname',
                        'domain',
                        'task',
                        'lang',
                        'description',
                        'license',
                        'homepage',
                        'version',
                        'num_examples',
                        'size_in_bytes',
                        'size_in_human_bytes',
                        'data_files_modified',
                        'info_updated',
                        'data_files',
                        'column_info'],
          'key_columns': None,
          'modified_info': {'data_files_modified': 'data_file'},
          'name': 'esg_topics',
          'stats': {'_func_': {'len_bytes': {'_partial_': True,
                                             '_target_': 'ekorpkit.utils.func.len_bytes'}},
                    '_partial_': True,
                    '_target_': 'ekorpkit.info.stat.summary_stats',
                    'agg_funcs': {'num_bytes': ['count',
                                                'sum',
                                                'median',
                                                'max',
                                                'min']},
                    'convert_to_humanbytes': {'num_bytes': 'human_bytes'},
                    'key_columns': None,
                    'num_columns': {'num_bytes': 'len_bytes'},
                    'num_workers': 1,
                    'rename_columns': {'num_bytes_count': 'num_examples',
                                       'num_bytes_sum': 'num_bytes'},
                    'text_keys': 'text'},
          'update_files_info': {'data_files': 'data_file',
                                'meta_files': 'meta_file'},
          'update_info': ['fullname',
                          'lang',
                          'domain',
                          'task',
                          'description',
                          'license',
                          'homepage',
                          'version'],
          'verbose': False},
 'name': 'esg_topics',
 'path': {'cache': {'cache_dir': '/root/.ekorpkit/.cache',
                    'extract_archive': True,
                    'force_extract': False,
                    'path': None,
                    'return_parent_dir': True,
                    'uri': None,
                    'verbose': False},
          'cached_path': None,
          'columns': None,
          'concat_data': False,
          'data_columns': None,
          'data_dir': '/workspace/data/datasets/simple',
          'data_file': None,
          'filetype': '.parquet',
          'name': 'esg_topics',
           'output_dir': '/root/.ekorpkit/projects/ekorpkit-book/examples/esg/esg_topics/outputs',
           'output_file': None,
           'root': '/root/.ekorpkit/projects/ekorpkit-book/examples/esg/esg_topics',
          'suffix': None,
          'verbose': False},
 'use_name_as_subdir': True,
 'verbose': False}
time: 599 ms (started: 2022-12-12 10:27:28 +00:00)
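
Some of the fine-grained categories are merged into broader ones, and two G categories are remapped to 'NA' so they can be dropped in the next step.
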
remap_cat = {
    'E-신재생에너지 발전': 'E-환경혁신',
    'E-원자력발전': 'E-환경혁신',
    'S-기술혁신': 'S-소비자',
    'S-노조/노사': 'S-고용',
    'S-인적자본': 'S-고용',
    'S-산업재해/안전관리': 'S-재해/안전관리',
    'G-정보공시': 'NA',
    'G-주주환원': 'NA',
}
remap_cat
{'E-신재생에너지 발전': 'E-환경혁신',
 'E-원자력발전': 'E-환경혁신',
 'S-기술혁신': 'S-소비자',
 'S-노조/노사': 'S-고용',
 'S-인적자본': 'S-고용',
 'S-산업재해/안전관리': 'S-재해/안전관리',
 'G-정보공시': 'NA',
 'G-주주환원': 'NA'}
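
Apply the remapping to every split, drop the rows that were remapped to 'NA', and save the result as a new dataset, esg_topics_remapped.
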
for split, data in ds._splits.items():
    # merge categories according to remap_cat; labels not in the map are kept as-is
    data['labels'] = data['labels'].map(remap_cat).fillna(data['labels'])
    # drop rows whose label was remapped to 'NA'
    data = data[data.labels != 'NA']
    ds._splits[split] = data
ds.save_as("esg_topics_remapped")
INFO:ekorpkit.base:Using batcher with minibatch size: 39
INFO:ekorpkit.base:Using batcher with minibatch size: 5
INFO:ekorpkit.base:Using batcher with minibatch size: 5
ds_cfg.name = "esg_topics_remapped"
ds = eKonf.instantiate(ds_cfg)
labels = list(ds.splits['train'].labels.unique())
print(labels)
['S-기업(공급망)동반성장/상생', 'G-지배구조', 'G-기업윤리/불공정/소송', 'S-소비자', 'E-환경혁신', 'S-사회공헌', 'S-고용', 'E-환경영향', 'E-기후변화', 'S-재해/안전관리']
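
Ten categories remain after remapping.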

Cross-validation of the esg_topics dataset

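Label-error detection needs, for every example, a prediction from a model that never saw that example during training. model.cross_val_predict(cv=5) produces exactly that: the dataset is split into five folds, the transformer is fine-tuned on four of them, and predictions are collected on the held-out fold each time. The toy sketch below illustrates the idea with scikit-learn; it is purely illustrative and not the ekorpkit API.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

# out-of-fold class probabilities: each row is predicted by the fold that held it out
X, y = make_classification(n_samples=100, n_classes=3, n_informative=5, random_state=0)
oof_probs = cross_val_predict(
    LogisticRegression(max_iter=1000), X, y, cv=5, method="predict_proba"
)
print(oof_probs.shape)  # (100, 3)
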
overrides=[
    '+model/transformer=classification',
    '+model/transformer/pretrained=ekonelectra-base',
]
model_cfg = eKonf.compose('model/transformer=classification', overrides)
model_cfg.name = "esg_topics"
model_cfg.dataset = ds_cfg
model_cfg.verbose = False
model_cfg.config.num_train_epochs = 2
model_cfg.config.max_seq_length = 256
model_cfg.config.train_batch_size = 32
model_cfg.config.eval_batch_size = 32
model_cfg._method_ = []
# model_cfg.model.eval.visualize.plot.confusion_matrix.include_values = False
# model_cfg.model.eval.visualize.plot.confusion_matrix.include_percentages = False
# model_cfg.model.eval.visualize.plot.figure.figsize = (12,10)
model = eKonf.instantiate(model_cfg)
INFO:ekorpkit.base:No method defined to call
cv_preds = model.cross_val_predict(cv=5)
# save the out-of-fold predictions next to the dataset
data_dir = ds.data_dir
eKonf.save_data(cv_preds, "esg_topics_cv_preds.parquet", data_dir)
Some weights of the model checkpoint at entelecheia/ekonelectra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at entelecheia/ekonelectra-base-discriminator and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (533 > 512). Running this sequence through the model will result in indexing errors
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
wandb: Currently logged in as: entelecheia. Use `wandb login --relogin` to force relogin
Tracking run with wandb version 0.13.2
Run data is saved locally in /workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071028-3bx0elnp
Finishing last run (ID:3bx0elnp) before initializing another...
Waiting for W&B process to finish... (success).

Run history:
Training loss █▇▆▅▃▂▁▁▄
acc ▁█
eval_loss █▁
global_step ▁▂▃▄▄▄▅▆▇██
lr █▇▆▅▄▄▃▂▁
mcc ▁█
train_loss █▁

Run summary:
Training loss 1.03131
acc 0.76427
eval_loss 0.80164
global_step 456
lr 0.0
mcc 0.72096
train_loss 0.71639

Synced scarlet-wind-89: https://wandb.ai/entelecheia/ekorpkit-book-esg_topics/runs/3bx0elnp
Synced 4 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
Find logs at: /workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071028-3bx0elnp/logs
Successfully finished last run (ID:3bx0elnp). Initializing new run:
Tracking run with wandb version 0.13.2
Run data is saved locally in /workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071142-1r3nk1yw
Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors
Finishing last run (ID:1r3nk1yw) before initializing another...
Waiting for W&B process to finish... (success).
Synced fiery-plasma-90: https://wandb.ai/entelecheia/ekorpkit-book-esg_topics/runs/1r3nk1yw
Synced 5 W&B file(s), 1 media file(s), 1 artifact file(s) and 0 other file(s)
Find logs at: /workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071142-1r3nk1yw/logs
Successfully finished last run (ID:1r3nk1yw). Initializing new run:
Tracking run with wandb version 0.13.2
Run data is saved locally in /workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071214-kpbs288v
Finishing last run (ID:kpbs288v) before initializing another...
Waiting for W&B process to finish... (success).

Run history:
Training loss █▆▄▅▄▂▂▄▁
acc ▁█
eval_loss █▁
global_step ▁▂▃▄▄▄▅▆▇██
lr █▇▆▅▄▄▃▂▁
mcc ▁█
train_loss ▁█

Run summary:
Training loss 0.47778
acc 0.77445
eval_loss 0.7631
global_step 456
lr 0.0
mcc 0.73189
train_loss 0.85427

Synced sweet-wave-91: https://wandb.ai/entelecheia/ekorpkit-book-esg_topics/runs/kpbs288v
Synced 4 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
Find logs at: /workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071214-kpbs288v/logs
Successfully finished last run (ID:kpbs288v). Initializing new run:
Tracking run with wandb version 0.13.2
Run data is saved locally in /workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071333-w3krkvu5
Token indices sequence length is longer than the specified maximum sequence length for this model (590 > 512). Running this sequence through the model will result in indexing errors
Finishing last run (ID:w3krkvu5) before initializing another...
Waiting for W&B process to finish... (success).
Synced icy-serenity-92: https://wandb.ai/entelecheia/ekorpkit-book-esg_topics/runs/w3krkvu5
Synced 5 W&B file(s), 1 media file(s), 1 artifact file(s) and 0 other file(s)
Find logs at: /workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071333-w3krkvu5/logs
Successfully finished last run (ID:w3krkvu5). Initializing new run:
Tracking run with wandb version 0.13.2
Run data is saved locally in /workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071403-2vv0hewt
Finishing last run (ID:2vv0hewt) before initializing another...
Waiting for W&B process to finish... (success).

Run history:
Training loss █▅▅▂▃▂▃▁▁
acc ▁█
eval_loss █▁
global_step ▁▂▃▄▄▄▅▆▇██
lr █▇▆▅▄▄▃▂▁
mcc ▁█
train_loss ▁█

Run summary:
Training loss 0.67537
acc 0.76936
eval_loss 0.79864
global_step 456
lr 0.0
mcc 0.72731
train_loss 0.44434

Synced northern-dream-93: https://wandb.ai/entelecheia/ekorpkit-book-esg_topics/runs/2vv0hewt
Synced 4 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
Find logs at: /workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071403-2vv0hewt/logs
Successfully finished last run (ID:2vv0hewt). Initializing new run:
Tracking run with wandb version 0.13.2
Run data is saved locally in /workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071523-35lxag5u
Token indices sequence length is longer than the specified maximum sequence length for this model (731 > 512). Running this sequence through the model will result in indexing errors
Finishing last run (ID:35lxag5u) before initializing another...
Waiting for W&B process to finish... (success).
Synced iconic-sea-94: https://wandb.ai/entelecheia/ekorpkit-book-esg_topics/runs/35lxag5u
Synced 5 W&B file(s), 1 media file(s), 1 artifact file(s) and 0 other file(s)
Find logs at: /workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071523-35lxag5u/logs
Successfully finished last run (ID:35lxag5u). Initializing new run:
Tracking run with wandb version 0.13.2
Run data is saved locally in /workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071554-2e7jflgg
Finishing last run (ID:2e7jflgg) before initializing another...
Waiting for W&B process to finish... (success).

Run history:
Training loss █▆▆▃▃▃▂▁▃
acc ▁█
eval_loss █▁
global_step ▁▂▃▄▄▄▅▆▇██
lr █▇▆▅▄▄▃▂▁
mcc ▁█
train_loss █▁

Run summary:
Training loss 1.06689
acc 0.77671
eval_loss 0.78679
global_step 456
lr 0.0
mcc 0.73375
train_loss 0.55624

Synced winter-haze-95: https://wandb.ai/entelecheia/ekorpkit-book-esg_topics/runs/2e7jflgg
Synced 4 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
Find logs at: /workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071554-2e7jflgg/logs
Successfully finished last run (ID:2e7jflgg). Initializing new run:
Tracking run with wandb version 0.13.2
Run data is saved locally in /workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071712-a7mqtfdv
Finishing last run (ID:a7mqtfdv) before initializing another...
Waiting for W&B process to finish... (success).
Synced trim-shadow-96: https://wandb.ai/entelecheia/ekorpkit-book-esg_topics/runs/a7mqtfdv
Synced 5 W&B file(s), 1 media file(s), 1 artifact file(s) and 0 other file(s)
Find logs at: /workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071712-a7mqtfdv/logs
Successfully finished last run (ID:a7mqtfdv). Initializing new run:
Tracking run with wandb version 0.13.2
Run data is saved locally in /workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071742-1sxdb0fs
Finishing last run (ID:1sxdb0fs) before initializing another...
Waiting for W&B process to finish... (success).

Run history:
Training loss █▅▆▅▂▁▃▁▂
acc ▁█
eval_loss █▁
global_step ▁▂▃▄▄▄▅▆▇██
lr █▇▆▅▄▄▃▂▁
mcc ▁█
train_loss ▁█

Run summary:
Training loss 0.89486
acc 0.7671
eval_loss 0.7825
global_step 456
lr 0.0
mcc 0.72317
train_loss 0.5121

Synced super-plasma-97: https://wandb.ai/entelecheia/ekorpkit-book-esg_topics/runs/1sxdb0fs
Synced 4 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
Find logs at: /workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071742-1sxdb0fs/logs
Successfully finished last run (ID:1sxdb0fs). Initializing new run:
Tracking run with wandb version 0.13.2
Run data is saved locally in /workspace/projects/ekorpkit-book/outputs/esg_topics/ekonelectra-base/wandb/run-20220906_071900-2obg8t98

Using Rubrix to find potential label errors

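Rubrix is an open-source tool for exploring and annotating NLP data; ekorpkit ships a thin wrapper around its Python client. Composing the model/rubrix config with auto.init = True connects to the Rubrix server configured in the environment. As a rough sketch, the raw client calls underneath look like the following (assuming a running Rubrix server; the URL, API key, dataset name, and record contents are illustrative placeholders):

import rubrix as rb_client

# connect to the server; URL and key are placeholders
rb_client.init(api_url="http://ekorpkit-book:6900", api_key="rubrix.apikey")
record = rb_client.TextClassificationRecord(
    text="some example text",
    prediction=[("S-소비자", 0.93), ("G-지배구조", 0.02)],  # model class probabilities
    annotation="G-지배구조",  # the (possibly wrong) human label
)
rb_client.log([record], name="my_dataset")  # push the record to a Rubrix dataset
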
rb_cfg = eKonf.compose('model/rubrix')
rb_cfg.auto.init = True
rb = eKonf.instantiate(rb_cfg)
INFO:ekorpkit.base:Loaded .env from /workspace/projects/ekorpkit-book/config/.env
rb.get_workspace()
'esgml'
cv_preds = eKonf.load_data("esg_topics_cv_preds.parquet", data_dir)
records = rb.create_records_from_cv_preds(cv_preds)
records[0]
TextClassificationRecord(text='os 전쟁 구글 , 애플 , ms , 인텔 삼성전자 등 각각 의 iot 전용 os 강화\n업체 들 안드로이드 os 견제 해 아직 채택 안함 센서 고성장 전망 mems ( 반도체 센서 ) 시장 고성장 예상\n다만 , 국내 기술 은 매우 미흡\n인공지능 이 인공지능 이 3 세대 세대 deepdeep learninglearning 의 초기 초기 국면 으로 국면 으로 들어가면서들어가면서 혁신 이 혁신 이 일어나기일어나기 시작 인공지능 연구 가 3 세대 deep learning 의 초기 국면 으로 들어가면서 혁신 이 일어나기 시작 함', inputs={'text': 'os 전쟁 구글 , 애플 , ms , 인텔 삼성전자 등 각각 의 iot 전용 os 강화\n업체 들 안드로이드 os 견제 해 아직 채택 안함 센서 고성장 전망 mems ( 반도체 센서 ) 시장 고성장 예상\n다만 , 국내 기술 은 매우 미흡\n인공지능 이 인공지능 이 3 세대 세대 deepdeep learninglearning 의 초기 초기 국면 으로 국면 으로 들어가면서들어가면서 혁신 이 혁신 이 일어나기일어나기 시작 인공지능 연구 가 3 세대 deep learning 의 초기 국면 으로 들어가면서 혁신 이 일어나기 시작 함'}, prediction=[('E-기후변화', 0.0031880621893694222), ('E-환경영향', 0.005058959626738143), ('E-환경혁신', 0.010744794691714028), ('G-기업윤리/불공정/소송', 0.01250313787700796), ('G-지배구조', 0.018917387394747708), ('S-고용', 0.00931412617366345), ('S-기업(공급망)동반성장/상생', 0.0068541975270435635), ('S-사회공헌', 0.0037565036432219917), ('S-소비자', 0.9259529941013552), ('S-재해/안전관리', 0.003709836775138507)], prediction_agent=None, annotation='S-소비자', annotation_agent=None, multi_label=False, explanation=None, id=None, metadata={'id': 411, 'split': 'train'}, status='Validated', event_timestamp=None, metrics=None, search_keywords=None)
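
rb.find_label_errors ranks the records whose annotation disagrees confidently with the cross-validated predictions, following the confident-learning approach implemented in cleanlab, which Rubrix uses for this feature. Conceptually it boils down to something like the sketch below, shown with cleanlab 2.x on toy arrays rather than the exact implementation:

import numpy as np
from cleanlab.filter import find_label_issues

# toy out-of-fold probabilities for nine examples over three classes
pred_probs = np.array([
    [0.90, 0.05, 0.05], [0.85, 0.10, 0.05], [0.80, 0.10, 0.10],
    [0.10, 0.80, 0.10], [0.05, 0.90, 0.05], [0.10, 0.75, 0.15],
    [0.15, 0.15, 0.70], [0.10, 0.10, 0.80], [0.90, 0.05, 0.05],
])
labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])  # the last label conflicts with the model
issues = find_label_issues(labels, pred_probs, return_indices_ranked_by="self_confidence")
print(issues)  # indices of likely label errors, most suspicious first
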
# get records with potential label errors
records_with_label_error = rb.find_label_errors(records)
records_with_label_error[0]
TextClassificationRecord(text='최근 이동통신 시장 은 lte 서비스 도입 으로 큰 변화 를 맞고 있음\nlte 는 3 g 이동통신망 으로 널리 채택 된 wcdma 방식 에서 발전 된 규격 으로 전세계 이동통신사 의 80 % 이상 이 lte 를 채택 하고 있음\n와이브로 는 한국 이 lte 와 경쟁 하고자 만든 차세대 국책사업 중 하나였지만 , 초기 시장 창출 실패 와 이동통신사업자 들 의 견제 , 정부 의 정책 부재 등으로 사업화 에 난항\n국내 lte 서비스 는 상용화 1 년 만에 1 , 000 만명을 넘어선 반면 , 와이브로 는 6 년이 지난 현재 까지도 사용자 가 100 만명 수준 에 불과한 실정\n이러한 통신시장 의 구조적 인 변화 는 와이브로 에 편중 된 동사 에 위기 로 작용 할 수 있음', inputs={'text': '최근 이동통신 시장 은 lte 서비스 도입 으로 큰 변화 를 맞고 있음\nlte 는 3 g 이동통신망 으로 널리 채택 된 wcdma 방식 에서 발전 된 규격 으로 전세계 이동통신사 의 80 % 이상 이 lte 를 채택 하고 있음\n와이브로 는 한국 이 lte 와 경쟁 하고자 만든 차세대 국책사업 중 하나였지만 , 초기 시장 창출 실패 와 이동통신사업자 들 의 견제 , 정부 의 정책 부재 등으로 사업화 에 난항\n국내 lte 서비스 는 상용화 1 년 만에 1 , 000 만명을 넘어선 반면 , 와이브로 는 6 년이 지난 현재 까지도 사용자 가 100 만명 수준 에 불과한 실정\n이러한 통신시장 의 구조적 인 변화 는 와이브로 에 편중 된 동사 에 위기 로 작용 할 수 있음'}, prediction=[('E-기후변화', 0.0030561790387218834), ('E-환경영향', 0.004940841001211258), ('E-환경혁신', 0.011977991797623686), ('G-기업윤리/불공정/소송', 0.00884969377797024), ('G-지배구조', 0.011248100661326789), ('S-고용', 0.006743854005870231), ('S-기업(공급망)동반성장/상생', 0.006392374888576145), ('S-사회공헌', 0.00277139162106223), ('S-소비자', 0.9404210229503303), ('S-재해/안전관리', 0.0035985502573076407)], prediction_agent=None, annotation='E-환경영향', annotation_agent=None, multi_label=False, explanation=None, id=None, metadata={'id': 851, 'split': 'dev', 'label_error_candidate': 0}, status='Validated', event_timestamp=None, metrics=None, search_keywords=None)
len(records_with_label_error)
1408
# uncover label errors in the Rubrix web app
rb.log(records_with_label_error, "esg_topic_label_errors")
1408 records logged to http://ekorpkit-book:6900/datasets/esgml/esg_topic_label_errors
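
The flagged records can now be reviewed in the Rubrix web app, where each candidate is inspected and its label corrected or validated by hand.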

Saving the re-labelled dataset

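Once the candidates have been reviewed, load the annotated dataset back from Rubrix, merge the corrected labels into each split, and save the result as esg_topics_improved.
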
relabelled_dataset = rb.load("esg_topic_label_errors")
for split, data in ds._splits.items():
    data = rb.update_label_errors(data, relabelled_dataset, split=split)
    ds._splits[split] = data
len(ds.data[ds.data.labels != ds.data.original_labels])
27
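
Only 27 examples ended up with a label different from their original one.
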
ds.save_as("esg_topics_improved")
ds_cfg.name = "esg_topics_improved"
ds = eKonf.instantiate(ds_cfg)
labels = list(ds.splits['train'].labels.unique())
print(labels)
INFO:ekorpkit.base:Using batcher with minibatch size: 39
INFO:ekorpkit.base:Using batcher with minibatch size: 5
INFO:ekorpkit.base:Using batcher with minibatch size: 5
['S-기업(공급망)동반성장/상생', 'G-지배구조', 'G-기업윤리/불공정/소송', 'S-소비자', 'E-환경혁신', 'S-사회공헌', 'S-고용', 'E-환경영향', 'E-기후변화', 'S-재해/안전관리']
ds.data_dir
'/workspace/data/datasets/simple/esg_topics_improved'