Word2Vec Basics

Contents

Word2Vec Basics

import logging
from ekorpkit import eKonf

# Set the root logger threshold to WARNING.
logging.basicConfig(level=logging.WARNING)
# Print the version string exposed by eKonf.
print(eKonf.__version__)

0.1.31+3.g3e40284.dirty

import gensim
# Display the installed gensim version as the cell output.
gensim.__version__

'4.2.0'

English Word2Vec

Load FOMC Corpus

# Compose the 'corpus' configuration group and name the corpus 'fomc'.
corpus_cfg = eKonf.compose(config_group='corpus')
corpus_cfg.name = 'fomc'
# Remote archive holding the corpus; presumably downloaded and cached by
# ekorpkit on first use -- TODO confirm against the ekorpkit docs.
corpus_cfg.cache.uri = 'https://github.com/entelecheia/ekorpkit-book/raw/main/assets/data/fomc.zip'
# Read the corpus from the cache path.
corpus_cfg.data_dir = corpus_cfg.cache.path
# NOTE(review): automerge presumably merges corpus metadata into the data
# frame -- confirm.
corpus_cfg.automerge = True
# Build the corpus object from the configuration and print its summary.
fomc = eKonf.instantiate(corpus_cfg)
print(fomc)

Corpus : fomc

# Keep only the rows whose content_type marks them as FOMC meeting minutes.
is_minutes = fomc.data["content_type"] == "fomc_minutes"
fomc_minutes = fomc.data.loc[is_minutes]
fomc_minutes.head()

	id	text	timestamp	content_type	date	speaker	title
0	0	A meeting of the Federal Open Market Committee...	1993-02-03	fomc_minutes	1993-02-03	Alan Greenspan	FOMC Meeting Minutes
1	1	A meeting of the Federal Open Market Committee...	1993-03-23	fomc_minutes	1993-03-23	Alan Greenspan	FOMC Meeting Minutes
2	2	A meeting of the Federal Open Market Committee...	1993-05-18	fomc_minutes	1993-05-18	Alan Greenspan	FOMC Meeting Minutes
3	3	A meeting of the Federal Open Market Committee...	1993-07-07	fomc_minutes	1993-07-07	Alan Greenspan	FOMC Meeting Minutes
4	4	A meeting of the Federal Open Market Committee...	1993-08-17	fomc_minutes	1993-08-17	Alan Greenspan	FOMC Meeting Minutes

# Tokenizer configuration: the nltk tokenizer with lemmatization enabled and
# stemming disabled.
tk_cfg = eKonf.compose(config_group='preprocessor/tokenizer=nltk')
tk_cfg.nltk.lemmatize = True
tk_cfg.nltk.stem = False

# Pipeline configuration; the steps are listed in _pipeline_ (presumably
# executed in this order -- TODO confirm).
cfg = eKonf.compose(config_group="pipeline")
cfg._pipeline_ = ["tokenize", "extract_tokens", "explode_splits","reset_index", "save_dataframe"]
# Both the tokenize and extract_tokens steps share the same tokenizer config.
cfg.tokenize.preprocessor.tokenizer = tk_cfg
cfg.extract_tokens.preprocessor.tokenizer = tk_cfg
# NOTE(review): nouns_only presumably keeps only noun tokens -- confirm.
cfg.extract_tokens.nouns_only = True
# Split each document on newlines into separate rows; the resulting frame
# carries a "sent_id" column alongside the document "id".
cfg.explode_splits.id_key = "id"
cfg.explode_splits.split_key = "sent_id"
cfg.explode_splits.separator = "\n"
cfg.reset_index.drop_index = True
# Destination of the processed frame: ../data/fomc/fomc_minutes.parquet
cfg.save_dataframe.output_dir = '../data/fomc'
cfg.save_dataframe.output_file = 'fomc_minutes.parquet'
# Run the pipeline on the FOMC minutes and display the result.
df = eKonf.instantiate(cfg, data=fomc_minutes)
df

	id	text	timestamp	content_type	date	speaker	title	sent_id
0	0	meeting Federal Open Market Committee office B...	1993-02-03	fomc_minutes	1993-02-03	Alan Greenspan	FOMC Meeting Minutes	0
1	0		1993-02-03	fomc_minutes	1993-02-03	Alan Greenspan	FOMC Meeting Minutes	1
2	0	PRESENT	1993-02-03	fomc_minutes	1993-02-03	Alan Greenspan	FOMC Meeting Minutes	2
3	0		1993-02-03	fomc_minutes	1993-02-03	Alan Greenspan	FOMC Meeting Minutes	3
4	0	Mr. Greenspan Chairman Mr. Corrigan Vice Chair...	1993-02-03	fomc_minutes	1993-02-03	Alan Greenspan	FOMC Meeting Minutes	4
...	...	...	...	...	...	...	...	...
43699	229	discussion policy normalization consideration ...	2021-12-15	fomc_minutes	2021-12-15	Jerome Powell	FOMC Meeting Minutes	308
43700	229		2021-12-15	fomc_minutes	2021-12-15	Jerome Powell	FOMC Meeting Minutes	309
43701	229	Board Governors Federal Reserve System	2021-12-15	fomc_minutes	2021-12-15	Jerome Powell	FOMC Meeting Minutes	310
43702	229		2021-12-15	fomc_minutes	2021-12-15	Jerome Powell	FOMC Meeting Minutes	311
43703	229	Street Constitution Avenue N.W. Washington DC	2021-12-15	fomc_minutes	2021-12-15	Jerome Powell	FOMC Meeting Minutes	312

43704 rows × 8 columns

# Drop rows whose text is empty, then report how many sentences remain.
has_text = df.text.str.len() > 0
df = df[has_text]
print(f"Total number of sentences : {len(df)}")
df.text.head(10)

Total number of sentences : 21892

   meeting Federal Open Market Committee office B...
                                             PRESENT
   Mr. Greenspan Chairman Mr. Corrigan Vice Chair...
   Messrs. Broaddus Jordan Forrestal Parry Altern...
   Messrs. Hoenig Melzer Syron Presidents Federal...
  Mr. Kohn Secretary Economist Mr. Bernard Deput...
  Messrs. R. Davis Lang Lindsey Promisel Rosenbl...
   Mr. McDonough Manager System Open Market Account
       Ms. Greene Deputy Manager Foreign Operations
    Ms. Lovett,2 Deputy Manager Domestic Operations
Name: text, dtype: object

# Build the training corpus for Word2Vec: lowercase each sentence, split it
# on whitespace, and keep only sentences with more than five tokens.
# Each sentence is tokenized exactly once (the generator is consumed lazily by
# the list comprehension), instead of once for the length check and again for
# the append.
tokenized = (sentence.lower().split() for sentence in df.text)
sentences = [tokens for tokens in tokenized if len(tokens) > 5]

from gensim.models import Word2Vec, KeyedVectors

# Train a CBOW model (sg=0) with 100-dimensional vectors, a context window of
# 5, a minimum word frequency of 5, and 4 worker threads.
w2v_params = {
    "vector_size": 100,
    "window": 5,
    "min_count": 5,
    "workers": 4,
    "sg": 0,
}
model = Word2Vec(sentences=sentences, **w2v_params)

Word2Vec Parameters:

vector_size : int, optional
    Dimensionality of the word vectors.
window : int, optional
    Maximum distance between the current and predicted word within a sentence.
min_count : int, optional
    Ignores all words with total frequency lower than this.
workers : int, optional
    Number of worker threads used to train the model (more threads give faster training on multicore machines).
sg : {0, 1}, optional
    Training algorithm: 1 for skip-gram; otherwise CBOW.

# Look up the words most similar to "fomc" in the trained vectors.
query_word = "fomc"
model_result = model.wv.most_similar(query_word)
print(model_result)

[('meeting', 0.7261571884155273), ('call', 0.7087838053703308), ('conference', 0.706003725528717), ('minute', 0.7013736963272095), ('release', 0.6472852230072021), ('press', 0.635810136795044), ('videoconference', 0.6349100470542908), ('statement', 0.6306827068328857), ('announcement', 0.622832715511322), ('morning', 0.6162055730819702)]

# Persist the word vectors in word2vec format and load them back as
# KeyedVectors; one path variable keeps the save and load targets in sync.
w2v_path = "../data/fomc/fomc_w2v.mdl"
model.wv.save_word2vec_format(w2v_path)
loaded_model = KeyedVectors.load_word2vec_format(w2v_path)

# Repeat the similarity query on the reloaded vectors.
query_word = "fomc"
model_result = loaded_model.most_similar(query_word)
print(model_result)

[('meeting', 0.7261571884155273), ('call', 0.7087838053703308), ('conference', 0.706003725528717), ('minute', 0.7013736963272095), ('release', 0.6472852230072021), ('press', 0.635810136795044), ('videoconference', 0.6349100470542908), ('statement', 0.6306827068328857), ('announcement', 0.622832715511322), ('morning', 0.6162055730819702)]

Korean Word2Vec

Load BOK Minutes Corpus

# Compose the 'corpus' configuration group and name the corpus 'bok_minutes'.
corpus_cfg = eKonf.compose(config_group='corpus')
corpus_cfg.name = 'bok_minutes'
# Remote archive holding the corpus; presumably downloaded and cached by
# ekorpkit on first use -- TODO confirm against the ekorpkit docs.
corpus_cfg.cache.uri = 'https://github.com/entelecheia/ekorpkit-book/raw/main/assets/data/bok_minutes.zip'
# Read the corpus from the cache path.
corpus_cfg.data_dir = corpus_cfg.cache.path
# NOTE(review): automerge presumably merges corpus metadata into the data
# frame -- confirm.
corpus_cfg.automerge = True
# Build the corpus object from the configuration and print its summary.
bok = eKonf.instantiate(corpus_cfg)
print(bok)

Corpus : bok_minutes

# Take a copy of the whole corpus data (no content_type filter is applied
# here) and preview the first rows.
bok_minutes = bok.data.copy()
bok_minutes.head()

	id	text	mdate	rdate	filename
0	0	Economic Situation\n일부 위원은 반도체시장의 재고조정 지속으로 반도...	2005-03-10 10:00:00	2005-05-13 16:00:00	BOK_20050310_20050513
1	1	Economic Situation\n일부 위원은 소비회복의 시기와 폭을 가늠하기 위...	2005-04-07 10:00:00	2005-05-24 16:00:00	BOK_20050407_20050524
2	2	Economic Situation\n일부 위원은 지난해 풍작에 따른 효과와 금년초 ...	2005-05-12 10:00:00	2005-06-28 16:00:00	BOK_20050512_20050628
3	3	Economic Situation\n일부 위원은 최근 소비재판매액 증가세가 다소 둔...	2005-06-09 10:00:00	2005-07-26 16:00:00	BOK_20050609_20050726
4	4	Economic Situation\n일부 위원은 최근 이란의 새 대통령 취임 이후 ...	2005-07-07 10:00:00	2005-08-23 16:00:00	BOK_20050707_20050823

# Tokenizer configuration for the Korean text.
# NOTE(review): 'mecab_econ' is presumably a MeCab-based tokenizer with an
# economics vocabulary -- confirm against the ekorpkit docs.
tk_cfg = eKonf.compose(config_group='preprocessor/tokenizer=mecab_econ')

# Pipeline configuration; the steps are listed in _pipeline_ (presumably
# executed in this order -- TODO confirm).
cfg = eKonf.compose(config_group="pipeline")
cfg._pipeline_ = ["tokenize", "extract_tokens", "explode_splits","reset_index", "save_dataframe"]
# Both the tokenize and extract_tokens steps share the same tokenizer config.
cfg.tokenize.preprocessor.tokenizer = tk_cfg
cfg.extract_tokens.preprocessor.tokenizer = tk_cfg
# NOTE(review): nouns_only presumably keeps only noun tokens -- confirm.
cfg.extract_tokens.nouns_only = True
# Split each document on newlines into separate rows; the resulting frame
# carries a "sent_id" column alongside the document "id".
cfg.explode_splits.id_key = "id"
cfg.explode_splits.split_key = "sent_id"
cfg.explode_splits.separator = "\n"
cfg.reset_index.drop_index = True
# Destination of the processed frame: ../data/bok/bok_minutes.parquet
cfg.save_dataframe.output_dir = '../data/bok'
cfg.save_dataframe.output_file = 'bok_minutes.parquet'
# Run the pipeline on the BOK minutes and display the result.
df = eKonf.instantiate(cfg, data=bok_minutes)
df

	id	text	mdate	rdate	filename	sent_id
0	0	Economic Situation	2005-03-10 10:00:00	2005-05-13 16:00:00	BOK_20050310_20050513	0
1	0	일부 위원 반도체시장 재고조정 지속 반도체 가격 하락 불구 중 반도체 제조 장비 투...	2005-03-10 10:00:00	2005-05-13 16:00:00	BOK_20050310_20050513	1
2	0	관련 부서 반도체 경기 인식 것 반도체 가격 폭 하락 데 기인 것 가격 하락 반면 ...	2005-03-10 10:00:00	2005-05-13 16:00:00	BOK_20050310_20050513	2
3	0	위원 분기 중 경제성장률 당초 전망 수 보고 내용 관련 경우 GDP 마이너스 갭 축...	2005-03-10 10:00:00	2005-05-13 16:00:00	BOK_20050310_20050513	3
4	0	관련 부서 분기 중 경제성장률 경기회복 시기 경우 GDP 갭 축소 수요 측면 물가 ...	2005-03-10 10:00:00	2005-05-13 16:00:00	BOK_20050310_20050513	4
...	...	...	...	...	...	...
33852	162	앞 코로나 충격 회복 해외 완화적 통화정책 조정 것 예상	2021-11-25 10:00:00	2021-12-14 16:00:00	BOK_20211125_20211214	337
33853	162	글로벌 공급망 재편 기후변화 디지털 경제 전환 등 가속 화 가운데 미 중 갈등 중국...	2021-11-25 10:00:00	2021-12-14 16:00:00	BOK_20211125_20211214	338
33854	162	향후 경제 회복세 물가 흐름 금융시장 상황 경제 주체 들 수용 등 점검 금리 중립적...	2021-11-25 10:00:00	2021-12-14 16:00:00	BOK_20211125_20211214	339
33855	162		2021-11-25 10:00:00	2021-12-14 16:00:00	BOK_20211125_20211214	340
33856	162	Government s View	2021-11-25 10:00:00	2021-12-14 16:00:00	BOK_20211125_20211214	341

33857 rows × 6 columns

# Drop rows whose text is empty, then report how many sentences remain.
has_text = df.text.str.len() > 0
df = df[has_text]
print(f"Total number of sentences : {len(df)}")
df.text.head(10)

Total number of sentences : 33027

                                 Economic Situation
  일부 위원 반도체시장 재고조정 지속 반도체 가격 하락 불구 중 반도체 제조 장비 투...
  관련 부서 반도체 경기 인식 것 반도체 가격 폭 하락 데 기인 것 가격 하락 반면 ...
  위원 분기 중 경제성장률 당초 전망 수 보고 내용 관련 경우 GDP 마이너스 갭 축...
  관련 부서 분기 중 경제성장률 경기회복 시기 경우 GDP 갭 축소 수요 측면 물가 ...
  위원 분기 중 국제유가 현재 브렌트유 기준 두바이유 기준 지속 소비자물가 중반 것 ...
  관련 부서 유가 현재 정도 상승 소비자물가 중반 상승 것 예상 이상기후 농수산물 가...
  일부 위원 관련 부서 작성 주요 경제지표 설비투자 작년 전년동기대비 증가세 이후 감...
  관련 부서 설비투자 작년 분기 증가세 전환 동안 대 증가세 유지 분기 감소세 수준면...
  위원 GDP 갭 관련 분기 분기 경제성장률 상승 정도 시차 GDP 갭 효과 것 성장...
Name: text, dtype: object

# Build the training corpus for Word2Vec: lowercase each sentence, split it
# on whitespace, and keep only sentences with more than five tokens.
# Each sentence is tokenized exactly once (the generator is consumed lazily by
# the list comprehension), instead of once for the length check and again for
# the append.
tokenized = (sentence.lower().split() for sentence in df.text)
sentences = [tokens for tokens in tokenized if len(tokens) > 5]

from gensim.models import Word2Vec, KeyedVectors

# Train a CBOW model (sg=0) with 100-dimensional vectors, a context window of
# 5, a minimum word frequency of 5, and 4 worker threads.
w2v_params = {
    "vector_size": 100,
    "window": 5,
    "min_count": 5,
    "workers": 4,
    "sg": 0,
}
model = Word2Vec(sentences=sentences, **w2v_params)

# Look up the words most similar to "금통위" in the trained vectors.
query_word = "금통위"
model_result = model.wv.most_similar(query_word)
print(model_result)

[('회의', 0.8942065238952637), ('금융통화위원회', 0.8351814150810242), ('통방', 0.8271216750144958), ('fomc', 0.8267607092857361), ('개최', 0.7971529960632324), ('지난번', 0.791086733341217), ('한국은행', 0.7783183455467224), ('결문', 0.7717291116714478), ('지난달', 0.7497071623802185), ('금번', 0.7485520243644714)]

# Persist the BOK word vectors in word2vec format and load them back as
# KeyedVectors.
# The vectors are stored under ../data/bok, the same directory the pipeline
# writes bok_minutes.parquet to, rather than under the FOMC data directory
# where they were previously written. One path variable keeps the save and
# load targets in sync.
# NOTE(review): assumes ../data/bok already exists (the save_dataframe step
# is configured to write there) -- confirm before running this cell alone.
w2v_path = "../data/bok/bok_w2v.mdl"
model.wv.save_word2vec_format(w2v_path)
loaded_model = KeyedVectors.load_word2vec_format(w2v_path)

# Repeat the similarity query on the reloaded vectors.
model_result = loaded_model.most_similar("금통위")
print(model_result)

[('회의', 0.8942065238952637), ('금융통화위원회', 0.8351814150810242), ('통방', 0.8271216750144958), ('fomc', 0.8267607092857361), ('개최', 0.7971529960632324), ('지난번', 0.791086733341217), ('한국은행', 0.7783183455467224), ('결문', 0.7717291116714478), ('지난달', 0.7497071623802185), ('금번', 0.7485520243644714)]

# Look up the words most similar to "금리" in the reloaded vectors.
query_word = "금리"
model_result = loaded_model.most_similar(query_word)
print(model_result)

[('장기금리', 0.7175424695014954), ('단기금리', 0.7011380791664124), ('정책금리', 0.6988612413406372), ('수신금리', 0.6762762665748596), ('수익률곡선', 0.6748917102813721), ('가산금리', 0.6711479425430298), ('시장금리', 0.644707977771759), ('대출금리', 0.6405514478683472), ('지급준비율', 0.6337494254112244), ('금리수준', 0.6313076019287109)]