Lab 2: EDA on Corpora#
Prepare the environment#
%pip install --pre ekorpkit[dataset]
%config InlineBackend.figure_format='retina'
%load_ext autotime
from ekorpkit import eKonf
eKonf.setLogger("INFO")
print("version:", eKonf.__version__)
is_colab = eKonf.is_colab()
print("is colab?", is_colab)
if is_colab:
    eKonf.mount_google_drive()
workspace_dir = "/content/drive/MyDrive/workspace"
project_name = "ekorpkit-book"
project_dir = eKonf.set_workspace(workspace=workspace_dir, project=project_name)
print("project_dir:", project_dir)
INFO:ekorpkit.utils.notebook:Google Colab not detected.
INFO:ekorpkit.base:Setting EKORPKIT_WORKSPACE_ROOT to /content/drive/MyDrive/workspace
INFO:ekorpkit.base:Setting EKORPKIT_PROJECT to ekorpkit-book
INFO:ekorpkit.base:Loaded .env from /workspace/projects/ekorpkit-book/config/.env
version: 0.1.40.post0.dev21
is colab? False
project_dir: /content/drive/MyDrive/workspace/projects/ekorpkit-book
time: 1.39 s (started: 2022-11-16 23:58:16 +00:00)
Load the saved corpora#
data = eKonf.load_data("wiki_corpus.parquet", project_dir + "/data")
INFO:ekorpkit.io.file:Processing [1] files from ['wiki_corpus.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['/content/drive/MyDrive/workspace/projects/ekorpkit-book/data/wiki_corpus.parquet']
INFO:ekorpkit.io.file:Loading data from /content/drive/MyDrive/workspace/projects/ekorpkit-book/data/wiki_corpus.parquet
time: 14 s (started: 2022-11-17 00:04:02 +00:00)
data.head()
|   | id | text | split | filename | corpus |
|---|---|---|---|---|---|
| 0 | 4915400 |  | train | wiki_92 | enwiki_sampled |
| 1 | 7644961 | Anaissini is a tribe of click beetles in the f... | train | wiki_49 | enwiki_sampled |
| 2 | 6658552 | The Vicky Metcalf Award for Literature for You... | train | wiki_24 | enwiki_sampled |
| 3 | 16385169 | Shri Shivabalayogi Maharaj (24 January 1935 – ... | train | wiki_36 | enwiki_sampled |
| 4 | 11081255 | Eylex Films Pvt is a chain of multiplex and si... | train | wiki_94 | enwiki_sampled |
time: 10.2 ms (started: 2022-11-17 00:04:17 +00:00)
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2522593 entries, 0 to 2522592
Data columns (total 5 columns):
 #   Column    Dtype
---  ------    -----
 0   id        int64
 1   text      object
 2   split     object
 3   filename  object
 4   corpus    object
dtypes: int64(1), object(4)
memory usage: 96.2+ MB
time: 3.92 ms (started: 2022-11-17 00:04:18 +00:00)
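The object columns account for most of the 96 MB. If memory matters, low-cardinality columns can be converted to pandas' categorical dtype; a small optional sketch, not part of the lab flow:

# Optional: shrink low-cardinality object columns with the categorical dtype
for col in ["split", "filename", "corpus"]:
    data[col] = data[col].astype("category")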
Basic statistics#
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
import seaborn as sns
nltk.download("punkt")
# Character counts
data["num_chars"] = data["text"].map(lambda x: len(x))
# Word counts
data["num_words"] = data["text"].map(lambda x: len(x.split()))
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Package punkt is already up-to-date!
time: 16.6 s (started: 2022-11-17 00:05:26 +00:00)
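The lambda maps above work, but pandas' vectorized string accessors compute the same counts without a per-row Python call; an equivalent sketch:

# Equivalent vectorized forms of the two counts above
data["num_chars"] = data["text"].str.len()
data["num_words"] = data["text"].str.split().str.len()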
# format pandas display options
import pandas as pd
pd.options.display.float_format = "{:.0f}".format
# min, max, mean, median, iqr, std
data[["num_chars", "num_words"]].describe()
|       | num_chars | num_words |
|---|---|---|
| count | 2522593 | 2522593 |
| mean | 541 | 93 |
| std | 2308 | 387 |
| min | 0 | 0 |
| 25% | 0 | 0 |
| 50% | 0 | 0 |
| 75% | 213 | 40 |
| max | 264937 | 45338 |
time: 182 ms (started: 2022-11-17 00:05:51 +00:00)
# filter out very short and very long documents (10–250 words);
# .copy() avoids pandas' SettingWithCopyWarning when we add columns to the subset later
data_filtered = data[data.num_words.between(10, 250)].copy()
time: 72.8 ms (started: 2022-11-17 00:06:07 +00:00)
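The band keeps 686,161 of the 2,522,593 documents (about 27%), as the next describe() confirms; a quick check:

# Share of documents kept by the 10–250 word band (≈ 686161 / 2522593)
kept = data.num_words.between(10, 250).mean()
print(f"{kept:.1%} of documents kept")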
# save filtered data
eKonf.save_data(data_filtered, "wiki_filtered.parquet", project_dir + "/data")
INFO:ekorpkit.io.file:Saving dataframe to /content/drive/MyDrive/workspace/projects/ekorpkit-book/data/wiki_filtered.parquet
time: 43.4 s (started: 2022-11-17 00:06:22 +00:00)
data_filtered[["num_chars", "num_words"]].describe()
|       | num_chars | num_words |
|---|---|---|
| count | 686161 | 686161 |
| mean | 419 | 75 |
| std | 373 | 63 |
| min | 25 | 10 |
| 25% | 124 | 23 |
| 50% | 283 | 52 |
| 75% | 612 | 112 |
| max | 3432 | 250 |
time: 47.9 ms (started: 2022-11-17 00:07:07 +00:00)
# Histogram
sns.histplot(data=data_filtered, x="num_chars", hue="corpus", kde=True)
<AxesSubplot:xlabel='num_chars', ylabel='Count'>
time: 5.31 s (started: 2022-11-17 00:07:16 +00:00)
# Histogram
sns.histplot(data=data_filtered, x="num_words", hue="corpus", kde=True)
<AxesSubplot:xlabel='num_words', ylabel='Count'>
time: 3.56 s (started: 2022-11-17 00:07:30 +00:00)
# Sentence counts
data_filtered["num_sents"] = data_filtered["text"].map(lambda x: len(sent_tokenize(x)))
time: 1min 23s (started: 2022-11-17 00:07:52 +00:00)
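Sentence tokenization is the slowest step so far (over a minute for ~686k documents). If the optional pandarallel package is installed, the same map can fan out across cores; a sketch, left commented since it is an extra dependency and its API is an assumption here:

# Optional speed-up (assumes pandarallel's initialize/parallel_map API):
# from pandarallel import pandarallel
# pandarallel.initialize(progress_bar=False)
# data_filtered["num_sents"] = data_filtered["text"].parallel_map(
#     lambda x: len(sent_tokenize(x))
# )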
# Histogram
sns.histplot(data=data_filtered, x="num_sents", hue="corpus", kde=True)
<AxesSubplot:xlabel='num_sents', ylabel='Count'>
time: 7.1 s (started: 2022-11-17 00:09:23 +00:00)
# Average number of characters per word (approximate: num_chars also counts spaces)
data_filtered["avg_num_chars"] = data_filtered["num_chars"] / data_filtered["num_words"]
time: 3.14 ms (started: 2022-11-17 00:09:50 +00:00)
# Histogram
sns.histplot(data=data_filtered, x="avg_num_chars", hue="corpus", kde=True)
<AxesSubplot:xlabel='avg_num_chars', ylabel='Count'>
time: 15.2 s (started: 2022-11-17 00:09:59 +00:00)
# Average number of words per sentence
data_filtered["avg_num_words"] = data_filtered["num_words"] / data_filtered["num_sents"]
time: 3.35 ms (started: 2022-11-17 00:10:20 +00:00)
# Histogram
sns.histplot(data=data_filtered, x="avg_num_words", hue="corpus", kde=True)
<AxesSubplot:xlabel='avg_num_words', ylabel='Count'>
time: 9.77 s (started: 2022-11-17 00:10:24 +00:00)
Term Frequency Analysis#
# lowercase the text (vectorized)
data_filtered["text"] = data_filtered["text"].str.lower()
time: 1.05 s (started: 2022-11-17 00:11:01 +00:00)
data_filtered.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 686161 entries, 1 to 2522592
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype
---  ------         --------------   -----
 0   id             686161 non-null  int64
 1   text           686161 non-null  object
 2   split          686161 non-null  object
 3   filename       686161 non-null  object
 4   corpus         686161 non-null  object
 5   num_chars      686161 non-null  int64
 6   num_words      686161 non-null  int64
 7   num_sents      686161 non-null  int64
 8   avg_num_chars  686161 non-null  float64
 9   avg_num_words  686161 non-null  float64
dtypes: float64(2), int64(4), object(4)
memory usage: 73.7+ MB
time: 180 ms (started: 2022-11-17 00:11:09 +00:00)
# Collect the whitespace tokens from a 5% sample of each corpus (English, Korean, Bengali)
words_en = sum(
    data_filtered[data_filtered.corpus == "enwiki_sampled"].sample(frac=0.05)["text"]
    .str.split()
    .tolist(),
    [],
)
time: 37.2 s (started: 2022-11-17 00:25:33 +00:00)
words_ko = sum(
    data_filtered[data_filtered.corpus == "kowiki"].sample(frac=0.05)["text"]
    .str.split()
    .tolist(),
    [],
)
time: 2min 57s (started: 2022-11-17 00:26:10 +00:00)
words_bn = sum(
    data_filtered[data_filtered.corpus == "bnwiki"].sample(frac=0.05)["text"]
    .str.split()
    .tolist(),
    [],
)
time: 10.3 s (started: 2022-11-17 00:29:07 +00:00)
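The sum(list_of_lists, []) idiom above is quadratic in the number of documents, which is the main reason the Korean sample takes nearly three minutes; itertools.chain flattens in linear time. A sketch with a hypothetical helper (flatten_tokens and the fixed seed are illustrative):

from itertools import chain

def flatten_tokens(df, corpus_name, frac=0.05, seed=123):
    # sample a fraction of one corpus and return a flat list of whitespace tokens
    sample = df[df.corpus == corpus_name].sample(frac=frac, random_state=seed)
    return list(chain.from_iterable(sample["text"].str.split()))

# e.g. words_en = flatten_tokens(data_filtered, "enwiki_sampled")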
from collections import Counter
counter_en = Counter([w for w in words_en if len(w) > 1])
counter_ko = Counter([w for w in words_ko if len(w) > 1])
counter_bn = Counter([w for w in words_bn if len(w) > 1])
time: 600 ms (started: 2022-11-17 00:29:17 +00:00)
# top 10 words in English corpus
counter_en.most_common(10)
[('the', 59104),
('in', 29667),
('of', 29325),
('and', 22730),
('is', 15395),
('was', 13092),
('to', 11893),
('on', 7179),
('by', 6649),
('he', 6566)]
time: 18.6 ms (started: 2022-11-17 00:29:18 +00:00)
# top 10 words in Korean corpus
counter_ko.most_common(10)
[('있다.', 12127),
('있는', 4757),
(')는', 2526),
('한다.', 2231),
('또는', 2115),
('대한민국의', 2048),
('이후', 2032),
('그는', 2007),
('되었다.', 1895),
('함께', 1764)]
time: 51.7 ms (started: 2022-11-17 00:29:18 +00:00)
# top 10 words in Bengali corpus
counter_bn.most_common(10)
[('এবং', 6827),
('তিনি', 4330),
('একটি', 3915),
('সালে', 3190),
('এই', 3122),
('থেকে', 2805),
('হয়।', 2594),
('তার', 2426),
('করা', 2318),
('এর', 2252)]
time: 13.1 ms (started: 2022-11-17 00:29:18 +00:00)
# Plot the 21st–40th most common words (the first 20 are mostly stopwords);
# the Bengali list is computed below but not plotted
import matplotlib.pyplot as plt
most_common_en = counter_en.most_common()[20:40]
most_common_ko = counter_ko.most_common()[20:40]
most_common_bn = counter_bn.most_common()[20:40]
# Switch matplotlib to a Korean-capable font for the Korean labels
# (you may need to install the NanumGothic font first)
plt.rcParams["font.family"] = "NanumGothic"
plt.figure(figsize=(20, 6))
plt.subplot(1, 2, 1)
plt.bar(range(20), [x[1] for x in most_common_en])
plt.xticks(range(20), [x[0] for x in most_common_en], rotation=90)
plt.title("English")
plt.subplot(1, 2, 2)
plt.bar(range(20), [x[1] for x in most_common_ko])
plt.xticks(range(20), [x[0] for x in most_common_ko], rotation=90)
plt.title("Korean")
plt.show()
time: 533 ms (started: 2022-11-17 00:29:18 +00:00)
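Skipping the first 20 ranks is a crude stopword filter. For English, NLTK ships an explicit stopword list; a sketch of the more standard approach (requires the extra "stopwords" download):

from nltk.corpus import stopwords

nltk.download("stopwords")
en_stops = set(stopwords.words("english"))
counter_en_nostop = Counter(w for w in words_en if len(w) > 1 and w not in en_stops)
counter_en_nostop.most_common(10)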
Most Frequent N-Grams#
from sklearn.feature_extraction.text import CountVectorizer
def get_ngrams(data):
    # Initialize CountVectorizer for bigrams
    vec = CountVectorizer(ngram_range=(2, 2))
    # Fit and transform
    ngram_counts = vec.fit_transform(data)
    # Sum the counts over all documents into a flat array
    ngram_counts = ngram_counts.sum(axis=0).A1
    # Create a DataFrame sorted by count, descending
    # (get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out())
    ngram_counts_df = pd.DataFrame(
        sorted(zip(vec.get_feature_names_out(), ngram_counts), key=lambda x: x[1], reverse=True),
        columns=["ngram", "count"],
    )
    return ngram_counts_df
time: 577 µs (started: 2022-11-17 00:29:19 +00:00)
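One caveat before applying get_ngrams: the calls below pass the flattened word lists, so CountVectorizer treats each word as its own document, and the bigrams it finds only span punctuation-joined tokens (which is why escaped markup such as &lt;br dominates the results). To count bigrams of adjacent words, pass whole texts instead; a sketch (docs_en and the 5% sample are illustrative):

# Bigrams across adjacent words: pass whole documents, not single tokens
docs_en = data_filtered[data_filtered.corpus == "enwiki_sampled"].sample(frac=0.05)["text"]
ngram_counts_en_docs = get_ngrams(docs_en)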
ngram_counts_df_en = get_ngrams(words_en)
ngram_counts_df_en
|   | ngram | count |
|---|---|---|
| 0 | br gt | 248 |
| 1 | lt br | 248 |
| 2 | lt onlyinclude | 95 |
| 3 | onlyinclude gt | 95 |
| 4 | first class | 88 |
| ... | ... | ... |
| 8088 | മക കൾ | 1 |
| 8089 | មស រឡ | 1 |
| 8090 | ヨーヘン パイパー戦記 | 1 |
| 8091 | 史記 volume | 1 |
| 8092 | 梁肯堂 1717 | 1 |

8093 rows × 2 columns
time: 2.31 s (started: 2022-11-17 00:29:19 +00:00)
ngram_counts_df_ko = get_ngrams(words_ko)
ngram_counts_df_ko
|   | ngram | count |
|---|---|---|
| 0 | 기준 이다 | 182 |
| 1 | lt br | 180 |
| 2 | br gt | 177 |
| 3 | 사망하였다 사인 | 108 |
| 4 | gt lt | 76 |
| ... | ... | ... |
| 50086 | 힘러 혼전성 | 1 |
| 50087 | 힘입어 2017년 | 1 |
| 50088 | 힘줄 tendon | 1 |
| 50089 | 힘줄끈 chordae | 1 |
| 50090 | 힙합 hip | 1 |

50091 rows × 2 columns
time: 3.9 s (started: 2022-11-17 00:29:21 +00:00)
ngram_counts_df_bn = get_ngrams(words_bn)
ngram_counts_df_bn
|   | ngram | count |
|---|---|---|
| 0 | বর তম | 816 |
| 1 | বব লয | 760 |
| 2 | পর যন | 720 |
| 3 | উত তর | 688 |
| 4 | অভ নয | 669 |
| ... | ... | ... |
| 9806 | ৰবৰ তন | 1 |
| 9807 | ஏற தழ | 1 |
| 9808 | சல கட | 1 |
| 9809 | தழ வல | 1 |
| 9810 | 黑风洞 峇都喼 | 1 |

9811 rows × 2 columns
time: 1.35 s (started: 2022-11-17 00:29:25 +00:00)
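The top bigrams in the English and Korean tables ("lt br", "br gt", "lt onlyinclude") are residue of HTML-escaped wiki markup such as &lt;br&gt;, not real phrases. A minimal cleanup sketch, assuming the tags survive in escaped form (the regex is illustrative, not exhaustive):

import re

def strip_markup(text):
    # drop HTML-escaped tags like &lt;br&gt;, &lt;br/&gt;, &lt;/onlyinclude&gt;
    return re.sub(r"&lt;/?\w+\s*/?&gt;", " ", text)

# e.g. rerun the counts on cleaned text:
# data_filtered["text"] = data_filtered["text"].map(strip_markup)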
# Plot the 20 most common n-grams
plt.figure(figsize=(20, 5))
plt.subplot(1, 2, 1)
plt.bar(range(20), ngram_counts_df_en["count"][:20])
plt.xticks(range(20), ngram_counts_df_en["ngram"][:20], rotation=90)
plt.title("English")
plt.subplot(1, 2, 2)
plt.bar(range(20), ngram_counts_df_ko["count"][:20])
plt.xticks(range(20), ngram_counts_df_ko["ngram"][:20], rotation=90)
plt.title("Korean")
plt.show()
time: 376 ms (started: 2022-11-17 00:29:26 +00:00)
Word Cloud#
from ekorpkit.visualize.base import get_plot_font
fontname, fontpath = get_plot_font()
fontname, fontpath
('NanumGothic', '/usr/share/fonts/truetype/nanum/NanumGothic.ttf')
time: 2.47 ms (started: 2022-11-17 00:33:05 +00:00)
# Function for generating word clouds
from wordcloud import WordCloud
def generate_wordcloud(
    data,
    title,
    width=400,
    height=200,
    max_words=150,
    figsize=(10, 5),
    colormap="Dark2",
    background_color="white",
):
    wc = WordCloud(
        width=width,
        height=height,
        max_words=max_words,
        colormap=colormap,
        background_color=background_color,
        font_path=fontpath,
    ).generate_from_frequencies(data)
    plt.figure(figsize=figsize)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(title, fontsize=13)
    plt.show()
time: 677 µs (started: 2022-11-17 00:33:57 +00:00)
# Generate word clouds
# English
generate_wordcloud(
    dict(counter_en.most_common()),
    "English",
)
# Korean
generate_wordcloud(
    dict(counter_ko.most_common()),
    "Korean",
)
time: 1.31 s (started: 2022-11-17 00:34:26 +00:00)
# Generate word clouds for n-grams
# English
generate_wordcloud(
    dict(ngram_counts_df_en.set_index("ngram")["count"]),
    "English",
)
# Korean
generate_wordcloud(
    dict(ngram_counts_df_ko.set_index("ngram")["count"]),
    "Korean",
)
time: 1.27 s (started: 2022-11-17 00:34:06 +00:00)