Lab 2: EDA on Corpora#

Prepare the environment#

%pip install --pre ekorpkit[dataset]
%config InlineBackend.figure_format='retina'
%load_ext autotime

from ekorpkit import eKonf

eKonf.setLogger("INFO")
print("version:", eKonf.__version__)

is_colab = eKonf.is_colab()
print("is colab?", is_colab)
if is_colab:
    eKonf.mount_google_drive()
workspace_dir = "/content/drive/MyDrive/workspace"
project_name = "ekorpkit-book"
project_dir = eKonf.set_workspace(workspace=workspace_dir, project=project_name)
print("project_dir:", project_dir)
INFO:ekorpkit.utils.notebook:Google Colab not detected.
INFO:ekorpkit.base:Setting EKORPKIT_WORKSPACE_ROOT to /content/drive/MyDrive/workspace
INFO:ekorpkit.base:Setting EKORPKIT_PROJECT to ekorpkit-book
INFO:ekorpkit.base:Loaded .env from /workspace/projects/ekorpkit-book/config/.env
version: 0.1.40.post0.dev21
is colab? False
project_dir: /content/drive/MyDrive/workspace/projects/ekorpkit-book
time: 1.39 s (started: 2022-11-16 23:58:16 +00:00)

Load the saved corpora#

data = eKonf.load_data("wiki_corpus.parquet", project_dir + "/data")
INFO:ekorpkit.io.file:Processing [1] files from ['wiki_corpus.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['/content/drive/MyDrive/workspace/projects/ekorpkit-book/data/wiki_corpus.parquet']
INFO:ekorpkit.io.file:Loading data from /content/drive/MyDrive/workspace/projects/ekorpkit-book/data/wiki_corpus.parquet
time: 14 s (started: 2022-11-17 00:04:02 +00:00)
data.head()
         id                                               text  split filename          corpus
0   4915400                                                     train  wiki_92  enwiki_sampled
1   7644961  Anaissini is a tribe of click beetles in the f...  train  wiki_49  enwiki_sampled
2   6658552  The Vicky Metcalf Award for Literature for You...  train  wiki_24  enwiki_sampled
3  16385169  Shri Shivabalayogi Maharaj (24 January 1935 – ...  train  wiki_36  enwiki_sampled
4  11081255  Eylex Films Pvt is a chain of multiplex and si...  train  wiki_94  enwiki_sampled
time: 10.2 ms (started: 2022-11-17 00:04:17 +00:00)
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2522593 entries, 0 to 2522592
Data columns (total 5 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   id        int64 
 1   text      object
 2   split     object
 3   filename  object
 4   corpus    object
dtypes: int64(1), object(4)
memory usage: 96.2+ MB
time: 3.92 ms (started: 2022-11-17 00:04:18 +00:00)

Basic statistics#

import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
import seaborn as sns

nltk.download("punkt")

# Character counts
data["num_chars"] = data["text"].map(lambda x: len(x))
# Word counts
data["num_words"] = data["text"].map(lambda x: len(x.split()))
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
time: 16.6 s (started: 2022-11-17 00:05:26 +00:00)
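If the row-wise lambdas above become a bottleneck at this corpus size, pandas' vectorized string accessor computes the same columns without per-row Python calls. A minimal sketch; it should produce identical values:

# Vectorized alternative (sketch): same counts via the .str accessor
data["num_chars"] = data["text"].str.len()
data["num_words"] = data["text"].str.split().str.len()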
# format pandas display options
import pandas as pd

pd.options.display.float_format = "{:.0f}".format

# min, max, mean, median, iqr, std
data[["num_chars", "num_words"]].describe()
       num_chars  num_words
count    2522593    2522593
mean         541         93
std         2308        387
min            0          0
25%            0          0
50%            0          0
75%          213         40
max       264937      45338
time: 182 ms (started: 2022-11-17 00:05:51 +00:00)
# filter out length outliers: keep documents with 10-250 words
# (.copy() avoids SettingWithCopyWarning on the column assignments below)

data_filtered = data[data.num_words.between(10, 250)].copy()
time: 72.8 ms (started: 2022-11-17 00:06:07 +00:00)
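The 10-250 word band is a judgment call for this lab. You could instead derive bounds from the distribution itself, e.g. the interquartile range reported above. A sketch, using the conventional (but not mandatory) 1.5×IQR rule:

# Sketch: IQR-based bounds for num_words (1.5*IQR is a common rule of thumb)
q1, q3 = data["num_words"].quantile([0.25, 0.75])
iqr = q3 - q1
lower, upper = max(q1 - 1.5 * iqr, 1), q3 + 1.5 * iqr
data_iqr = data[data.num_words.between(lower, upper)].copy()
print(f"bounds: [{lower:.0f}, {upper:.0f}], kept {len(data_iqr):,} of {len(data):,} rows")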
# save filtered data

eKonf.save_data(data_filtered, "wiki_filtered.parquet", project_dir + "/data")
INFO:ekorpkit.io.file:Saving dataframe to /content/drive/MyDrive/workspace/projects/ekorpkit-book/data/wiki_filtered.parquet
time: 43.4 s (started: 2022-11-17 00:06:22 +00:00)
data_filtered[["num_chars", "num_words"]].describe()
       num_chars  num_words
count     686161     686161
mean         419         75
std          373         63
min           25         10
25%          124         23
50%          283         52
75%          612        112
max         3432        250
time: 47.9 ms (started: 2022-11-17 00:07:07 +00:00)
# Histogram
sns.histplot(data=data_filtered, x="num_chars", hue="corpus", kde=True)
<AxesSubplot:xlabel='num_chars', ylabel='Count'>
[Histogram: num_chars by corpus]
time: 5.31 s (started: 2022-11-17 00:07:16 +00:00)
# Histogram
sns.histplot(data=data_filtered, x="num_words", hue="corpus", kde=True)
<AxesSubplot:xlabel='num_words', ylabel='Count'>
[Histogram: num_words by corpus]
time: 3.56 s (started: 2022-11-17 00:07:30 +00:00)
# Sentence counts
data_filtered["num_sents"] = data_filtered["text"].map(lambda x: len(sent_tokenize(x)))
time: 1min 23s (started: 2022-11-17 00:07:52 +00:00)
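sent_tokenize is accurate but slow (over a minute here). When a rough count is enough, sentence-final punctuation can be counted with a regex instead. A sketch, computed into a standalone Series so the frame above is unchanged; it undercounts or overcounts around abbreviations and ellipses:

import re

# Sketch: crude sentence-count approximation via terminal punctuation
sent_end = re.compile(r"[.!?]+(?:\s|$)")
num_sents_approx = data_filtered["text"].map(lambda x: max(len(sent_end.findall(x)), 1))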
# Histogram
sns.histplot(data=data_filtered, x="num_sents", hue="corpus", kde=True)
<AxesSubplot:xlabel='num_sents', ylabel='Count'>
[Histogram: num_sents by corpus]
time: 7.1 s (started: 2022-11-17 00:09:23 +00:00)
# Average number of characters per word
data_filtered["avg_num_chars"] = data_filtered["num_chars"] / data_filtered["num_words"]
time: 3.14 ms (started: 2022-11-17 00:09:50 +00:00)
# Histogram
sns.histplot(data=data_filtered, x="avg_num_chars", hue="corpus", kde=True)
<AxesSubplot:xlabel='avg_num_chars', ylabel='Count'>
[Histogram: avg_num_chars by corpus]
time: 15.2 s (started: 2022-11-17 00:09:59 +00:00)
# Average number of words per sentence
data_filtered["avg_num_words"] = data_filtered["num_words"] / data_filtered["num_sents"]
time: 3.35 ms (started: 2022-11-17 00:10:20 +00:00)
# Histogram
sns.histplot(data=data_filtered, x="avg_num_words", hue="corpus", kde=True)
<AxesSubplot:xlabel='avg_num_words', ylabel='Count'>
[Histogram: avg_num_words by corpus]
time: 9.77 s (started: 2022-11-17 00:10:24 +00:00)

Term Frequency Analysis#

# lower case
data_filtered["text"] = data_filtered["text"].map(lambda x: x.lower())
time: 1.05 s (started: 2022-11-17 00:11:01 +00:00)
data_filtered.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 686161 entries, 1 to 2522592
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   id             686161 non-null  int64  
 1   text           686161 non-null  object 
 2   split          686161 non-null  object 
 3   filename       686161 non-null  object 
 4   corpus         686161 non-null  object 
 5   num_chars      686161 non-null  int64  
 6   num_words      686161 non-null  int64  
 7   num_sents      686161 non-null  int64  
 8   avg_num_chars  686161 non-null  float64
 9   avg_num_words  686161 non-null  float64
dtypes: float64(2), int64(4), object(4)
memory usage: 73.7+ MB
time: 180 ms (started: 2022-11-17 00:11:09 +00:00)
# collect all whitespace tokens from each corpus (5% sample per corpus)

words_en = sum(
    data_filtered[data_filtered.corpus == "enwiki_sampled"].sample(frac=0.05)["text"]
    .str.split()
    .tolist(),
    [],
)
time: 37.2 s (started: 2022-11-17 00:25:33 +00:00)
words_ko = sum(
    data_filtered[data_filtered.corpus == "kowiki"].sample(frac=0.05)["text"]
    .str.split()
    .tolist(),
    [],
)
time: 2min 57s (started: 2022-11-17 00:26:10 +00:00)
words_bn = sum(
    data_filtered[data_filtered.corpus == "bnwiki"].sample(frac=0.05)["text"]
    .str.split()
    .tolist(),
    [],
)
time: 10.3 s (started: 2022-11-17 00:29:07 +00:00)
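A note on the flattening idiom: `sum(lists, [])` copies the accumulated list on every addition, so it is quadratic in the number of documents. `itertools.chain` flattens in linear time. A sketch for the Korean corpus (it draws a fresh 5% sample, so the resulting token list differs from `words_ko` above):

from itertools import chain

# Linear-time flattening: chain the per-document token lists, materialize once
sample_ko = data_filtered[data_filtered.corpus == "kowiki"].sample(frac=0.05)
words_ko_fast = list(chain.from_iterable(sample_ko["text"].str.split()))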
from collections import Counter

counter_en = Counter([w for w in words_en if len(w) > 1])
counter_ko = Counter([w for w in words_ko if len(w) > 1])
counter_bn = Counter([w for w in words_bn if len(w) > 1])
time: 600 ms (started: 2022-11-17 00:29:17 +00:00)
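The `len(w) > 1` filter only drops single-character tokens, so the English top 10 below is still dominated by function words. To surface content words instead, one option is NLTK's stopword list. A sketch, assuming `nltk.download("stopwords")` has been run:

from nltk.corpus import stopwords

# Drop English stopwords before counting (requires nltk.download("stopwords"))
stops_en = set(stopwords.words("english"))
counter_en_content = Counter(w for w in words_en if len(w) > 1 and w not in stops_en)
counter_en_content.most_common(10)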
# top 10 words in English corpus

counter_en.most_common(10)
[('the', 59104),
 ('in', 29667),
 ('of', 29325),
 ('and', 22730),
 ('is', 15395),
 ('was', 13092),
 ('to', 11893),
 ('on', 7179),
 ('by', 6649),
 ('he', 6566)]
time: 18.6 ms (started: 2022-11-17 00:29:18 +00:00)
# top 10 words in Korean corpus

counter_ko.most_common(10)
[('있다.', 12127),
 ('있는', 4757),
 (')는', 2526),
 ('한다.', 2231),
 ('또는', 2115),
 ('대한민국의', 2048),
 ('이후', 2032),
 ('그는', 2007),
 ('되었다.', 1895),
 ('함께', 1764)]
time: 51.7 ms (started: 2022-11-17 00:29:18 +00:00)
# top 10 words in Bengali corpus

counter_bn.most_common(10)
[('এবং', 6827),
 ('তিনি', 4330),
 ('একটি', 3915),
 ('সালে', 3190),
 ('এই', 3122),
 ('থেকে', 2805),
 ('হয়।', 2594),
 ('তার', 2426),
 ('করা', 2318),
 ('এর', 2252)]
time: 13.1 ms (started: 2022-11-17 00:29:18 +00:00)
# Plot the next 20 most common words (ranks 21-40), skipping the top 20, which are mostly function words
import matplotlib.pyplot as plt

most_common_en = counter_en.most_common()[20:40]
most_common_ko = counter_ko.most_common()[20:40]
most_common_bn = counter_bn.most_common()[20:40]


# Change the font to Korean font
# You may need to install the font first
plt.rcParams["font.family"] = "NanumGothic"

plt.figure(figsize=(20, 6))
plt.subplot(1, 2, 1)
plt.bar(range(20), [x[1] for x in most_common_en])
plt.xticks(range(20), [x[0] for x in most_common_en], rotation=90)
plt.title("English")

plt.subplot(1, 2, 2)
plt.bar(range(20), [x[1] for x in most_common_ko])
plt.xticks(range(20), [x[0] for x in most_common_ko], rotation=90)
plt.title("Korean")

plt.show()
[Bar charts: most common words (ranks 21-40), English and Korean]
time: 533 ms (started: 2022-11-17 00:29:18 +00:00)
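`most_common_bn` is computed above but not plotted, since NanumGothic has no Bengali glyphs. A sketch of a Bengali panel, assuming a Bengali-capable font such as Noto Sans Bengali is installed:

import matplotlib.font_manager as fm

# Sketch: Bengali bar chart, assuming Noto Sans Bengali (or similar) is installed
bn_font = fm.FontProperties(family="Noto Sans Bengali")

plt.figure(figsize=(10, 6))
plt.bar(range(20), [x[1] for x in most_common_bn])
plt.xticks(range(20), [x[0] for x in most_common_bn], rotation=90, fontproperties=bn_font)
plt.title("Bengali")
plt.show()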

Most Frequent N-Grams#

from sklearn.feature_extraction.text import CountVectorizer


def get_ngrams(data):
    # Initialize CountVectorizer for word bigrams
    vec = CountVectorizer(ngram_range=(2, 2))

    # Fit and transform (each element of `data` is treated as one document)
    ngram_counts = vec.fit_transform(data)

    # Sum the bigram counts over all documents
    ngram_counts = ngram_counts.sum(axis=0).A1

    # Build a DataFrame sorted by count, descending
    ngram_counts_df = pd.DataFrame(
        sorted(zip(vec.get_feature_names_out(), ngram_counts), key=lambda x: x[1], reverse=True),
        columns=["ngram", "count"],
    )
    return ngram_counts_df
time: 577 µs (started: 2022-11-17 00:29:19 +00:00)
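A caveat before calling it: `words_en` is a flat list of whitespace tokens, so CountVectorizer sees each token as a one-word document, and the only bigrams it can find come from its own tokenizer splitting a single token further (e.g. an HTML-escape remnant like `&lt;br&gt;` yielding `lt br` and `br gt`, which explains the top rows below). To count bigrams across word boundaries, pass whole documents instead; a sketch:

# Sketch: cross-word bigrams, counted over whole documents rather than single tokens
docs_en = data_filtered[data_filtered.corpus == "enwiki_sampled"].sample(frac=0.05)["text"]
get_ngrams(docs_en).head(10)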
ngram_counts_df_en = get_ngrams(words_en)
ngram_counts_df_en
                 ngram  count
0                br gt    248
1                lt br    248
2       lt onlyinclude     95
3       onlyinclude gt     95
4          first class     88
...                ...    ...
8088             മക കൾ      1
8089             មស រឡ      1
8090  ヨーヘン パイパー戦記      1
8091         史記 volume      1
8092       梁肯堂 1717      1

[8093 rows x 2 columns]

time: 2.31 s (started: 2022-11-17 00:29:19 +00:00)
ngram_counts_df_ko = get_ngrams(words_ko)
ngram_counts_df_ko
             ngram  count
0            기준 이다    182
1            lt br    180
2            br gt    177
3        사망하였다 사인    108
4            gt lt     76
...            ...    ...
50086      힘러 혼전성      1
50087   힘입어 2017년      1
50088   힘줄 tendon      1
50089  힘줄끈 chordae      1
50090      힙합 hip      1

[50091 rows x 2 columns]

time: 3.9 s (started: 2022-11-17 00:29:21 +00:00)
ngram_counts_df_bn = get_ngrams(words_bn)
ngram_counts_df_bn
          ngram  count
0         বর তম    816
1         বব লয    760
2         পর যন    720
3         উত তর    688
4         অভ নয    669
...         ...    ...
9806     ৰবৰ তন      1
9807      ஏற தழ      1
9808      சல கட      1
9809      தழ வல      1
9810  黑风洞 峇都喼      1

[9811 rows x 2 columns]

time: 1.35 s (started: 2022-11-17 00:29:25 +00:00)
# plot 20 most common n-grams

plt.figure(figsize=(20, 5))
plt.subplot(1, 2, 1)
plt.bar(range(20), ngram_counts_df_en["count"][:20])
plt.xticks(range(20), ngram_counts_df_en["ngram"][:20], rotation=90)
plt.title("English")

plt.subplot(1, 2, 2)
plt.bar(range(20), ngram_counts_df_ko["count"][:20])
plt.xticks(range(20), ngram_counts_df_ko["ngram"][:20], rotation=90)
plt.title("Korean")

plt.show()
[Bar charts: 20 most common bigrams, English and Korean]
time: 376 ms (started: 2022-11-17 00:29:26 +00:00)

Word Cloud#

from ekorpkit.visualize.base import get_plot_font

fontname, fontpath = get_plot_font()
fontname, fontpath
('NanumGothic', '/usr/share/fonts/truetype/nanum/NanumGothic.ttf')
time: 2.47 ms (started: 2022-11-17 00:33:05 +00:00)
# Function for generating word clouds
from wordcloud import WordCloud


def generate_wordcloud(
    data,
    title,
    width=400,
    height=200,
    max_words=150,
    figsize=(10, 5),
    colormap="Dark2",
    background_color="white",
):
    wc = WordCloud(
        width=width,
        height=height,
        max_words=max_words,
        colormap=colormap,
        background_color=background_color,
        font_path=fontpath,
    ).generate_from_frequencies(data)
    plt.figure(figsize=figsize)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(title, fontsize=13)
    plt.show()
time: 677 µs (started: 2022-11-17 00:33:57 +00:00)
# Generate word clouds

# English
generate_wordcloud(
    dict(counter_en.most_common()),
    "English",
)

# Korean
generate_wordcloud(
    dict(counter_ko.most_common()),
    "Korean",
)
[Word clouds: English and Korean]
time: 1.31 s (started: 2022-11-17 00:34:26 +00:00)
# Generate word clouds for n-grams

# English
generate_wordcloud(
    dict(ngram_counts_df_en.set_index("ngram")["count"]),
    "English",
)

# Korean
generate_wordcloud(
    dict(ngram_counts_df_ko.set_index("ngram")["count"]),
    "Korean",
)
[Bigram word clouds: English and Korean]
time: 1.27 s (started: 2022-11-17 00:34:06 +00:00)