# N-Gram model for unigram lexicon features

In [1]:
from ekorpkit import eKonf

# Ask ekorpkit to log at WARNING level before anything else runs.
eKonf.setLogger("WARNING")

# Report the package version and the detected runtime environment.
print("version:", eKonf.__version__)
print("is notebook?", eKonf.is_notebook())
print("is colab?", eKonf.is_colab())

# Dump the environment settings ekorpkit resolved for this session.
print("environment variables:")
eKonf.print(eKonf.env().dict())

INFO:ekorpkit.base:IPython version: (6, 9, 0), client: jupyter_client
INFO:ekorpkit.base:Google Colab not detected.


version: 0.1.35+1.gbeed9e1
is notebook? True
is colab? False
evironment varialbles:
{'CUDA_DEVICE_ORDER': None,
 'CUDA_VISIBLE_DEVICES': None,
 'EKORPKIT_CONFIG_DIR': '/workspace/projects/ekorpkit-book/config',
 'EKORPKIT_DATA_DIR': None,
 'EKORPKIT_PROJECT': 'ekorpkit-book',
 'EKORPKIT_WORKSPACE_ROOT': '/workspace',
 'KMP_DUPLICATE_LIB_OK': 'TRUE',
 'NUM_WORKERS': 230}


## Load an n-gram model with LM lexicon scores

In [2]:
# Compose the n-gram model config for the LM lexicon variant; the config
# group is passed by keyword, matching the HIV4 cell later in this notebook.
ngram_cfg = eKonf.compose(config_group="model/ngram=lm")
ngram_cfg.verbose = True
# NOTE(review): auto.load appears to make the model load its candidates
# during instantiation (the log shows "Calling load_candidates") - confirm.
ngram_cfg.auto.load = True
ngram = eKonf.instantiate(ngram_cfg)

INFO:ekorpkit.base:instantiating ekorpkit.models.ngram.ngram.Ngrams...
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...
INFO:ekorpkit.base:Calling load_candidates
INFO:ekorpkit.io.file:Processing [1] files from ['/workspace/projects/ekorpkit/ekorpkit/resources/lexicons/LM.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['/workspace/projects/ekorpkit/ekorpkit/resources/lexicons/LM.parquet']
INFO:ekorpkit.io.file:Loading data from /workspace/projects/ekorpkit/ekorpkit/resources/lexicons/LM.parquet
INFO:ekorpkit.io.file: >> elapsed time to load data: 0:00:00.039804
INFO:ekorpkit.models.ngram.ngram:loaded 58142 candidates


In [3]:
# Report how many candidates the n-gram model currently holds.
num_candidates = len(ngram.candidates)
print(f"Number of candidates: {num_candidates}")

Number of candidates: 58142


In [4]:
# Example sentence, written as adjacent literals that concatenate to a
# single string so the line stays readable.
sentence = (
    "Beyond the improved voice capabilities, customers now have a streamlined "
    "way to comply with recalls and other traceability requirements, "
    "providing them with a competitive advantage."
)

# Tokenize with the currently loaded n-gram model and show the tokens.
tokens = ngram.tokenize(sentence)
print(tokens)

['beyond', 'the', 'improved', 'voice', 'capability', ',', 'customer', 'now', 'have', 'a', 'streamlined', 'way', 'to', 'comply', 'with', 'recall', 'and', 'other', 'traceability', 'requirement', ',', 'provide', 'them', 'with', 'a', 'competitive', 'advantage', '.']


In [5]:
# Look up lexicon features for the example sentence with the loaded model.
_features = ngram.find_features(sentence)
# Bare expression on the last line: the notebook renders the result
# (a dict keyed by matched token) as the cell output.
_features

INFO:ekorpkit.models.ngram.ngram:found 22 ngrams


{'beyond': {'Negative': 0,
  'Positive': 0,
  'Uncertainty': 0,
  'Litigious': 0,
  'Constraining': 0,
  'Superfluous': 0,
  'Interesting': 0,
  'Modal': 0,
  'count': 1},
 'the': {'Negative': 0,
  'Positive': 0,
  'Uncertainty': 0,
  'Litigious': 0,
  'Constraining': 0,
  'Superfluous': 0,
  'Interesting': 0,
  'Modal': 0,
  'count': 1},
 'improved': {'Negative': 0,
  'Positive': 2009,
  'Uncertainty': 0,
  'Litigious': 0,
  'Constraining': 0,
  'Superfluous': 0,
  'Interesting': 0,
  'Modal': 0,
  'count': 1},
 'voice': {'Negative': 0,
  'Positive': 0,
  'Uncertainty': 0,
  'Litigious': 0,
  'Constraining': 0,
  'Superfluous': 0,
  'Interesting': 0,
  'Modal': 0,
  'count': 1},
 'capability': {'Negative': 0,
  'Positive': 0,
  'Uncertainty': 0,
  'Litigious': 0,
  'Constraining': 0,
  'Superfluous': 0,
  'Interesting': 0,
  'Modal': 0,
  'count': 1},
 'customer': {'Negative': 0,
  'Positive': 0,
  'Uncertainty': 0,
  'Litigious': 0,
  'Constraining': 0,
  'Superfluous': 0,
  'Interes

## Load an n-gram model with HIV4 lexicon scores

In [6]:
# Compose the n-gram model config for the HIV4 lexicon variant.
# This rebinds `ngram_cfg` and `ngram`, replacing the LM-based model
# created earlier in the notebook.
ngram_cfg = eKonf.compose(config_group="model/ngram=hiv4")
ngram_cfg.verbose = True
# NOTE(review): auto.load appears to make the model load its candidates
# during instantiation (the log shows "Calling load_candidates") - confirm.
ngram_cfg.auto.load = True
ngram = eKonf.instantiate(ngram_cfg)

INFO:ekorpkit.base:instantiating ekorpkit.models.ngram.ngram.Ngrams...
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...
INFO:ekorpkit.base:Calling load_candidates
INFO:ekorpkit.io.file:Processing [1] files from ['/workspace/projects/ekorpkit/ekorpkit/resources/lexicons/HIV-4.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['/workspace/projects/ekorpkit/ekorpkit/resources/lexicons/HIV-4.parquet']
INFO:ekorpkit.io.file:Loading data from /workspace/projects/ekorpkit/ekorpkit/resources/lexicons/HIV-4.parquet
INFO:ekorpkit.io.file: >> elapsed time to load data: 0:00:00.040765
INFO:ekorpkit.models.ngram.ngram:loaded 11787 candidates


In [7]:
# Same example sentence as before, written as adjacent literals that
# concatenate to a single string.
sentence = (
    "Beyond the improved voice capabilities, customers now have a streamlined "
    "way to comply with recalls and other traceability requirements, "
    "providing them with a competitive advantage."
)

# Tokenize with the currently loaded n-gram model and show the tokens.
tokens = ngram.tokenize(sentence)
print(tokens)

['beyond', 'the', 'improved', 'voice', 'capabilities', ',', 'customers', 'now', 'have', 'a', 'streamlined', 'way', 'to', 'comply', 'with', 'recalls', 'and', 'other', 'traceability', 'requirements', ',', 'providing', 'them', 'with', 'a', 'competitive', 'advantage', '.']


In [9]:
# Look up lexicon features for the example sentence with the loaded model.
_features = ngram.find_features(sentence)
# Bare expression on the last line: the notebook renders the result
# (a dict keyed by matched token) as the cell output.
_features

INFO:ekorpkit.models.ngram.ngram:found 10 ngrams


{'beyond': {'Positiv': None,
  'Negativ': None,
  'Pstv': None,
  'Affil': None,
  'Ngtv': None,
  'Hostile': None,
  'Strong': None,
  'Power': None,
  'Weak': None,
  'Submit': None,
  'Active': None,
  'Passive': None,
  'Pleasur': None,
  'Pain': None,
  'Feel': None,
  'Arousal': None,
  'EMOT': None,
  'Virtue': None,
  'Vice': None,
  'Ovrst': None,
  'Undrst': None,
  'Academ': None,
  'Doctrin': None,
  'Exch': None,
  'ECON': None,
  'Exprsv': None,
  'Legal': None,
  'Milit': None,
  'POLIT': None,
  'Relig': None,
  'Role': None,
  'COLL': None,
  'Work': None,
  'Ritual': None,
  'SocRel': None,
  'Race': None,
  'MALE': None,
  'Female': None,
  'Nonadlt': None,
  'HU': None,
  'ANI': None,
  'PLACE': None,
  'Social': None,
  'Region': None,
  'Route': None,
  'Aquatic': None,
  'Land': None,
  'Sky': None,
  'Object': None,
  'Tool': None,
  'Food': None,
  'Vehicle': None,
  'BldgPt': None,
  'ComnObj': None,
  'NatObj': None,
  'BodyPt': None,
  'ComForm': None,
  'CO