Lexicon-based Sentiment Analysis
import logging
from ekorpkit import eKonf
# Configure the root logger at INFO level so that INFO records emitted while
# the analysers are instantiated and run are displayed.
logging.basicConfig(level=logging.INFO)
# Show which ekorpkit build this walkthrough was executed against.
print(eKonf.__version__)
0.1.33+11.g6ef57fc.dirty
Instantiating a sentiment analyser class
# Compose the configuration for the "lm" sentiment model.
# NOTE(review): "lm" presumably refers to the Loughran-McDonald lexicon -- confirm
# against the ekorpkit docs.
cfg = eKonf.compose("model/sentiment=lm")
# Optional debugging aids: uncomment to enable verbose mode / dump the config.
# cfg.verbose = True
# eKonf.print(cfg)
# Build the analyser object described by the composed configuration.
lmsa = eKonf.instantiate(cfg)
# predict() is called here with an already-tokenised input (a list of strings).
tokens = ["Fraud", "Good", "Good", "Good", "Sound", "uncertain", "beat", "wrong"]
lmsa.predict(tokens)
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...
INFO:ekorpkit.base:Calling load_candidates
INFO:ekorpkit.io.file:Processing [1] files from ['/workspace/projects/ekorpkit/ekorpkit/resources/lexicons/LM.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['/workspace/projects/ekorpkit/ekorpkit/resources/lexicons/LM.parquet']
INFO:ekorpkit.io.file:Loading data from /workspace/projects/ekorpkit/ekorpkit/resources/lexicons/LM.parquet
INFO:ekorpkit.models.ngram.ngram:loaded 58142 candidates
{'num_tokens': 8,
'polarity': -0.9999990000010001,
'polarity_label': 'negative',
'uncertainty': 0.125001}
text = "Beyond the improved voice capabilities, customers now have a streamlined way to comply with recalls and other traceability requirements, providing them with a competitive advantage."
# Request only the 'Negative' and 'Positive' lexicon features for this text.
# NOTE(review): non-zero values in the printed result (e.g. 2009) appear to be
# the raw lexicon entries rather than scores -- confirm against the lexicon docs.
features = lmsa.analyze(text, features=['Negative', 'Positive'])
# Pretty-print the analysis result.
eKonf.print(features)
{'advantage': {'Negative': 0, 'Positive': 2009, 'count': 1},
'and': {'Negative': 0, 'Positive': 0, 'count': 1},
'beyond': {'Negative': 0, 'Positive': 0, 'count': 1},
'capability': {'Negative': 0, 'Positive': 0, 'count': 1},
'competitive': {'Negative': 0, 'Positive': 0, 'count': 1},
'comply': {'Negative': 0, 'Positive': 0, 'count': 1},
'customer': {'Negative': 0, 'Positive': 0, 'count': 1},
'have': {'Negative': 0, 'Positive': 0, 'count': 1},
'improved': {'Negative': 0, 'Positive': 2009, 'count': 1},
'now': {'Negative': 0, 'Positive': 0, 'count': 1},
'other': {'Negative': 0, 'Positive': 0, 'count': 1},
'provide': {'Negative': 0, 'Positive': 0, 'count': 1},
'recall': {'Negative': 2009, 'Positive': 0, 'count': 1},
'requirement': {'Negative': 0, 'Positive': 0, 'count': 1},
'streamlined': {'Negative': 0, 'Positive': 0, 'count': 1},
'the': {'Negative': 0, 'Positive': 0, 'count': 1},
'them': {'Negative': 0, 'Positive': 0, 'count': 1},
'to': {'Negative': 0, 'Positive': 0, 'count': 1},
'traceability': {'Negative': 0, 'Positive': 0, 'count': 1},
'voice': {'Negative': 0, 'Positive': 0, 'count': 1},
'way': {'Negative': 0, 'Positive': 0, 'count': 1},
'with': {'Negative': 0, 'Positive': 0, 'count': 2}}
# predict() is called here with a raw string instead of a token list.
text = "Operating loss amounted to EUR 0.7 mn compared to a profit of EUR 0.8 mn in the second quarter of 2005."
print(lmsa.predict(text))
{'num_tokens': 22, 'polarity': -0.9999990000010001, 'polarity_label': 'negative', 'uncertainty': 1e-06}
# Same workflow as above, this time composing the "hiv4" sentiment model.
# NOTE(review): "hiv4" presumably refers to the Harvard IV-4 dictionary -- confirm
# against the ekorpkit docs.
cfg = eKonf.compose('model/sentiment=hiv4')
hivsa = eKonf.instantiate(cfg)
# Same tokens as the earlier token example, plus "legal".
tokens = ["Fraud", "Good","Good","Good", "Sound", "uncertain", "beat", "wrong", "legal"]
hivsa.predict(tokens)
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...
INFO:ekorpkit.base:Calling load_candidates
INFO:ekorpkit.io.file:Processing [1] files from ['/workspace/projects/ekorpkit/ekorpkit/resources/lexicons/HIV-4.parquet']
INFO:ekorpkit.io.file:Loading 1 dataframes from ['/workspace/projects/ekorpkit/ekorpkit/resources/lexicons/HIV-4.parquet']
INFO:ekorpkit.io.file:Loading data from /workspace/projects/ekorpkit/ekorpkit/resources/lexicons/HIV-4.parquet
INFO:ekorpkit.models.ngram.ngram:loaded 11787 candidates
{'num_tokens': 9,
'polarity': 0.9999990000010001,
'polarity_label': 'positive',
'legal': 0.1111121111111111}
text = "Beyond the improved voice capabilities, customers now have a streamlined way to comply with recalls and other traceability requirements, providing them with a competitive advantage."
# Keep the analysis result in `features` and print it separately. Assigning
# the return value of eKonf.print(...) would bind the printer's return value
# (presumably None) instead of the per-token feature mapping.
features = hivsa.analyze(text, features=['Negativ', 'Positiv'])
eKonf.print(features)
# Sentence-level prediction for the same text.
print(hivsa.predict(text))
# A second example sentence.
text = "Operating loss amounted to EUR 0.7 mn compared to a profit of EUR 0.8 mn in the second quarter of 2005."
print(hivsa.predict(text))
{'a': {'Negativ': None, 'Positiv': None, 'count': 2},
'advantage': {'Negativ': None, 'Positiv': 'Positiv', 'count': 1},
'and': {'Negativ': None, 'Positiv': None, 'count': 1},
'beyond': {'Negativ': None, 'Positiv': None, 'count': 1},
'competitive': {'Negativ': 'Negativ', 'Positiv': None, 'count': 1},
'comply': {'Negativ': None, 'Positiv': None, 'count': 1},
'now': {'Negativ': None, 'Positiv': None, 'count': 1},
'the': {'Negativ': None, 'Positiv': None, 'count': 1},
'them': {'Negativ': None, 'Positiv': None, 'count': 1},
'with': {'Negativ': None, 'Positiv': None, 'count': 2}}
{'num_tokens': 28, 'polarity': 0.0, 'polarity_label': 'neutral', 'legal': 1e-06}
{'num_tokens': 22, 'polarity': -0.9999990000010001, 'polarity_label': 'negative', 'legal': 1e-06}
# Two example sentences that are scored together as one article.
doc = [
"Beyond the improved voice capabilities, customers now have a streamlined way to comply with recalls and other traceability requirements, providing them with a competitive advantage.",
"Operating loss amounted to EUR 0.7 mn compared to a profit of EUR 0.8 mn in the second quarter of 2005.",
]
# The sentences are joined with newlines and passed as a single string.
# NOTE(review): the result reports 'num_examples': 2, so predict_article
# presumably splits on newlines and aggregates per-line scores -- confirm.
hivsa.predict_article('\n'.join(doc))
{'num_examples': 2,
'polarity': -0.499999750000125,
'polarity_label': 'negative'}