This repository has been archived on 2022-02-08. You can view files and clone it, but cannot push or open issues or pull requests.
su_task3-nlp/da_detector.py
surtur 1f45489cab
replace newlines with spaces
whereas before they were simply discarded
2021-12-20 05:24:08 +01:00

262 lines
7.0 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
this file holds a class, which implements a "detector" - language classifier -
that learns from predefined, frequency-analysed sets of ngrams
"""
class da_detector:
    """
    A naive language "detector": it samples random n-grams from an unknown
    text and scores them against pre-computed, frequency-analysed n-gram
    tables (one JSON file per language) loaded from ``self.freqs_folder``.
    """

    def __init__(self, langs_to_check: list = None):
        """
        Set up detector defaults.

        :param langs_to_check: languages to consider, picked from
            ["cz", "sk", "de", "en", "fr"]; defaults to ["sk", "en"].
        :raises TypeError: if langs_to_check is not a list.
        :raises ValueError: if fewer than two languages are given.
        """
        # n-grams picked during the last pass (cached so that every
        # language is scored against the same sample)
        self.da_ngrams = []
        # n-gram length and sample size used by what_this()
        self.what_grams = 3
        self.how_many = 30
        # folder holding the per-language frequency tables ("<lang>.json")
        self.freqs_folder = "./freqs/"
        # None sentinel instead of a mutable default argument
        if langs_to_check is None:
            langs_to_check = ["sk", "en"]
        if not isinstance(langs_to_check, list):
            raise TypeError("not a list, bailing")
        if len(langs_to_check) < 2:
            raise ValueError("too few languages specified")
        # BUG FIX: this used to be hard-coded to ["sk", "en"], silently
        # discarding the (already validated) constructor argument
        self.langs_to_check = langs_to_check

    def rm_interpunction(self, data):
        """Return *data* with all ASCII punctuation characters removed."""
        from string import punctuation
        # str.translate does the whole strip in one C-level pass
        return data.translate(str.maketrans('', '', punctuation))

    def rm_newlines(self, data):
        """Return *data* with newlines replaced by single spaces
        (replacing rather than deleting keeps word boundaries intact)."""
        return data.replace('\n', ' ')

    def rm_digits(self, data):
        """Return *data* with all decimal digits removed."""
        from string import digits
        return data.translate(str.maketrans('', '', digits))

    def text_clean_up(self, data: str):
        """Normalise *data* for n-gram sampling: strip punctuation and
        digits, then turn newlines into spaces."""
        data = self.rm_interpunction(data)
        data = self.rm_digits(data)
        data = self.rm_newlines(data)
        return data

    def parse_freqs(self, path: str):
        """
        Load one language's frequency table.

        :param path: file name relative to ``self.freqs_folder``,
            e.g. "en.json".
        :returns: the decoded JSON object (ngram -> probability mapping).
        :raises OSError: if the file cannot be read.
        :raises json.JSONDecodeError: if the file is not valid JSON.
        """
        import json
        fullpath = self.freqs_folder + path
        with open(fullpath, 'r') as f:
            return json.loads(f.read())

    def pick_ngrams(self, what_grams: int, how_many: int, text: str):
        """
        Sample up to *how_many* distinct n-grams of length *what_grams*
        from random positions in the cleaned-up *text*.

        :returns: a list of unique n-grams; may be shorter than
            *how_many* if the attempt budget runs out first.
        :raises TypeError: on wrongly-typed arguments.
        :raises ValueError: on out-of-range arguments or too-short text.
        """
        from random import randint
        if not isinstance(what_grams, int):
            raise TypeError("what_grams has to be an int")
        if not isinstance(how_many, int):
            raise TypeError("how_many has to be an int")
        if not isinstance(text, str):
            raise TypeError("text has to be a str")
        if what_grams <= 0:
            raise ValueError("this is bogus, give me a number from ")
        elif what_grams > 5:
            raise ValueError("not doing larger-grams than 5")
        if how_many <= 0:
            raise ValueError("how_many ought to be larger than 0")
        if len(text) <= 10:
            raise ValueError("not doing anything with text shorter than 10 characters")
        text = self.text_clean_up(text)
        t_len = len(text)
        # list of random n-grams
        r_ngrams = []
        # attempt budget, scaled with the text length, so a short or very
        # repetitive text cannot spin the sampling loop forever
        insanity_threshold = t_len * 20
        sanity_ctr = 0
        while len(r_ngrams) < how_many and sanity_ctr < insanity_threshold:
            # BUG FIX: "++sanity_ctr" was a no-op (double unary plus) and
            # was skipped by the `continue` below, so the loop guard never
            # advanced; count every attempt to guarantee termination
            sanity_ctr += 1
            # not truly random, but hey..
            r_position = randint(0, t_len - 1)
            # BUG FIX: off-by-one — a window is valid as long as it ends
            # at or before the end of the text
            if r_position + what_grams > t_len:
                continue
            candidate_ngram = text[r_position:r_position + what_grams]
            if candidate_ngram not in r_ngrams:
                r_ngrams.append(candidate_ngram)
        return r_ngrams

    # return a probability list for a list of ngrams and a given language
    def gimme_probabilities(self, lang_probs: dict, ngrams: list):
        """
        Look each n-gram up in one language's frequency table.

        :param lang_probs: ngram -> probability mapping for one language.
        :param ngrams: n-grams to look up.
        :returns: list of probabilities, with misses replaced by a tiny
            non-zero stand-in (see replace_nones).
        :raises TypeError: on wrongly-typed arguments.
        :raises ValueError: on empty arguments.
        """
        if not isinstance(lang_probs, dict):
            raise TypeError("lang_probs has to be a dict")
        if not isinstance(ngrams, list):
            raise TypeError("ngrams has to be a list")
        if len(lang_probs) == 0:
            raise ValueError("empty lang_probs dict")
        if len(ngrams) == 0:
            raise ValueError("empty ngrams list")
        # dict.get yields None for unseen n-grams, hence "uncleansed"
        uncleansed_probabilities = [lang_probs.get(ngram) for ngram in ngrams]
        return self.replace_nones(uncleansed_probabilities)

    def replace_nones(self, probabilities: list):
        """Replace None entries (n-grams unseen in the frequency table)
        with a tiny non-zero probability so that log() stays finite."""
        if not isinstance(probabilities, list):
            raise TypeError("not a list, bailing")
        if len(probabilities) == 0:
            raise ValueError("empty list, bailing")
        # a pretty good bogus probability is a one close™ reasonably to zero
        return [0.000000001 if n is None else n for n in probabilities]

    # return proper probabilities for multiple languages
    def gimme_probabs_multi_lang(
        self,
        langs: list,
        txt: str,
        what_grams: int,
        how_many: int
    ):
        """
        Score *txt* against several languages at once.

        :param langs: list of per-language frequency dicts; falls back to
            ``self.langs_to_check`` when empty.
        :returns: one probability list per language, all computed over the
            same n-gram sample.
        """
        if len(langs) == 0:
            # fallback
            langs = self.langs_to_check
        # only pick n-grams once per pass, so every language is scored
        # against an identical sample
        self.da_ngrams = self.pick_ngrams(what_grams, how_many, txt)
        probabs = []
        for lang in langs:
            probabs.append(
                self.gimme_probabilities(
                    lang,
                    self.da_ngrams
                )
            )
        return probabs

    def attempt_detection(self, probabs: list, lang_list: list):
        """
        Turn per-language probability lists into a normalised score per
        language, print a report and return the scores.

        The score is the inverse of |sum of log-probabilities|, normalised
        so that all languages sum to 1 — higher means more likely.

        :param probabs: one probability list per language.
        :param lang_list: language labels, parallel to *probabs*.
        :returns: dict mapping language label -> normalised score.
        :raises TypeError: if either argument is not a list.
        :raises ValueError: if either list is empty.
        """
        import numpy as np
        if not isinstance(probabs, list) or not isinstance(lang_list, list):
            raise TypeError("not a list, bailing")
        elif len(probabs) == 0 or len(lang_list) == 0:
            raise ValueError("empty list, bailing")
        # sum of logs == log of product, but without underflow
        sums = [np.sum(np.log(probab)) for probab in probabs]
        # the log-sums are negative; a smaller magnitude means a better fit,
        # so invert the absolute value to get "bigger is better"
        absolute_sums = [1 / abs(da_sum) for da_sum in sums]
        total = sum(absolute_sums)
        what_we_came_up_with = [da_abs_sum / total for da_abs_sum in absolute_sums]
        print("[*] languages considered: ", end="")
        print(lang_list)
        print("[*] probabilities: ", end="")
        print(what_we_came_up_with)
        max_prob = max(what_we_came_up_with)
        final_results = {}
        for lang, score in zip(lang_list, what_we_came_up_with):
            final_results[lang] = score
        da_winner = max(final_results, key=final_results.get)
        print("[*] the winner is: " + da_winner + " with probability ", end="")
        print(max_prob)
        return final_results

    # this is the method that pulls it all together
    def what_this(self, unknown_text: str, langs_to_check: list):
        """
        Classify *unknown_text* among *langs_to_check* and print a report.

        :param unknown_text: text to classify (longer than 10 characters).
        :param langs_to_check: language codes; falls back to
            ``self.langs_to_check`` when empty.
        :raises TypeError: on wrongly-typed arguments.
        :raises ValueError: on too-short text or a single-language list.
        """
        if not isinstance(unknown_text, str):
            raise TypeError("not a string, bailing")
        if not isinstance(langs_to_check, list):
            raise TypeError("not a list, bailing")
        if len(unknown_text) <= 10:
            raise ValueError("not doing anything with text shorter than 10 characters")
        if len(langs_to_check) == 0:
            print("[i] since no lang preference was provided we are falling "
                  + "back to picking from ", end="")
            print(self.langs_to_check)
            print()
            langs_to_check = self.langs_to_check
        # BUG FIX: this used to be `langs_to_check == 1` (list compared to
        # int, always False), letting single-language lists through
        elif len(langs_to_check) == 1:
            raise ValueError("at least two languages are needed")
        print("[*] unknown text about to be checked:")
        print(unknown_text)
        print()
        the_real_langs_to_check = []
        # a naïve approach, works for this project
        for lang in langs_to_check:
            the_real_langs_to_check.append(
                self.parse_freqs(lang + ".json")
            )
        multi_lang_probabs = self.gimme_probabs_multi_lang(
            the_real_langs_to_check,
            unknown_text, self.what_grams, self.how_many
        )
        res = self.attempt_detection(multi_lang_probabs, langs_to_check)
        print("[*] recap: ", end="")
        print(res)
        print()
        return
# vim: ff=unix noexpandtab