262 lines
7.0 KiB
Python
Executable File
262 lines
7.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""
|
||
This file holds a class that implements a "detector" (a language classifier)
which classifies text using predefined, frequency-analysed sets of n-grams.
|
||
"""
|
||
|
||
|
||
class da_detector:
    """Naive n-gram based language classifier.

    Random n-grams are picked from the examined text and looked up in
    per-language frequency tables (JSON files named ``<lang>.json`` inside
    ``freqs_folder``).  The per-language lookups are then folded into
    normalised scores and the language with the highest score wins.
    """

    # languages used when the caller does not specify any
    _DEFAULT_LANGS = ("sk", "en")
    # stand-in probability for n-grams missing from a language's table; it
    # has to be larger than zero so that its logarithm stays finite
    _UNSEEN_PROBABILITY = 0.000000001

    def __init__(self, langs_to_check: list = None):
        """Set up the detector.

        Args:
            langs_to_check: fallback language codes (at least two), to be
                picked from ["cz", "sk", "de", "en", "fr"].  Defaults to
                ["sk", "en"] when omitted.

        Raises:
            TypeError: if ``langs_to_check`` is not a list.
            ValueError: if fewer than two languages are given.
        """
        # a fresh list per instance, so that no default is shared between
        # instances
        if langs_to_check is None:
            langs_to_check = list(self._DEFAULT_LANGS)

        if not isinstance(langs_to_check, list):
            raise TypeError("not a list, bailing")
        if len(langs_to_check) < 2:
            raise ValueError("too few languages specified")

        self.da_ngrams = []
        self.what_grams = 3
        self.how_many = 30
        self.freqs_folder = "./freqs/"
        # copy, so that later changes to the caller's list do not leak in
        self.langs_to_check = list(langs_to_check)

    def rm_interpunction(self, data):
        """Return ``data`` with all ASCII punctuation characters removed."""
        from string import punctuation

        return data.translate(str.maketrans('', '', punctuation))

    def rm_newlines(self, data):
        """Return ``data`` with every newline replaced by a single space."""
        return data.replace('\n', ' ')

    def rm_digits(self, data):
        """Return ``data`` with all ASCII digits removed."""
        from string import digits

        return data.translate(str.maketrans('', '', digits))

    def text_clean_up(self, data: str):
        """Strip punctuation and digits from ``data`` and flatten newlines."""
        data = self.rm_interpunction(data)
        data = self.rm_digits(data)
        data = self.rm_newlines(data)
        return data

    def parse_freqs(self, path: str):
        """Load a JSON frequency table from ``freqs_folder``.

        Args:
            path: file name relative to ``self.freqs_folder``.

        Returns:
            The decoded JSON object.

        Raises:
            OSError: if the file cannot be opened or read.
            ValueError: if the content is not valid JSON.
        """
        import json
        import os

        fullpath = os.path.join(self.freqs_folder, path)
        # the tables contain non-ASCII n-grams, so do not depend on the
        # platform's default encoding
        with open(fullpath, 'r', encoding="utf-8") as f:
            return json.load(f)

    def pick_ngrams(self, what_grams: int, how_many: int, text: str):
        """Pick up to ``how_many`` distinct random n-grams from ``text``.

        The text is cleaned up first (see ``text_clean_up``).  Start
        positions are drawn at random, so n-grams occurring more often are
        more likely to be picked.  The number of draws is capped at twenty
        times the length of the cleaned text, hence fewer than ``how_many``
        n-grams (possibly none) may be returned.

        Args:
            what_grams: n-gram size, 1 to 5.
            how_many: number of distinct n-grams wanted, larger than 0.
            text: examined text, longer than 10 characters.

        Returns:
            A list of distinct n-grams in the order they were found.

        Raises:
            TypeError: if an argument is of a wrong type.
            ValueError: if an argument is out of the accepted range.
        """
        from random import randint

        if not isinstance(what_grams, int):
            raise TypeError("what_grams has to be an int")
        if not isinstance(how_many, int):
            raise TypeError("how_many has to be an int")
        if not isinstance(text, str):
            raise TypeError("text has to be a str")

        if what_grams <= 0:
            raise ValueError("this is bogus, give me a number from ℕ")
        elif what_grams > 5:
            raise ValueError("not doing larger-grams than 5")
        if how_many <= 0:
            raise ValueError("how_many ought to be larger than 0")
        if len(text) <= 10:
            raise ValueError("not doing anything with text shorter than 10 characters")

        text = self.text_clean_up(text)
        t_len = len(text)

        # the last position at which a complete n-gram still fits
        last_start = t_len - what_grams
        if last_start < 0:
            # the clean-up left less text than a single n-gram needs
            return []

        # list of random n-grams
        r_ngrams = []
        # how many times to attempt to skin the cat, dynamically set depending
        # on total length of the subject text examined
        insanity_threshold = t_len * 20
        sanity_ctr = 0

        while len(r_ngrams) < how_many and sanity_ctr < insanity_threshold:
            # count every draw - this is what guarantees termination when
            # the text holds fewer distinct n-grams than asked for
            sanity_ctr += 1
            # not truly random, but hey..
            r_position = randint(0, last_start)
            candidate_ngram = text[r_position:r_position + what_grams]
            if candidate_ngram not in r_ngrams:
                r_ngrams.append(candidate_ngram)
        return r_ngrams

    # return a probability list for a list of ngrams and a given language
    def gimme_probabilities(self, lang_probs: dict, ngrams: list):
        """Look up every n-gram of ``ngrams`` in ``lang_probs``.

        Args:
            lang_probs: mapping of n-gram to its value for one language.
            ngrams: n-grams to look up.

        Returns:
            A list of the looked-up values, in the order of ``ngrams``;
            n-grams that are missing (or mapped to None) get a tiny
            placeholder value instead.

        Raises:
            TypeError: if an argument is of a wrong type.
            ValueError: if an argument is empty.
        """
        if not isinstance(lang_probs, dict):
            raise TypeError("lang_probs has to be a dict")
        if not isinstance(ngrams, list):
            raise TypeError("ngrams has to be a list")

        if len(lang_probs) == 0:
            raise ValueError("empty lang_probs dict")
        if len(ngrams) == 0:
            raise ValueError("empty ngrams list")

        # may contain None values if not found, hence uncleansed
        uncleansed_probabilities = [lang_probs.get(ngram) for ngram in ngrams]
        return self.replace_nones(uncleansed_probabilities)

    def replace_nones(self, probabilities: list):
        """Return a copy of ``probabilities`` with every None substituted.

        Raises:
            TypeError: if ``probabilities`` is not a list.
            ValueError: if ``probabilities`` is empty.
        """
        if not isinstance(probabilities, list):
            raise TypeError("not a list, bailing")
        if len(probabilities) == 0:
            raise ValueError("empty list, bailing")
        # a pretty good bogus probability is one reasonably close to zero
        return [
            self._UNSEEN_PROBABILITY if n is None else n
            for n in probabilities
        ]

    # return proper probabilities for multiple languages
    def gimme_probabs_multi_lang(
        self,
        langs: list,
        txt: str,
        what_grams: int,
        how_many: int
    ):
        """Look up one shared set of random n-grams in several languages.

        The picked n-grams are also stored in ``self.da_ngrams``.

        Args:
            langs: frequency tables (dicts), one per language.  When empty,
                the tables of ``self.langs_to_check`` are loaded from
                ``freqs_folder`` instead.
            txt: examined text.
            what_grams: n-gram size, passed on to ``pick_ngrams``.
            how_many: number of n-grams, passed on to ``pick_ngrams``.

        Returns:
            A list holding one list of values per language, in the order
            of ``langs``.
        """
        if not langs:
            # fallback - the language codes have to be turned into their
            # frequency tables, gimme_probabilities only accepts dicts
            langs = [
                self.parse_freqs(lang + ".json")
                for lang in self.langs_to_check
            ]

        # only pick n-grams once per pass
        self.da_ngrams = self.pick_ngrams(what_grams, how_many, txt)
        return [
            self.gimme_probabilities(lang, self.da_ngrams) for lang in langs
        ]

    def attempt_detection(self, probabs: list, lang_list: list):
        """Score the languages and print the outcome.

        For every language the logarithms of its values are summed, the
        inverse of the absolute value of that sum is taken and the results
        are normalised so that they add up to one.

        Args:
            probabs: one list of values per language.
            lang_list: language names, in the same order as ``probabs``.

        Returns:
            A dict mapping each language name to its normalised score.

        Raises:
            TypeError: if an argument is not a list.
            ValueError: if an argument is empty or the lengths differ.
        """
        import numpy as np

        if not isinstance(probabs, list) or not isinstance(lang_list, list):
            raise TypeError("not a list, bailing")
        elif len(probabs) == 0 or len(lang_list) == 0:
            raise ValueError("empty list, bailing")
        if len(probabs) != len(lang_list):
            # the scores could not be paired with the languages otherwise
            raise ValueError("probabs and lang_list differ in length")

        sums = [np.sum(np.log(probab)) for probab in probabs]
        absolute_sums = [1 / abs(da_sum) for da_sum in sums]
        total = sum(absolute_sums)
        what_we_came_up_with = [
            da_abs_sum / total for da_abs_sum in absolute_sums
        ]

        print("[*] languages considered: ", end="")
        print(lang_list)
        print("[*] probabilities: ", end="")
        print(what_we_came_up_with)
        max_prob = max(what_we_came_up_with)

        final_results = dict(zip(lang_list, what_we_came_up_with))
        da_winner = max(final_results, key=final_results.get)

        print("[*] the winner is: " + da_winner + " with probability ", end="")
        print(max_prob)
        return final_results

    # this is the method that pulls it all together
    def what_this(self, unknown_text: str, langs_to_check: list):
        """Detect the language of ``unknown_text`` and print a report.

        Args:
            unknown_text: examined text, longer than 10 characters.
            langs_to_check: language codes to pick from; an empty list
                falls back to ``self.langs_to_check``.

        Returns:
            A dict mapping each considered language to its normalised
            score, as produced by ``attempt_detection``.

        Raises:
            TypeError: if an argument is of a wrong type.
            ValueError: if the text is too short or exactly one language
                is given.
        """
        if not isinstance(unknown_text, str):
            raise TypeError("not a string, bailing")
        if not isinstance(langs_to_check, list):
            raise TypeError("not a list, bailing")

        if len(unknown_text) <= 10:
            raise ValueError("not doing anything with text shorter than 10 characters")
        if len(langs_to_check) == 0:
            print("[i] since no lang preference was provided we are falling "
                  + "back to picking from ", end="")
            print(self.langs_to_check)
            print()
            langs_to_check = self.langs_to_check
        elif len(langs_to_check) == 1:
            raise ValueError("at least two languages are needed")

        print("[*] unknown text about to be checked:")
        print(unknown_text)
        print()

        # a naive approach, works for this project
        the_real_langs_to_check = [
            self.parse_freqs(lang + ".json") for lang in langs_to_check
        ]

        multi_lang_probabs = self.gimme_probabs_multi_lang(
            the_real_langs_to_check,
            unknown_text, self.what_grams, self.how_many
        )

        res = self.attempt_detection(multi_lang_probabs, langs_to_check)
        print("[*] recap: ", end="")
        print(res)
        print()

        return res
|
||
|
||
# vim: ff=unix noexpandtab
|