2021-12-16 04:36:46 +01:00
|
|
|
|
#!/usr/bin/env python3
|
2021-12-16 02:12:42 +01:00
|
|
|
|
"""
|
|
|
|
|
This file holds a class that implements a "detector" - a language classifier -
that works from predefined, frequency-analysed sets of n-grams.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class da_detector:
    """N-gram based language "detector" (language classifier).

    Keeps the list of language codes to check and offers helpers to clean
    lists of n-grams, load frequency tables from JSON files, sample n-grams
    from a text and look the sampled n-grams up in those tables.
    """

    # Stand-in for n-grams that a frequency table does not know: a bogus
    # probability that is reasonably close to (but not exactly) zero.
    _MISSING_PROBABILITY = 0.000000123

    def __init__(self, langs_to_check: list = None):
        """Remember which languages should be checked.

        Args:
            langs_to_check: list of at least two language codes, to be
                picked from ["cz", "sk", "de", "en", "fr"]. Defaults to
                ["sk", "en"] when omitted.

        Raises:
            TypeError: if ``langs_to_check`` is not a list.
            ValueError: if fewer than two languages are given.
        """
        if langs_to_check is None:
            langs_to_check = ["sk", "en"]
        if not isinstance(langs_to_check, list):
            raise TypeError("not a list, bailing")
        if len(langs_to_check) < 2:
            raise ValueError("too few languages specified")
        # Store a copy: gimme_probabs_multi_lang() falls back to this
        # attribute, and later edits to the caller's list must not leak in.
        self.langs_to_check = list(langs_to_check)

    @staticmethod
    def rm_interpunction(data):
        """Return a new list with ASCII punctuation removed from each n-gram.

        Args:
            data: iterable of strings.

        Returns:
            A list of the same length; entries may become empty strings.
        """
        from string import punctuation

        # Build the translation table once instead of once per n-gram.
        table = str.maketrans('', '', punctuation)
        return [ngram.translate(table) for ngram in data]

    @staticmethod
    def rm_digits(data):
        """Return a new list with the digits 0-9 removed from each n-gram.

        Args:
            data: iterable of strings.

        Returns:
            A list of the same length; entries may become empty strings.
        """
        from string import digits

        # Python 3's str.translate takes a single table argument; the old
        # two-argument form raised TypeError here.
        table = str.maketrans('', '', digits)
        return [ngram.translate(table) for ngram in data]

    def parse_freqs(self, path: str):
        """Load and decode one JSON file from the frequencies folder.

        Args:
            path: file name relative to the module-level ``freqs_folder``.

        Returns:
            The decoded JSON document (presumably an n-gram -> frequency
            mapping; the shape is not verified here).

        Raises:
            OSError: if the file cannot be opened.
            ValueError: if the content is not valid JSON.
        """
        import json

        fullpath = freqs_folder + path
        # JSON is UTF-8; do not depend on the locale's default encoding,
        # the tables may hold non-ASCII n-grams.
        with open(fullpath, 'r', encoding='utf-8') as f:
            return json.load(f)

    def pick_ngrams(self, what_grams: int, how_many: int, text: str):
        """Randomly sample distinct n-grams from ``text``.

        Args:
            what_grams: size of the n-grams, 1 to 5.
            how_many: number of distinct n-grams wanted, at least 1.
            text: text to sample from, longer than 10 characters.

        Returns:
            A list of up to ``how_many`` distinct n-grams. The list is
            shorter when the attempt budget (20 draws per character of
            ``text``) runs out first, e.g. because the text does not
            contain that many distinct n-grams.

        Raises:
            TypeError: if an argument has the wrong type.
            ValueError: if an argument is out of the accepted range.
        """
        from random import randint

        if not isinstance(what_grams, int):
            raise TypeError("what_grams has to be an int")
        if not isinstance(how_many, int):
            raise TypeError("how_many has to be an int")
        if not isinstance(text, str):
            raise TypeError("text has to be a str")

        if what_grams <= 0:
            raise ValueError("this is bogus, give me a number from ℕ")
        if what_grams > 5:
            raise ValueError("not doing larger-grams than 5")
        if how_many <= 0:
            raise ValueError("how_many ought to be larger than 0")
        if len(text) <= 10:
            raise ValueError("not doing anything with text shorter than 10 characters")

        t_len = len(text)
        # Last index at which a complete n-gram still fits into the text.
        last_start = t_len - what_grams
        # How many times to attempt to skin the cat, set dynamically
        # depending on the total length of the examined text.
        insanity_threshold = t_len * 20

        r_ngrams = []
        seen = set()
        # A bounded for-loop guarantees termination; the former `++ctr`
        # was a no-op in Python, so the cap never triggered.
        for _ in range(insanity_threshold):
            if len(r_ngrams) >= how_many:
                break
            # not truly random, but hey..
            r_position = randint(0, last_start)
            candidate_ngram = text[r_position:r_position + what_grams]
            if candidate_ngram not in seen:
                seen.add(candidate_ngram)
                r_ngrams.append(candidate_ngram)

        # This method has always echoed its sample to stdout; kept as is.
        print(r_ngrams)
        return r_ngrams

    # return a probability list for a list of ngrams and a given language
    def gimme_probabilities(self, lang_probs: dict, ngrams: list):
        """Look up each n-gram in one language's frequency table.

        Args:
            lang_probs: non-empty dict keyed by n-gram.
            ngrams: non-empty list of n-grams to look up.

        Returns:
            A list parallel to ``ngrams``; n-grams that are missing (or
            mapped to None) get the near-zero placeholder value.

        Raises:
            TypeError: if an argument has the wrong type.
            ValueError: if an argument is empty.
        """
        if not isinstance(lang_probs, dict):
            raise TypeError("lang_probs has to be a dict")
        if not isinstance(ngrams, list):
            raise TypeError("ngrams has to be a list")

        if not lang_probs:
            raise ValueError("empty lang_probs dict")
        if not ngrams:
            raise ValueError("empty ngrams list")

        # may contain None values if not found, hence uncleansed
        uncleansed_probabilities = [lang_probs.get(ngram) for ngram in ngrams]
        return self.replace_nones(uncleansed_probabilities)

    def replace_nones(self, probabilities: list):
        """Return a copy of ``probabilities`` with None entries replaced.

        Args:
            probabilities: non-empty list of values, possibly with Nones.

        Returns:
            A new list where every None became the near-zero placeholder.

        Raises:
            TypeError: if ``probabilities`` is not a list.
            ValueError: if ``probabilities`` is empty.
        """
        if not isinstance(probabilities, list):
            raise TypeError("not a list, bailing")
        if not probabilities:
            raise ValueError("empty list, bailing")

        return [
            self._MISSING_PROBABILITY if n is None else n
            for n in probabilities
        ]

    # return proper probabilities for multiple languages
    def gimme_probabs_multi_lang(
        self,
        langs: list,
        txt: str,
        what_grams: int,
        how_many: int
    ):
        """Score one random n-gram sample of ``txt`` against several languages.

        Args:
            langs: languages to score against. Each entry is either a
                frequency dict or a language code (str), which is loaded
                through ``parse_freqs("<code>.json")``. When empty or
                None, ``self.langs_to_check`` is used instead.
            txt: text to sample n-grams from (see ``pick_ngrams``).
            what_grams: n-gram size (see ``pick_ngrams``).
            how_many: number of n-grams to sample (see ``pick_ngrams``).

        Returns:
            One probability list per language, in the order of ``langs``.
        """
        if not langs:
            # fallback
            langs = self.langs_to_check

        # Sample once so that every language is scored on the very same
        # n-grams; separate samples per language cannot be compared.
        ngrams = self.pick_ngrams(what_grams, how_many, txt)

        probabs = []
        for lang in langs:
            if isinstance(lang, str):
                # a language code: resolve it to its frequency table
                lang = self.parse_freqs(lang + ".json")
            probabs.append(self.gimme_probabilities(lang, ngrams))
        return probabs
|
|
|
|
|
|
2021-12-16 04:35:13 +01:00
|
|
|
|
|
2021-12-16 04:42:16 +01:00
|
|
|
|
# Folder that da_detector.parse_freqs() prefixes to every file name.
freqs_folder = "./freqs/"
# Sample sentence used for the n-gram picking runs below.
test_str = "what freaking ever, nobody cares one bit of a heck"

detector = da_detector()

# Pick ten 3-grams and then ten 2-grams from the sample sentence;
# pick_ngrams() prints each list it returns.
detector.pick_ngrams(3, 10, test_str)
detector.pick_ngrams(2, 10, test_str)

# Load the frequency tables one by one, in the order sk, cz, de, en, fr.
sk_json, cz_json, de_json, en_json, fr_json = (
    detector.parse_freqs(json_name)
    for json_name in ("sk.json", "cz.json", "de.json", "en.json", "fr.json")
)
|
|
|
|
|
|
2021-12-16 04:35:13 +01:00
|
|
|
|
# vim: ff=unix noexpandtab
|