This repository has been archived on 2022-02-08. You can view files and clone it, but cannot push or open issues or pull requests.
su_task3-nlp/da_detector.py

160 lines
4.3 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
this file holds a class, which implements a "detector" - language classifier -
that learns from predefined, frequency-analysed sets of ngrams
"""
class da_detector:
    """Language "detector" that scores text against n-gram frequency tables.

    The detector picks random n-grams from a text and looks each of them up
    in per-language frequency tables (dicts mapping an n-gram to its
    probability).  N-grams missing from a table get a tiny non-zero
    probability instead of ``None``.
    """

    # stand-in probability for n-grams that are absent from a frequency
    # table: reasonably close to zero, yet not zero
    _MISSING_PROBABILITY = 0.000000001

    def __init__(self, langs_to_check: list = ["sk", "en"]):
        """Set up the detector.

        Args:
            langs_to_check: languages to check; meant to be picked from
                ["cz", "sk", "de", "en", "fr"] (membership is not enforced).
                At least two entries are required.

        Raises:
            TypeError: if ``langs_to_check`` is not a list.
            ValueError: if fewer than two languages are given.
        """
        if not isinstance(langs_to_check, list):
            raise TypeError("not a list, bailing")
        if len(langs_to_check) < 2:
            raise ValueError("too few languages specified")
        # n-grams picked during the last gimme_probabs_multi_lang() pass
        self.da_ngrams = []
        self.what_grams = 3
        self.how_many = 30
        # store what the caller asked for; copy it so that neither the
        # caller's list nor the shared default list is aliased by the instance
        self.langs_to_check = list(langs_to_check)

    @staticmethod
    def rm_interpunction(data):
        """Return a new list with ASCII punctuation stripped from every n-gram.

        Args:
            data: iterable of strings.

        Returns:
            list of the strings from ``data`` without any character found in
            ``string.punctuation``; ``data`` itself is left untouched.
        """
        from string import punctuation
        # build the translation table once instead of once per n-gram
        table = str.maketrans('', '', punctuation)
        return [ngram.translate(table) for ngram in data]

    @staticmethod
    def rm_digits(data):
        """Return a new list with ASCII digits stripped from every n-gram.

        Args:
            data: iterable of strings.

        Returns:
            list of the strings from ``data`` without any character found in
            ``string.digits``; ``data`` itself is left untouched.
        """
        from string import digits
        # str.translate() takes a single table argument in Python 3
        table = str.maketrans('', '', digits)
        return [ngram.translate(table) for ngram in data]

    def parse_freqs(self, path: str):
        """Load a JSON document from the frequencies folder.

        Args:
            path: file name, appended verbatim to the module-level
                ``freqs_folder`` prefix.

        Returns:
            The decoded JSON object (presumably an n-gram -> frequency
            mapping - verify against the data files).

        Raises:
            OSError: if the file cannot be opened or read.
            json.JSONDecodeError: if the content is not valid JSON.
        """
        import json
        fullpath = freqs_folder + path
        # JSON is UTF-8 by definition; do not depend on the locale encoding
        with open(fullpath, 'r', encoding='utf-8') as f:
            return json.load(f)

    def pick_ngrams(self, what_grams: int, how_many: int, text: str):
        """Pick up to ``how_many`` distinct random n-grams out of ``text``.

        Args:
            what_grams: n-gram size, 1 to 5.
            how_many: number of distinct n-grams wanted, larger than 0.
            text: text to sample from; has to be longer than 10 characters.

        Returns:
            list of distinct n-grams in the order they were drawn.  It may
            hold fewer than ``how_many`` items when the text does not yield
            enough distinct n-grams within the attempt budget.

        Raises:
            TypeError: if an argument has the wrong type.
            ValueError: if an argument is out of range or the text is too
                short.
        """
        from random import randint
        if not isinstance(what_grams, int):
            raise TypeError("what_grams has to be an int")
        if not isinstance(how_many, int):
            raise TypeError("how_many has to be an int")
        if not isinstance(text, str):
            raise TypeError("text has to be a str")
        if what_grams <= 0:
            raise ValueError("this is bogus, give me a number from 1 to 5")
        elif what_grams > 5:
            raise ValueError("not doing larger-grams than 5")
        if how_many <= 0:
            raise ValueError("how_many ought to be larger than 0")
        if len(text) <= 10:
            raise ValueError("not doing anything with text shorter than 10 characters")
        t_len = len(text)
        # last start index that still yields a full n-gram
        last_start = t_len - what_grams
        # how many times to attempt a draw, dynamically set depending on the
        # total length of the examined text
        insanity_threshold = t_len * 20
        # list of random n-grams, plus a set for cheap duplicate checks
        r_ngrams = []
        seen = set()
        sanity_ctr = 0
        while len(r_ngrams) < how_many and sanity_ctr < insanity_threshold:
            # count every draw so that the loop always terminates, even when
            # the text has fewer distinct n-grams than requested
            sanity_ctr += 1
            # not truly random, but hey..
            r_position = randint(0, last_start)
            candidate_ngram = text[r_position:r_position + what_grams]
            if candidate_ngram not in seen:
                seen.add(candidate_ngram)
                r_ngrams.append(candidate_ngram)
        return r_ngrams

    # return a probability list for a list of ngrams and a given language
    def gimme_probabilities(self, lang_probs: dict, ngrams: list):
        """Look up the probability of every n-gram in one frequency table.

        Args:
            lang_probs: non-empty dict mapping n-grams to probabilities.
            ngrams: non-empty list of n-grams to look up.

        Returns:
            list of probabilities, one per n-gram and in the same order;
            n-grams that are missing get the stand-in probability.

        Raises:
            TypeError: if an argument has the wrong type.
            ValueError: if an argument is empty.
        """
        if not isinstance(lang_probs, dict):
            raise TypeError("lang_probs has to be a dict")
        if not isinstance(ngrams, list):
            raise TypeError("ngrams has to be a list")
        if len(lang_probs) == 0:
            raise ValueError("empty lang_probs dict")
        if len(ngrams) == 0:
            raise ValueError("empty ngrams list")
        # may contain None values if not found, hence uncleansed
        uncleansed_probabilities = [lang_probs.get(ngram) for ngram in ngrams]
        return self.replace_nones(uncleansed_probabilities)

    def replace_nones(self, probabilities: list):
        """Swap every ``None`` in ``probabilities`` for the stand-in value.

        Args:
            probabilities: non-empty list of probabilities and/or ``None``.

        Returns:
            new list where each ``None`` is replaced by 0.000000001.

        Raises:
            TypeError: if ``probabilities`` is not a list.
            ValueError: if ``probabilities`` is empty.
        """
        if not isinstance(probabilities, list):
            raise TypeError("not a list, bailing")
        if len(probabilities) == 0:
            raise ValueError("empty list, bailing")
        return [
            self._MISSING_PROBABILITY if n is None else n
            for n in probabilities
        ]

    # return proper probabilities for multiple languages
    def gimme_probabs_multi_lang(
        self,
        langs: list,
        txt: str,
        what_grams: int,
        how_many: int
    ):
        """Score one set of random n-grams against several languages.

        The n-grams are picked once per call (and remembered in
        ``self.da_ngrams``) so that every language is scored on the same
        sample.

        Args:
            langs: list of frequency tables, each handed to
                gimme_probabilities() as ``lang_probs``.  An empty list
                falls back to ``self.langs_to_check``.
            txt: text to sample n-grams from.
            what_grams: n-gram size, see pick_ngrams().
            how_many: number of n-grams to pick, see pick_ngrams().

        Returns:
            list with one probability list per entry of ``langs``.

        Raises:
            TypeError, ValueError: propagated from pick_ngrams() and
                gimme_probabilities().
        """
        if len(langs) == 0:
            # fallback
            # NOTE(review): gimme_probabilities() insists on dicts, so this
            # fallback raises TypeError whenever langs_to_check holds plain
            # language codes - confirm what was intended here
            langs = self.langs_to_check
        # only pick n-grams once per pass
        self.da_ngrams = self.pick_ngrams(what_grams, how_many, txt)
        return [
            self.gimme_probabilities(lang, self.da_ngrams)
            for lang in langs
        ]
# folder prefix that da_detector.parse_freqs() prepends verbatim to the file
# name it is given (hence the trailing slash)
freqs_folder = "./freqs/"
# sample English sentence; not referenced anywhere else in the visible code
test_str = "what freaking ever, nobody cares one bit of a heck"
# module-level detector instance, created at import time with the default
# arguments
detector = da_detector()
# vim: ff=unix noexpandtab