su_task3-nlp/da_detector.py

#!/usr/bin/env python3
"""
this file holds a class implementing a "detector" - a language classifier
that learns from predefined, frequency-analysed sets of ngrams
"""


class da_detector:
    def __init__(self, langs_to_check: list = ["sk", "en"]):
        # langs to check
        # to be picked from ["cz", "sk", "de", "en", "fr"]
        if not isinstance(langs_to_check, list):
            raise TypeError("not a list, bailing")
        if (len(langs_to_check) < 2):
            raise ValueError("too few languages specified")
        # remember the validated selection for later use
        self.langs_to_check = langs_to_check

    @staticmethod
    def rm_interpunction(data):
        from string import punctuation
        # str.translate returns a new string, so collect the cleaned ngrams
        # into a fresh list instead of reassigning the loop variable
        return [ngram.translate(str.maketrans('', '', punctuation))
                for ngram in data]

    @staticmethod
    def rm_digits(data):
        from string import digits
        # python3's str.translate takes a single table built by str.maketrans
        return [ngram.translate(str.maketrans('', '', digits))
                for ngram in data]

    def parse_freqs(self, path: str):
        # read a frequency-analysed ngram set (json) from the module-level
        # freqs_folder and return the parsed object
        import json
        fullpath = freqs_folder + path
        try:
            with open(fullpath, 'r') as f:
                j_data = f.read()
        except Exception as e:
            raise e
        try:
            obj = json.loads(j_data)
        except Exception as e:
            raise e
        return obj

    def pick_ngrams(self, what_grams: int, how_many: int, text: str):
        from random import randint
        if not isinstance(what_grams, int):
            raise TypeError("what_grams has to be an int")
        if not isinstance(how_many, int):
            raise TypeError("how_many has to be an int")
        if not isinstance(text, str):
            raise TypeError("text has to be a str")
        if (what_grams <= 0):
            raise ValueError("this is bogus, give me a number from 1 to 5")
        elif (what_grams > 5):
            raise ValueError("not doing larger-grams than 5")
        if (how_many <= 0):
            raise ValueError("how_many ought to be larger than 0")
        if (len(text) <= 10):
            raise ValueError("not doing anything with text shorter than 10 characters")
        t_len = len(text)
        # list of random n-grams
        r_ngrams = []
        # how many times to attempt to skin the cat, dynamically set
        # depending on total length of the subject text examined
        insanity_threshold = t_len * 20
        sanity_ctr = 0
        while (len(r_ngrams) < how_many and sanity_ctr < insanity_threshold):
            # count every attempt, including out-of-range positions, so the
            # threshold actually bounds the loop
            sanity_ctr += 1
            # not truly random, but hey..
            r_position = randint(0, t_len - 1)
            if (r_position + what_grams >= (t_len - 1)):
                continue
            candidate_ngram = text[r_position:r_position + what_grams]
            if (candidate_ngram not in r_ngrams):
                r_ngrams.append(candidate_ngram)
        print(r_ngrams)
        return r_ngrams

    # return a probability list for a list of ngrams and a given language
    def gimme_probabilities(self, lang_probs: dict, ngrams: list):
        if not isinstance(lang_probs, dict):
            raise TypeError("lang_probs has to be a dict")
        if not isinstance(ngrams, list):
            raise TypeError("ngrams has to be a list")
        if len(lang_probs) == 0:
            raise ValueError("empty lang_probs dict")
        if len(ngrams) == 0:
            raise ValueError("empty ngrams list")
        # may contain None values if not found, hence uncleansed
        uncleansed_probabilities = []
        for ngram in ngrams:
            uncleansed_probabilities.append(lang_probs.get(ngram))
        return self.replace_nones(uncleansed_probabilities)

    def replace_nones(self, probabilities: list):
        if not isinstance(probabilities, list):
            raise TypeError("not a list, bailing")
        if len(probabilities) == 0:
            raise ValueError("empty list, bailing")
        # a pretty good bogus probability is one reasonably close to zero
        return [0.000000123 if n is None else n for n in probabilities]


freqs_folder = "./freqs/"
test_str = "what freaking ever, nobody cares one bit of a heck"
detector = da_detector()
detector.pick_ngrams(
    what_grams=3,
    how_many=10,
    text=test_str)
detector.pick_ngrams(
    what_grams=2,
    how_many=10,
    text=test_str)
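
# a small usage sketch: the cleaning helpers rm_interpunction()/rm_digits()
# can be chained onto a freshly picked batch of ngrams; this block is
# illustrative only and simply reuses detector and test_str from above
sample_ngrams = detector.pick_ngrams(
    what_grams=3,
    how_many=5,
    text=test_str)
sample_ngrams = detector.rm_interpunction(sample_ngrams)
sample_ngrams = detector.rm_digits(sample_ngrams)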
sk_json = detector.parse_freqs("sk.json")
cz_json = detector.parse_freqs("cz.json")
de_json = detector.parse_freqs("de.json")
en_json = detector.parse_freqs("en.json")
fr_json = detector.parse_freqs("fr.json")
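
# a sketch of how the loaded frequency tables could drive the actual
# classification with gimme_probabilities(); it assumes every *_json object
# is a flat dict mapping ngrams to positive relative frequencies - if the
# files are shaped differently, the lookup below needs adjusting
from math import log

candidate_ngrams = detector.pick_ngrams(
    what_grams=2,
    how_many=10,
    text=test_str)
for lang_name, lang_probs in [("sk", sk_json), ("en", en_json)]:
    probs = detector.gimme_probabilities(lang_probs, candidate_ngrams)
    # replace_nones() already swapped unseen ngrams for a tiny probability,
    # so every value is safe to pass to log(); a higher (less negative)
    # summed log-probability means a better fit for that language
    score = sum(log(p) for p in probs)
    print(lang_name, score)
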
# vim: ff=unix noexpandtab