#!/usr/bin/env python3
"""Language "detector" built on n-gram frequency tables.

This file holds the class ``da_detector``, a language classifier meant to
learn from predefined, frequency-analysed sets of n-grams that are stored
as JSON files inside ``freqs_folder``.
"""

import json
import os
from random import randint
from string import digits, punctuation

# folder holding the per-language frequency files (e.g. "sk.json");
# read by da_detector.parse_freqs at call time
freqs_folder = "./freqs/"

# sample text used by the demo run at the bottom of this file
test_str = "what freaking ever, nobody cares one bit of a heck"


class da_detector:
    """N-gram based language classifier."""

    # stand-in probability for n-grams missing from a frequency table:
    # reasonably close to zero, yet not zero itself
    _MISSING_PROBABILITY = 0.000000123

    def __init__(self, langs_to_check: list = None):
        """Validate and remember the languages to be checked.

        Args:
            langs_to_check: list of language codes, to be picked from
                ["cz", "sk", "de", "en", "fr"]; defaults to ["sk", "en"].

        Raises:
            TypeError: if langs_to_check is not a list.
            ValueError: if fewer than two languages are specified.
        """
        if langs_to_check is None:
            # built per call so that instances never share one default list
            langs_to_check = ["sk", "en"]
        if not isinstance(langs_to_check, list):
            raise TypeError("not a list, bailing")
        if len(langs_to_check) < 2:
            raise ValueError("too few languages specified")
        # keep a private copy, later changes to the caller's list do not leak in
        self.langs_to_check = list(langs_to_check)

    @staticmethod
    def rm_interpunction(data):
        """Return a new list of the strings in data stripped of punctuation.

        "Punctuation" means the characters of string.punctuation (ASCII
        only). The input itself is left untouched.
        """
        table = str.maketrans('', '', punctuation)
        return [ngram.translate(table) for ngram in data]

    @staticmethod
    def rm_digits(data):
        """Return a new list of the strings in data stripped of ASCII digits.

        The input itself is left untouched.
        """
        table = str.maketrans('', '', digits)
        return [ngram.translate(table) for ngram in data]

    def parse_freqs(self, path: str):
        """Load a JSON frequency file from freqs_folder.

        Args:
            path: file name relative to freqs_folder, e.g. "sk.json";
                os.path.join semantics apply, so an absolute path is
                used as-is.

        Returns:
            The decoded JSON document.

        Raises:
            OSError: if the file cannot be opened or read.
            json.JSONDecodeError: if the file does not contain valid JSON.
        """
        fullpath = os.path.join(freqs_folder, path)
        # explicit encoding: the n-grams are not limited to ASCII and the
        # platform default encoding is not UTF-8 everywhere
        with open(fullpath, 'r', encoding="utf-8") as f:
            return json.load(f)

    def pick_ngrams(self, what_grams: int, how_many: int, text: str):
        """Pick distinct n-grams from pseudo-random positions of text.

        Args:
            what_grams: size of the n-grams, from 1 to 5.
            how_many: number of distinct n-grams wanted, at least 1.
            text: text to sample, has to be longer than 10 characters.

        Returns:
            A list of distinct n-grams in the order they were picked. It
            holds fewer than how_many items when the text does not yield
            enough distinct n-grams within len(text) * 20 attempts.
            The list is also printed to stdout.

        Raises:
            TypeError: if an argument is of a wrong type.
            ValueError: if an argument is out of the accepted range.
        """
        if not isinstance(what_grams, int):
            raise TypeError("what_grams has to be an int")
        if not isinstance(how_many, int):
            raise TypeError("how_many has to be an int")
        if not isinstance(text, str):
            raise TypeError("text has to be a str")
        if what_grams <= 0:
            raise ValueError("this is bogus, give me a number from ℕ")
        elif what_grams > 5:
            raise ValueError("not doing larger-grams than 5")
        if how_many <= 0:
            raise ValueError("how_many ought to be larger than 0")
        # NOTE: a text of exactly 10 characters is rejected as well
        if len(text) <= 10:
            raise ValueError("not doing anything with text shorter than 10 characters")

        t_len = len(text)
        # last index at which a complete n-gram still fits into the text
        last_start = t_len - what_grams
        # list of random n-grams, plus a set for cheap duplicate checks
        r_ngrams = []
        seen = set()
        # how many times to attempt to skin the cat, dynamically set depending
        # on total length of the subject text examined
        insanity_threshold = t_len * 20
        sanity_ctr = 0

        while len(r_ngrams) < how_many and sanity_ctr < insanity_threshold:
            # count every attempt, otherwise a text with too few distinct
            # n-grams would keep this loop spinning forever
            sanity_ctr += 1
            # not truly random, but good enough here
            r_position = randint(0, last_start)
            candidate_ngram = text[r_position:r_position + what_grams]
            if candidate_ngram not in seen:
                seen.add(candidate_ngram)
                r_ngrams.append(candidate_ngram)

        print(r_ngrams)
        return r_ngrams

    def gimme_probabilities(self, lang_probs: dict, ngrams: list):
        """Return a probability list for a list of n-grams and a language.

        Args:
            lang_probs: mapping of n-gram to its probability in a language.
            ngrams: n-grams to look up.

        Returns:
            A list of probabilities in the order of ngrams; n-grams that
            are not found (or map to None) get a tiny stand-in probability.

        Raises:
            TypeError: if lang_probs is not a dict or ngrams is not a list.
            ValueError: if lang_probs or ngrams is empty.
        """
        if not isinstance(lang_probs, dict):
            raise TypeError("lang_probs has to be a dict")
        if not isinstance(ngrams, list):
            raise TypeError("ngrams has to be a list")
        if not lang_probs:
            raise ValueError("empty lang_probs dict")
        if not ngrams:
            raise ValueError("empty ngrams list")

        # may contain None values if not found, hence uncleansed
        uncleansed_probabilities = [lang_probs.get(ngram) for ngram in ngrams]
        return self.replace_nones(uncleansed_probabilities)

    def replace_nones(self, probabilities: list):
        """Return a copy of probabilities with None values replaced.

        Every None is substituted with a tiny stand-in probability
        (0.000000123); all other items are kept as they are.

        Raises:
            TypeError: if probabilities is not a list.
            ValueError: if probabilities is empty.
        """
        if not isinstance(probabilities, list):
            raise TypeError("not a list, bailing")
        if not probabilities:
            raise ValueError("empty list, bailing")
        return [
            self._MISSING_PROBABILITY if n is None else n
            for n in probabilities
        ]


# demo run; guarded so that importing this file neither prints anything
# nor requires the frequency files to be present
if __name__ == "__main__":
    detector = da_detector()
    detector.pick_ngrams(
        what_grams=3,
        how_many=10,
        text=test_str)
    detector.pick_ngrams(
        what_grams=2,
        how_many=10,
        text=test_str)

    sk_json = detector.parse_freqs("sk.json")
    cz_json = detector.parse_freqs("cz.json")
    de_json = detector.parse_freqs("de.json")
    en_json = detector.parse_freqs("en.json")
    fr_json = detector.parse_freqs("fr.json")

# vim: ff=unix expandtab