#!/usr/bin/env python3
"""
This module holds a class implementing a "detector" - a language
classifier that scores text against predefined, frequency-analysed
sets of n-grams.
"""


class da_detector:
    def __init__(self, langs_to_check: list = ["sk", "en"]):
        # languages to check, to be picked from
        # ["cz", "sk", "de", "en", "fr"]
        self.da_ngrams = []
        self.what_grams = 3
        self.how_many = 30

        if not isinstance(langs_to_check, list):
            raise TypeError("not a list, bailing")
        if len(langs_to_check) < 2:
            raise ValueError("too few languages specified")
        # keep the caller's choice instead of silently overriding it
        self.langs_to_check = langs_to_check

    def rm_interpunction(self, data):
        from string import punctuation
        # str.translate returns a new string, so collect the stripped
        # n-grams into a fresh list instead of reassigning the loop variable
        return [ngram.translate(str.maketrans('', '', punctuation))
                for ngram in data]

    def rm_digits(self, data):
        from string import digits
        # same approach as rm_interpunction; the two-argument form of
        # translate() was Python 2 only
        return [ngram.translate(str.maketrans('', '', digits))
                for ngram in data]

    def parse_freqs(self, path: str):
        import json
        fullpath = freqs_folder + path
        with open(fullpath, 'r') as f:
            j_data = f.read()
        return json.loads(j_data)

    def pick_ngrams(self, what_grams: int, how_many: int, text: str):
        from random import randint
        if not isinstance(what_grams, int):
            raise TypeError("what_grams has to be an int")
        if not isinstance(how_many, int):
            raise TypeError("how_many has to be an int")
        if not isinstance(text, str):
            raise TypeError("text has to be a str")

        if what_grams <= 0:
            raise ValueError("this is bogus, give me a number from ℕ")
        elif what_grams > 5:
            raise ValueError("not doing larger-grams than 5")
        if how_many <= 0:
            raise ValueError("how_many ought to be larger than 0")
        if len(text) <= 10:
            raise ValueError(
                "not doing anything with text shorter than 10 characters")

        t_len = len(text)
        # list of random n-grams
        r_ngrams = []
        # how many times to attempt to skin the cat, dynamically set
        # depending on total length of the subject text examined
        insanity_threshold = t_len * 20
        sanity_ctr = 0
        while len(r_ngrams) < how_many and sanity_ctr < insanity_threshold:
            # count every attempt so the loop is guaranteed to terminate
            sanity_ctr += 1
            # not truly random, but hey..
            r_position = randint(0, t_len - 1)
            # skip positions too close to the end to yield a full n-gram
            if r_position + what_grams > t_len:
                continue
            candidate_ngram = text[r_position:r_position + what_grams]
            if candidate_ngram not in r_ngrams:
                r_ngrams.append(candidate_ngram)
        return r_ngrams

    # return a probability list for a list of n-grams and a given language
    def gimme_probabilities(self, lang_probs: dict, ngrams: list):
        if not isinstance(lang_probs, dict):
            raise TypeError("lang_probs has to be a dict")
        if not isinstance(ngrams, list):
            raise TypeError("ngrams has to be a list")
        if len(lang_probs) == 0:
            raise ValueError("empty lang_probs dict")
        if len(ngrams) == 0:
            raise ValueError("empty ngrams list")

        # may contain None values if not found, hence uncleansed
        uncleansed_probabilities = [lang_probs.get(ngram) for ngram in ngrams]
        return self.replace_nones(uncleansed_probabilities)

    def replace_nones(self, probabilities: list):
        if not isinstance(probabilities, list):
            raise TypeError("not a list, bailing")
        if len(probabilities) == 0:
            raise ValueError("empty list, bailing")
        # a pretty good bogus probability is one reasonably close to zero
        return [0.000000001 if n is None else n for n in probabilities]

    # return proper probabilities for multiple languages
    def gimme_probabs_multi_lang(
        self, langs: list, txt: str, what_grams: int, how_many: int
    ):
        if len(langs) == 0:
            # fallback
            langs = self.langs_to_check
        probabs = []
        # only pick n-grams once per pass
        self.da_ngrams = self.pick_ngrams(what_grams, how_many, txt)
        for lang in langs:
            # NOTE: assumes one frequency table per language inside
            # freqs_folder, named "<lang>.json"; adjust if the files
            # are laid out differently
            lang_probs = self.parse_freqs(lang + ".json")
            probabs.append(
                self.gimme_probabilities(lang_probs, self.da_ngrams)
            )
        return probabs

    def attempt_detection(self, probabs: list, lang_list: list):
        import numpy as np
        if not isinstance(probabs, list) or not isinstance(lang_list, list):
            raise TypeError("not a list, bailing")
        elif len(probabs) == 0 or len(lang_list) == 0:
            raise ValueError("empty list, bailing")

        sums = []
        absolute_sums = []
        what_we_came_up_with = []
        # sum the log-probabilities per language, then normalise the
        # inverted absolute sums so they add up to 1
        for probab in probabs:
            sums.append(np.sum(np.log(probab)))
        for da_sum in sums:
            absolute_sums.append(1 / abs(da_sum))
        for da_abs_sum in absolute_sums:
            what_we_came_up_with.append(da_abs_sum / sum(absolute_sums))

        print("[*] languages considered: ", end="")
        print(lang_list)
        print("[*] probabilities: ", end="")
        print(what_we_came_up_with)

        max_prob = max(what_we_came_up_with)
        final_results = {}
        for i in range(len(lang_list)):
            final_results[lang_list[i]] = what_we_came_up_with[i]
        da_winner = max(final_results, key=final_results.get)

        print("[*] the winner is: " + da_winner + " with probability ", end="")
        print(max_prob)
        return final_results


# folder with the per-language n-gram frequency JSON files, used by parse_freqs
freqs_folder = "./freqs/"

test_str = "what freaking ever, nobody cares one bit of a heck"
detector = da_detector()

# vim: ff=unix noexpandtab
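
# Hedged usage sketch (not part of the original script): it shows how the
# pieces above are meant to fit together end to end. It assumes freqs_folder
# contains one JSON frequency table per language code, e.g. "./freqs/sk.json"
# and "./freqs/en.json", each mapping n-grams to probabilities - that naming
# scheme is an assumption, not something the script itself guarantees.
if __name__ == "__main__":
    probabs = detector.gimme_probabs_multi_lang(
        detector.langs_to_check,   # e.g. ["sk", "en"]
        test_str,                  # text to classify
        detector.what_grams,       # n-gram size (3)
        detector.how_many,         # how many n-grams to sample (30)
    )
    results = detector.attempt_detection(probabs, detector.langs_to_check)
    print(results)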