#!/usr/bin/env python3
"""
N-gram based language detector.

This file holds a class which implements a "detector" - a language
classifier - that scores a text against predefined, frequency-analysed
sets of n-grams, one set per candidate language.
"""


class da_detector:
    """Guess the language of a text from randomly sampled character n-grams.

    A handful of random n-grams is picked from the cleaned-up text, each of
    them is looked up in every candidate language's frequency table (a JSON
    file named ``<lang>.json`` inside ``freqs_folder``) and the per-language
    scores are normalised so that they add up to one.

    Attributes:
        da_ngrams: n-grams sampled by the most recent
            ``gimme_probabs_multi_lang`` call.
        what_grams: n-gram size used by ``what_this`` (3 by default).
        how_many: number of distinct n-grams ``what_this`` asks for (30).
        freqs_folder: folder the frequency tables are read from.
        langs_to_check: language codes used when a caller supplies none.
    """

    # stand-in for n-grams missing from a frequency table; it has to stay
    # above zero because the scoring takes the logarithm of every value
    _UNSEEN_PROBABILITY = 0.000000001
    # largest n-gram size pick_ngrams() is willing to produce
    _MAX_NGRAM_SIZE = 5

    def __init__(self, langs_to_check: list = ["sk", "en"]):
        """Set up the detector.

        Args:
            langs_to_check: default language codes, at least two of them;
                to be picked from ["cz", "sk", "de", "en", "fr"].

        Raises:
            TypeError: if ``langs_to_check`` is not a list.
            ValueError: if fewer than two languages are given.
        """
        if not isinstance(langs_to_check, list):
            raise TypeError("not a list, bailing")
        if len(langs_to_check) < 2:
            raise ValueError("too few languages specified")

        self.da_ngrams = []
        self.what_grams = 3
        self.how_many = 30
        self.freqs_folder = "./freqs/"
        # keep a private copy, so that neither the caller's list nor the
        # shared default argument can be mutated through the instance
        self.langs_to_check = list(langs_to_check)

    def rm_interpunction(self, data):
        """Return ``data`` without ASCII punctuation characters."""
        from string import punctuation

        return data.translate(str.maketrans('', '', punctuation))

    def rm_newlines(self, data):
        """Return ``data`` with every newline replaced by a space."""
        return data.replace('\n', ' ')

    def rm_digits(self, data):
        """Return ``data`` without the ASCII digits 0-9."""
        from string import digits

        return data.translate(str.maketrans('', '', digits))

    def text_clean_up(self, data: str):
        """Strip punctuation and digits, then turn newlines into spaces."""
        data = self.rm_interpunction(data)
        data = self.rm_digits(data)
        data = self.rm_newlines(data)
        return data

    def parse_freqs(self, path: str):
        """Load one frequency table from ``freqs_folder``.

        Args:
            path: file name relative to ``freqs_folder``, e.g. "sk.json".

        Returns:
            The decoded JSON document; the rest of the class expects it to
            be a dict mapping n-grams to probabilities.

        Raises:
            OSError: if the file cannot be opened or read.
            ValueError: if the file is not valid UTF-8 encoded JSON.
        """
        import json
        import os.path

        fullpath = os.path.join(self.freqs_folder, path)
        # be explicit about the encoding: the tables hold non-ASCII n-grams
        # and must not depend on the locale of the machine reading them
        with open(fullpath, 'r', encoding="utf-8") as f:
            return json.load(f)

    def _load_freq_tables(self, langs: list):
        """Return the parsed frequency table of every language code given."""
        # a naive approach, works for this project
        return [self.parse_freqs(lang + ".json") for lang in langs]

    def pick_ngrams(self, what_grams: int, how_many: int, text: str):
        """Sample distinct random n-grams from the cleaned-up ``text``.

        Args:
            what_grams: n-gram size, 1 to 5.
            how_many: number of distinct n-grams wanted.
            text: raw text, longer than 10 characters.

        Returns:
            A list of distinct n-grams in the order they were drawn. It is
            shorter than ``how_many`` when the text does not yield enough
            distinct n-grams within the attempt budget.

        Raises:
            TypeError: if an argument has the wrong type.
            ValueError: if an argument is out of range, or the text is too
                short (before or after the clean-up).
        """
        from random import randint

        if not isinstance(what_grams, int):
            raise TypeError("what_grams has to be an int")
        if not isinstance(how_many, int):
            raise TypeError("how_many has to be an int")
        if not isinstance(text, str):
            raise TypeError("text has to be a str")
        if what_grams <= 0:
            raise ValueError("this is bogus, give me a number from ℕ")
        if what_grams > self._MAX_NGRAM_SIZE:
            raise ValueError("not doing larger-grams than 5")
        if how_many <= 0:
            raise ValueError("how_many ought to be larger than 0")
        if len(text) <= 10:
            raise ValueError(
                "not doing anything with text shorter than 10 characters")

        text = self.text_clean_up(text)
        t_len = len(text)
        if t_len < what_grams:
            # the clean-up may have eaten (nearly) everything
            raise ValueError("text too short after clean-up to pick n-grams")

        # the last position a complete n-gram can start at
        last_start = t_len - what_grams
        # how many times to attempt to skin the cat, dynamically set
        # depending on total length of the subject text examined
        insanity_threshold = t_len * 20
        sanity_ctr = 0

        # list of random n-grams, plus a set for cheap duplicate checks
        r_ngrams = []
        already_picked = set()
        while len(r_ngrams) < how_many and sanity_ctr < insanity_threshold:
            # count every attempt; a text with too few distinct n-grams
            # would otherwise keep this loop spinning forever
            sanity_ctr += 1
            # not truly random, but hey..
            r_position = randint(0, last_start)
            candidate_ngram = text[r_position:r_position + what_grams]
            if candidate_ngram not in already_picked:
                already_picked.add(candidate_ngram)
                r_ngrams.append(candidate_ngram)
        return r_ngrams

    # return a probability list for a list of ngrams and a given language
    def gimme_probabilities(self, lang_probs: dict, ngrams: list):
        """Look every n-gram up in one language's frequency table.

        Args:
            lang_probs: mapping of n-gram to probability for one language.
            ngrams: the n-grams to look up.

        Returns:
            One probability per n-gram, in the same order; n-grams that are
            not in the table get a tiny non-zero placeholder.

        Raises:
            TypeError: if an argument has the wrong type.
            ValueError: if either argument is empty.
        """
        if not isinstance(lang_probs, dict):
            raise TypeError("lang_probs has to be a dict")
        if not isinstance(ngrams, list):
            raise TypeError("ngrams has to be a list")
        if len(lang_probs) == 0:
            raise ValueError("empty lang_probs dict")
        if len(ngrams) == 0:
            raise ValueError("empty ngrams list")

        # may contain None values if not found, hence uncleansed
        uncleansed_probabilities = [lang_probs.get(ngram) for ngram in ngrams]
        return self.replace_nones(uncleansed_probabilities)

    def replace_nones(self, probabilities: list):
        """Return a copy of ``probabilities`` with each None replaced.

        Raises:
            TypeError: if ``probabilities`` is not a list.
            ValueError: if ``probabilities`` is empty.
        """
        if not isinstance(probabilities, list):
            raise TypeError("not a list, bailing")
        if len(probabilities) == 0:
            raise ValueError("empty list, bailing")
        # a pretty good bogus probability is one reasonably close to zero
        return [
            self._UNSEEN_PROBABILITY if n is None else n
            for n in probabilities
        ]

    # return proper probabilities for multiple languages
    def gimme_probabs_multi_lang(
            self,
            langs: list,
            txt: str,
            what_grams: int,
            how_many: int
    ):
        """Score one shared sample of n-grams against several languages.

        Args:
            langs: frequency tables (dicts), one per language. When empty,
                the tables of ``self.langs_to_check`` are loaded instead.
            txt: the text to sample n-grams from.
            what_grams: n-gram size.
            how_many: number of distinct n-grams to sample.

        Returns:
            A list with one probability list per language, ordered as
            ``langs``. The sampled n-grams are kept in ``self.da_ngrams``.
        """
        if not langs:
            # fallback: the defaults are language codes, while the scoring
            # below needs the frequency tables themselves
            langs = self._load_freq_tables(self.langs_to_check)

        # only pick n-grams once per pass
        self.da_ngrams = self.pick_ngrams(what_grams, how_many, txt)
        return [
            self.gimme_probabilities(lang, self.da_ngrams) for lang in langs
        ]

    def attempt_detection(self, probabs: list, lang_list: list):
        """Turn per-language n-gram probabilities into normalised scores.

        For every language the logarithms of its probabilities are summed,
        the score is the inverse of the absolute value of that sum, and the
        scores are divided by their total. The considered languages, the
        scores and the winner are printed.

        Args:
            probabs: one list of probabilities per language.
            lang_list: language codes, in the same order as ``probabs``.

        Returns:
            A dict mapping each language code to its normalised score.

        Raises:
            TypeError: if an argument is not a list.
            ValueError: if an argument is empty or the lengths differ.
        """
        import numpy as np

        if not isinstance(probabs, list) or not isinstance(lang_list, list):
            raise TypeError("not a list, bailing")
        if len(probabs) == 0 or len(lang_list) == 0:
            raise ValueError("empty list, bailing")
        if len(probabs) != len(lang_list):
            # pairing them up anyway would drop or misattribute scores
            raise ValueError("probabs and lang_list differ in length")

        sums = [np.sum(np.log(probab)) for probab in probabs]
        # log-sums are negative and closer to zero for a better match, so
        # the inverse of the magnitude grows with the quality of the match
        absolute_sums = [1 / abs(da_sum) for da_sum in sums]
        total = sum(absolute_sums)
        what_we_came_up_with = [
            da_abs_sum / total for da_abs_sum in absolute_sums
        ]

        print("[*] languages considered: ", end="")
        print(lang_list)
        print("[*] probabilities: ", end="")
        print(what_we_came_up_with)

        max_prob = max(what_we_came_up_with)
        final_results = dict(zip(lang_list, what_we_came_up_with))
        da_winner = max(final_results, key=final_results.get)

        print("[*] the winner is: " + da_winner + " with probability ", end="")
        print(max_prob)
        return final_results

    # this is the method that pulls it all together
    def what_this(self, unknown_text: str, langs_to_check: list):
        """Detect the language of ``unknown_text`` and print the outcome.

        Args:
            unknown_text: the text to classify, longer than 10 characters.
            langs_to_check: language codes to choose from; an empty list
                falls back to ``self.langs_to_check``.

        Raises:
            TypeError: if an argument has the wrong type.
            ValueError: if the text is too short or exactly one language
                is given.
        """
        if not isinstance(unknown_text, str):
            raise TypeError("not a string, bailing")
        if not isinstance(langs_to_check, list):
            raise TypeError("not a list, bailing")
        if len(unknown_text) <= 10:
            raise ValueError(
                "not doing anything with text shorter than 10 characters")

        if len(langs_to_check) == 0:
            print("[i] since no lang preference was provided we are falling " +
                  "back to picking from ", end="")
            print(self.langs_to_check)
            print()
            langs_to_check = self.langs_to_check
        elif len(langs_to_check) == 1:
            raise ValueError("at least two languages are needed")

        print("[*] unknown text about to be checked:")
        print(unknown_text)
        print()

        the_real_langs_to_check = self._load_freq_tables(langs_to_check)
        multi_lang_probabs = self.gimme_probabs_multi_lang(
            the_real_langs_to_check,
            unknown_text,
            self.what_grams,
            self.how_many
        )
        res = self.attempt_detection(multi_lang_probabs, langs_to_check)
        print("[*] recap: ", end="")
        print(res)
        print()
        return

# vim: ff=unix expandtab