su_task3-nlp/da_detector.py

#!/usr/bin/env python3
"""
This file holds a class that implements a "detector" (a language classifier)
that learns from predefined, frequency-analysed sets of n-grams.
"""


class da_detector:
	"""Language "detector" working on frequency-analysed sets of n-grams.

	The class samples random n-grams from a text (pick_ngrams) and looks up
	their probabilities in per-language frequency dicts
	(gimme_probabilities, gimme_probabs_multi_lang).

	Attributes:
		da_ngrams: n-grams picked by the last gimme_probabs_multi_lang call.
		what_grams: default n-gram size (3).
		how_many: default number of n-grams to pick (30).
		langs_to_check: language codes this detector was configured with.
	"""

	# stand-in probability for n-grams missing from a frequency dict; close
	# to zero, but not zero
	_BOGUS_PROBABILITY = 0.000000001

	def __init__(self, langs_to_check: list = None):
		"""Set up the detector.

		Args:
			langs_to_check: list of at least two language codes, to be picked
				from ["cz", "sk", "de", "en", "fr"]. Defaults to
				["sk", "en"] when omitted.

		Raises:
			TypeError: if langs_to_check is not a list.
			ValueError: if fewer than two languages are given.
		"""
		self.da_ngrams = []
		self.what_grams = 3
		self.how_many = 30

		# None sentinel instead of a mutable default, so instances never
		# share one list object
		if langs_to_check is None:
			langs_to_check = ["sk", "en"]
		if not isinstance(langs_to_check, list):
			raise TypeError("not a list, bailing")
		if len(langs_to_check) < 2:
			raise ValueError("too few languages specified")
		# keep a private copy of what the caller actually asked for
		self.langs_to_check = list(langs_to_check)

	@staticmethod
	def rm_interpunction(data):
		"""Return data with all ASCII punctuation characters removed.

		Args:
			data: a str, or an iterable of str n-grams.

		Returns:
			A cleaned str when data is a str, otherwise a new list with one
			cleaned string per input item (items may end up empty).
		"""
		from string import punctuation
		table = str.maketrans('', '', punctuation)
		if isinstance(data, str):
			return data.translate(table)
		# strings are immutable, so build a new list instead of rebinding
		# the loop variable
		return [ngram.translate(table) for ngram in data]

	@staticmethod
	def rm_digits(data):
		"""Return data with all ASCII digits removed.

		Args:
			data: a str, or an iterable of str n-grams.

		Returns:
			A cleaned str when data is a str, otherwise a new list with one
			cleaned string per input item (items may end up empty).
		"""
		from string import digits
		# str.translate takes a single table argument in Python 3
		table = str.maketrans('', '', digits)
		if isinstance(data, str):
			return data.translate(table)
		return [ngram.translate(table) for ngram in data]

	def parse_freqs(self, path: str):
		"""Load a JSON frequency file and return the decoded object.

		Args:
			path: file name, appended as-is to the module-level freqs_folder.

		Returns:
			Whatever the JSON document decodes to.

		Raises:
			OSError: if the file cannot be opened or read.
			ValueError: if the content is not valid JSON (json.JSONDecodeError
				is a ValueError subclass) or not valid UTF-8.
		"""
		import json

		fullpath = freqs_folder + path
		# explicit encoding: do not depend on the platform locale for
		# n-grams that contain non-ASCII characters
		with open(fullpath, 'r', encoding='utf-8') as f:
			return json.load(f)

	def pick_ngrams(self, what_grams: int, how_many: int, text: str):
		"""Pick up to how_many distinct random n-grams from text.

		Args:
			what_grams: n-gram size, 1 to 5.
			how_many: number of distinct n-grams wanted, at least 1.
			text: text to sample from, longer than 10 characters.

		Returns:
			A list of distinct n-grams in the order they were first drawn.
			It is shorter than how_many when the text does not yield enough
			distinct n-grams within the attempt budget.

		Raises:
			TypeError: if an argument has the wrong type.
			ValueError: if an argument is out of range.
		"""
		from random import randint
		if not isinstance(what_grams, int):
			raise TypeError("what_grams has to be an int")
		if not isinstance(how_many, int):
			raise TypeError("how_many has to be an int")
		if not isinstance(text, str):
			raise TypeError("text has to be a str")

		if (what_grams <= 0):
			raise ValueError("this is bogus, give me a number from ℕ")
		elif (what_grams > 5):
			raise ValueError("not doing larger-grams than 5")
		if (how_many <= 0):
			raise ValueError("how_many ought to be larger than 0")
		if (len(text) <= 10):
			raise ValueError("not doing anything with text shorter than 10 characters")

		t_len = len(text)
		# last start position that still leaves room for a full n-gram;
		# always >= 0 because the text is longer than 10 and n is at most 5
		last_start = t_len - what_grams
		# list of random n-grams, plus a set for cheap duplicate checks
		r_ngrams = []
		seen = set()
		# how many times to attempt to skin the cat, dynamically set depending
		# on total length of the subject text examined
		insanity_threshold = t_len * 20
		sanity_ctr = 0

		while (len(r_ngrams) < how_many and sanity_ctr < insanity_threshold):
			# count every attempt, so the loop always terminates even when
			# the text holds fewer than how_many distinct n-grams
			sanity_ctr += 1
			# not truly random, but hey..
			r_position = randint(0, last_start)
			candidate_ngram = text[r_position:r_position + what_grams]
			if candidate_ngram not in seen:
				seen.add(candidate_ngram)
				r_ngrams.append(candidate_ngram)
		return r_ngrams

	# return a probability list for a list of ngrams and a given language
	def gimme_probabilities(self, lang_probs: dict, ngrams: list):
		"""Look up the probability of every n-gram in one language.

		Args:
			lang_probs: non-empty dict mapping n-gram to probability.
			ngrams: non-empty list of n-grams to look up.

		Returns:
			A list parallel to ngrams; n-grams missing from lang_probs get
			the near-zero stand-in used by replace_nones.

		Raises:
			TypeError: if lang_probs is not a dict or ngrams is not a list.
			ValueError: if either argument is empty.
		"""
		if not isinstance(lang_probs, dict):
			raise TypeError("lang_probs has to be a dict")
		if not isinstance(ngrams, list):
			raise TypeError("ngrams has to be a list")

		if len(lang_probs) == 0:
			raise ValueError("empty lang_probs dict")
		if len(ngrams) == 0:
			raise ValueError("empty ngrams list")

		# dict.get yields None for unknown n-grams; those are replaced below
		uncleansed_probabilities = [lang_probs.get(ngram) for ngram in ngrams]
		return self.replace_nones(uncleansed_probabilities)

	def replace_nones(self, probabilities: list):
		"""Return a copy of probabilities with None replaced by 0.000000001.

		Raises:
			TypeError: if probabilities is not a list.
			ValueError: if probabilities is empty.
		"""
		if not isinstance(probabilities, list):
			raise TypeError("not a list, bailing")
		if len(probabilities) == 0:
			raise ValueError("empty list, bailing")
		# a pretty good bogus probability is one reasonably close to zero
		return [
			self._BOGUS_PROBABILITY if n is None else n
			for n in probabilities
		]

	# return proper probabilities for multiple languages
	def gimme_probabs_multi_lang(
		self,
		langs: list,
		txt: str,
		what_grams: int,
		how_many: int
	):
		"""Return one probability list per language for n-grams from txt.

		The n-grams are picked once, stored in self.da_ngrams, and reused for
		every language so the resulting lists are comparable.

		Args:
			langs: list whose entries are passed to gimme_probabilities as
				lang_probs, so each must be a frequency dict. When empty or
				None, self.langs_to_check is used instead.
			txt: text to sample n-grams from (see pick_ngrams).
			what_grams: n-gram size (see pick_ngrams).
			how_many: number of n-grams to pick (see pick_ngrams).

		Returns:
			A list with one probability list per entry of langs.

		Raises:
			TypeError, ValueError: propagated from pick_ngrams and
				gimme_probabilities.
		"""
		# NOTE(review): self.langs_to_check holds language codes (strings),
		# which gimme_probabilities rejects with a TypeError; loading the
		# matching frequency dicts (parse_freqs?) appears to be missing here
		# -- confirm the intended file naming before wiring it in.
		if not langs:
			# fallback
			langs = self.langs_to_check

		# only pick n-grams once per pass
		self.da_ngrams = self.pick_ngrams(what_grams, how_many, txt)
		return [
			self.gimme_probabilities(lang, self.da_ngrams)
			for lang in langs
		]


# folder prefix that da_detector.parse_freqs prepends to the file name it is
# given; the two are concatenated as-is, hence the trailing slash
freqs_folder = "./freqs/"
# sample text, presumably meant as test input -- not used by any code
# visible here
test_str = "what freaking ever, nobody cares one bit of a heck"

# module-level detector instance, created with the default language list
detector = da_detector()

# vim: ff=unix noexpandtab