#!/usr/bin/env python3
"""
this file holds a class, which implements a "detector" - language classifier -
that learns from predefined, frequency-analysed sets of ngrams
"""
class da_detector:
def __init__(self, langs_to_check: list = ["sk", "en"]):
# langs to check
# to be picked from ["cz", "sk", "de", "en", "fr"]
self.da_ngrams = []
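# defaults: use trigrams and sample 30 of them per detection pass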
self.what_grams = 3
self.how_many = 30
self.freqs_folder = "./freqs/"
if not isinstance(langs_to_check, list):
raise TypeError("not a list, bailing")
if (len(langs_to_check) < 2):
raise ValueError("too few languages specified")
self.langs_to_check = ["sk", "en"]
def rm_interpunction(self, data):
from string import punctuation
try:
data = data.translate(str.maketrans('', '', punctuation))
except Exception as e:
raise e
return data
def rm_newlines(self, data):
try:
data = data.replace('\n', ' ')
except Exception as e:
raise e
return data
def rm_digits(self, data):
from string import digits
try:
data = data.translate(str.maketrans('', '', digits))
except Exception as e:
raise e
return data
def text_clean_up(self, data: str):
data = self.rm_interpunction(data)
data = self.rm_digits(data)
data = self.rm_newlines(data)
return data
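# NOTE (assumption, not verified against the actual data files): each file in
# self.freqs_folder is expected to be a JSON object mapping n-grams to relative
# frequencies, e.g. {"the": 0.0123, "he ": 0.0098}, since gimme_probabilities
# looks n-grams up with dict.get()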
2021-12-16 04:59:27 +01:00
def parse_freqs(self, path: str):
import json
fullpath = self.freqs_folder + path
try:
with open(fullpath, 'r') as f:
j_data = f.read()
except Exception as e:
raise e
try:
obj = json.loads(j_data)
except Exception as e:
raise e
return obj
def pick_ngrams(self, what_grams: int, how_many: int, text: str):
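# illustrative example (output is random, so only the shape is guaranteed):
#   pick_ngrams(3, 4, "the quick brown fox") could yield
#   ["he ", "qui", "bro", "own"]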
from random import randint
if not isinstance(what_grams, int):
raise TypeError("what_grams has to be an int")
if not isinstance(how_many, int):
raise TypeError("how_many has to be an int")
if not isinstance(text, str):
raise TypeError("text has to be a str")
if (what_grams <= 0):
raise ValueError("this is bogus, give me a number from ")
elif (what_grams > 5):
raise ValueError("not doing larger-grams than 5")
if (how_many <= 0):
raise ValueError("how_many ought to be larger than 0")
if (len(text) <= 10):
raise ValueError("not doing anything with text shorter than 10 characters")
try:
text = self.text_clean_up(text)
except Exception as e:
raise e
t_len = len(text)
# list of random n-grams
r_ngrams = []
# how many times to attempt to skin the cat, dynamically set depending
# on total length of the subject text examined
insanity_threshold = t_len * 20
sanity_ctr = 0
while (len(r_ngrams) < how_many and sanity_ctr < insanity_threshold):
# not truly random, but hey..
r_position = randint(0, t_len - 1)
if (r_position + what_grams >= (t_len - 1)):
continue
# this is the block where we start counting how many times we've
# been there
sanity_ctr += 1
candidate_ngram = text[r_position:r_position + what_grams]
if (candidate_ngram not in r_ngrams):
r_ngrams.append(candidate_ngram)
return r_ngrams
# return a probability list for a list of ngrams and a given language
def gimme_probabilities(self, lang_probs: dict, ngrams: list):
if not isinstance(lang_probs, dict):
raise TypeError("lang_probs has to be a dict")
if not isinstance(ngrams, list):
raise TypeError("ngrams has to be a list")
if len(lang_probs) == 0:
raise ValueError("empty lang_probs dict")
if len(ngrams) == 0:
raise ValueError("empty ngrams list")
# may contain None values if not found, hence uncleansed
uncleansed_probabilities = []
try:
for ngram in ngrams:
uncleansed_probabilities.append(lang_probs.get(ngram))
cleansed_probabs = self.replace_nones(uncleansed_probabilities)
except Exception as e:
raise e
return cleansed_probabs
def replace_nones(self, probabilities: list):
if not isinstance(probabilities, list):
raise TypeError("not a list, bailing")
if len(probabilities) == 0:
raise ValueError("empty list, bailing")
# a pretty good bogus probability is one reasonably close™ to zero
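# e.g. [0.01, None, 0.003] becomes [0.01, 1e-09, 0.003]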
return [0.000000001 if n is None else n for n in probabilities]
# return proper probabilities for multiple languages
def gimme_probabs_multi_lang(
self,
langs: list,
txt: str,
what_grams: int,
how_many: int
):
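# note: "langs" is expected to be a list of already-parsed frequency dicts
# (as returned by parse_freqs), not a list of language codes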
if len(langs) == 0:
# fallback: load the frequency tables for the default languages
langs = [self.parse_freqs(lang + ".json") for lang in self.langs_to_check]
probabs = []
try:
# only pick n-grams once per pass
self.da_ngrams = self.pick_ngrams(what_grams, how_many, txt)
for lang in langs:
probabs.append(
self.gimme_probabilities(
lang,
self.da_ngrams
)
)
except Exception as e:
raise e
return probabs
def attempt_detection(self, probabs: list, lang_list: list):
import numpy as np
if not isinstance(probabs, list) or not isinstance(lang_list, list):
raise TypeError("not a list, bailing")
elif len(probabs) == 0 or len(lang_list) == 0:
raise ValueError("empty list, bailing")
sums = []
absolute_sums = []
what_we_came_up_with = []
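# scoring sketch: for each language, sum the log-probabilities of the sampled
# n-grams (a log-likelihood, always negative here); 1/abs(sum) then gives the
# most likely language the largest score, and dividing by the total turns the
# scores into values that sum to one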
try:
for probab in probabs:
sums.append(np.sum(np.log(probab)))
for da_sum in sums:
absolute_sums.append(1 / abs(da_sum))
for da_abs_sum in absolute_sums:
what_we_came_up_with.append(da_abs_sum / sum(absolute_sums))
except Exception as e:
raise e
print("[*] languages considered: ", end="")
print(lang_list)
print("[*] probabilities: ", end="")
print(what_we_came_up_with)
max_prob = max(what_we_came_up_with)
try:
final_results = {}
for i in range(len(lang_list)):
final_results[lang_list[i]] = what_we_came_up_with[i]
da_winner = max(final_results, key=final_results.get)
except Exception as e:
raise e
print("[*] the winner is: " + da_winner + " with probability ", end="")
print(max_prob)
return final_results
# this is the method that pulls it all together
def what_this(self, unknown_text: str, langs_to_check: list):
if not isinstance(unknown_text, str):
raise TypeError("not a string, bailing")
if not isinstance(langs_to_check, list):
raise TypeError("not a list, bailing")
if (len(unknown_text) <= 10):
raise ValueError("not doing anything with text shorter than 10 characters")
if len(langs_to_check) == 0:
print("[i] since no lang preference was provided we are falling "
+ "back to picking from ", end="")
print(self.langs_to_check)
print()
langs_to_check = self.langs_to_check
elif (len(langs_to_check) == 1):
raise ValueError("at least two languages are needed")
print("[*] unknown text about to be checked:")
print(unknown_text)
print()
the_real_langs_to_check = []
try:
# a naïve approach, works for this project
for lang in langs_to_check:
the_real_langs_to_check.append(
self.parse_freqs(lang + ".json")
)
except Exception as e:
raise e
try:
multi_lang_probabs = self.gimme_probabs_multi_lang(
the_real_langs_to_check,
unknown_text, self.what_grams, self.how_many
)
res = self.attempt_detection(multi_lang_probabs, langs_to_check)
print("[*] recap: ", end="")
print(res)
print()
except Exception as e:
raise e
return res
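# minimal usage sketch (an assumption, not part of the original project): it
# presumes frequency files such as ./freqs/sk.json and ./freqs/en.json exist
# in the format described above; the sample text is illustrative only
if __name__ == "__main__":
    sample_text = (
        "The quick brown fox jumps over the lazy dog and keeps on running "
        "through the fields until the sun finally goes down."
    )
    detector = da_detector(["sk", "en"])
    detector.what_this(sample_text, ["sk", "en"])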
# vim: ff=unix noexpandtab