su_task3-nlp/da_detector.py

#!/usr/bin/env python3
"""
this file holds a class, which implements a "detector" - language classifier -
that learns from predefined, frequency-analysed sets of ngrams
"""


class da_detector:
	"""
	language "detector" (classifier) working with frequency-analysed n-gram
	tables.

	the class offers helpers to clean lists of n-grams, to load a frequency
	table from a json file and to sample random n-grams from a text.
	"""

	def __init__(self, langs_to_check=None):
		"""
		:param langs_to_check: languages to consider, to be picked from
			["cz", "sk", "de", "en", "fr"]; defaults to ["sk", "en"]
		"""
		# build the default per instance - a list literal in the signature
		# would be one shared object mutated by every instance using it
		if langs_to_check is None:
			langs_to_check = ["sk", "en"]
		self.langs_to_check = langs_to_check

	@staticmethod
	def rm_interpunction(data):
		"""
		return a new list with ascii punctuation (string.punctuation) removed
		from every n-gram in data.

		:param data: iterable of str n-grams
		:return: list of cleaned n-grams, same order and length as data
		"""
		from string import punctuation

		# str objects are immutable, so the cleaned strings have to be
		# collected; rebinding the loop variable would change nothing
		table = str.maketrans('', '', punctuation)
		return [ngram.translate(table) for ngram in data]

	@staticmethod
	def rm_digits(data):
		"""
		return a new list with ascii digits (string.digits) removed from every
		n-gram in data.

		:param data: iterable of str n-grams
		:return: list of cleaned n-grams, same order and length as data
		"""
		from string import digits

		# python 3 str.translate() takes a single table argument; the
		# two-argument translate(None, chars) form only existed in python 2
		table = str.maketrans('', '', digits)
		return [ngram.translate(table) for ngram in data]

	def parse_freqs(self, path: str):
		"""
		load a json frequency table.

		:param path: file name relative to the module-level freqs_folder
		:return: the deserialised json document
		:raises OSError: when the file cannot be opened or read
		:raises json.JSONDecodeError: when the content is not valid json
		"""
		import json
		import os.path

		# freqs_folder is a module-level setting defined next to the class
		fullpath = os.path.join(freqs_folder, path)
		# explicit encoding: the tables hold non-ascii characters and the
		# platform default is not utf-8 everywhere
		with open(fullpath, 'r', encoding='utf-8') as f:
			return json.load(f)

	def pick_ngrams(self, what_grams: int, how_many: int, text: str,
			max_attempts: int = 1000) -> list:
		"""
		pick up to how_many distinct n-grams from random positions of text.

		the collected list is printed and returned. fewer than how_many items
		come back when max_attempts draws did not yield enough distinct
		n-grams (e.g. the text does not contain that many).

		:param what_grams: size of the n-grams, 1 to 5
		:param how_many: number of distinct n-grams wanted, > 0
		:param text: text to sample from, longer than 10 characters
		:param max_attempts: upper bound on random draws
		:return: list of distinct n-grams in the order they were drawn
		:raises TypeError: when an argument has the wrong type
		:raises ValueError: when an argument is out of range
		"""
		from random import randint
		if not isinstance(what_grams, int):
			raise TypeError("what_grams has to be an int")
		if not isinstance(how_many, int):
			raise TypeError("how_many has to be an int")
		if not isinstance(text, str):
			raise TypeError("text has to be a str")

		if (what_grams <= 0):
			raise ValueError("this is bogus, give me a number from ℕ")
		elif (what_grams > 5):
			raise ValueError("not doing larger-grams than 5")
		if (how_many <= 0):
			raise ValueError("how_many ought to be larger than 0")
		if (len(text) <= 10):
			raise ValueError("not doing anything with text shorter than 10 characters")

		# last index at which a complete n-gram still fits; drawing only from
		# 0..last_start makes every draw valid and includes the tail of text
		last_start = len(text) - what_grams
		# list of random n-grams (keeps draw order) + set for fast lookups
		r_ngrams = []
		seen = set()
		attempts = 0

		while (len(r_ngrams) < how_many and attempts < max_attempts):
			# python has no ++ operator (++x is just +(+x)), hence +=
			attempts += 1
			# not truly random, but hey..
			r_position = randint(0, last_start)
			candidate_ngram = text[r_position:r_position + what_grams]
			if (candidate_ngram not in seen):
				seen.add(candidate_ngram)
				r_ngrams.append(candidate_ngram)
		print(r_ngrams)
		return r_ngrams


# folder with the per-language frequency tables; da_detector.parse_freqs()
# reads this module-level name
freqs_folder = "./freqs/"
test_str = "what freaking ever, nobody cares one bit of a heck"

detector = da_detector()

# sample ten trigrams, then ten bigrams, from the test string
detector.pick_ngrams(what_grams=3, how_many=10, text=test_str)
detector.pick_ngrams(what_grams=2, how_many=10, text=test_str)

# load the frequency tables for sk, cz, de, en and fr, in that order
sk_json, cz_json, de_json, en_json, fr_json = (
	detector.parse_freqs(table)
	for table in ("sk.json", "cz.json", "de.json", "en.json", "fr.json")
)

# vim: ff=unix noexpandtab
-												chmod +x, add #! entrypoint

											
										
										
											2021-12-16 04:36:46 +01:00
+								#!/usr/bin/env python3
-												initial commit

											
										
										
											2021-12-16 02:12:42 +01:00
+								"""
 								this file holds a class, which implements a "detector" - language classifier -
 								that learns from predefined, frequency-analysed sets of ngrams
 								"""
 								class da_detector:
-												fix: by default pick from at least two languages

											
										
										
											2021-12-20 00:45:05 +01:00
+									def __init__(self, langs_to_check=["sk", "en"]):
-												initial commit

											
										
										
											2021-12-16 02:12:42 +01:00
+										# langs to check
-												da_detector: have a fallback param value of ["sk"]

											
										
										
											2021-12-16 04:27:50 +01:00
+										# to be picked from ["cz", "sk", "de", "en", "fr"]
 										self.langs_to_check = langs_to_check
-												initial commit

											
										
										
											2021-12-16 02:12:42 +01:00
 									def rm_interpunction(data):
-												import just what's needed

											
										
										
											2021-12-16 03:13:46 +01:00
+										from string import punctuation
-												initial commit

											
										
										
											2021-12-16 02:12:42 +01:00
+										for ngram in data:
 											try:
-												import just what's needed

											
										
										
											2021-12-16 03:13:46 +01:00
+												ngram = ngram.translate(str.maketrans('', '', punctuation))
-												initial commit

											
										
										
											2021-12-16 02:12:42 +01:00
+											except Exception as e:
 												raise e
-												actually do return the processed data

											
										
										
											2021-12-16 03:14:07 +01:00
+										return data
-												initial commit

											
										
										
											2021-12-16 02:12:42 +01:00
 									def rm_digits(data):
 										from string import digits
-												optimise: mv the try block one level up

											
										
										
											2021-12-16 04:29:47 +01:00
+										try:
 											for ngram in data:
-												initial commit

											
										
										
											2021-12-16 02:12:42 +01:00
+												ngram = ngram.translate(None, digits)
-												optimise: mv the try block one level up

											
										
										
											2021-12-16 04:29:47 +01:00
+										except Exception as e:
 											raise e
-												actually do return the processed data

											
										
										
											2021-12-16 03:14:07 +01:00
+										return data
-												add parse_freqs() method

											
										
										
											2021-12-16 04:34:24 +01:00
-												add type hint to path

											
										
										
											2021-12-16 04:59:27 +01:00
+									def parse_freqs(self, path: str):
-												add parse_freqs() method

											
										
										
											2021-12-16 04:34:24 +01:00
+										import json
 										fullpath = freqs_folder + path
 										try:
 											with open(fullpath, 'r') as f:
 												j_data = f.read()
 										except Exception as e:
 											raise e
 										try:
 											obj = json.loads(j_data)
 										except Exception as e:
 											raise e
 										return obj
-												add vim metadata

											
										
										
											2021-12-16 04:35:13 +01:00
-												pick_ngrams: add param how_many + type checks

also add a test string test_str

											
										
										
											2021-12-17 17:24:04 +01:00
+									def pick_ngrams(self, what_grams: int, how_many: int, text: str):
-												pick_ngrams: complete the implementation

											
										
										
											2021-12-17 17:32:49 +01:00
+										from random import randint
-												chore: add base of pick_ngrams() method

											
										
										
											2021-12-16 05:06:13 +01:00
+										if not isinstance(what_grams, int):
 											raise TypeError("what_grams has to be an int")
-												pick_ngrams: add param how_many + type checks

also add a test string test_str

											
										
										
											2021-12-17 17:24:04 +01:00
+										if not isinstance(how_many, int):
 											raise TypeError("how_many has to be an int")
 										if not isinstance(text, str):
 											raise TypeError("text has to be a str")
-												chore: add base of pick_ngrams() method

											
										
										
											2021-12-16 05:06:13 +01:00
 										if (what_grams <= 0):
 											raise ValueError("this is bogus, give me a number from ℕ")
 										elif (what_grams > 5):
 											raise ValueError("not doing larger-grams than 5")
-												pick_ngrams: add param how_many + type checks

also add a test string test_str

											
										
										
											2021-12-17 17:24:04 +01:00
+										if (how_many <= 0):
 											raise ValueError("how_many ought to be larger than 0")
 										if (len(text) <= 10):
 											raise ValueError("not doing anything with text shorter than 10 characters")
-												chore: add base of pick_ngrams() method

											
										
										
											2021-12-16 05:06:13 +01:00
-												pick_ngrams: complete the implementation

											
										
										
											2021-12-17 17:32:49 +01:00
+										t_len = len(text)
 										# list of random n-grams
 										r_ngrams = []
 										# how many times to attempt to skin the cat
 										insanity_threshold = 1000
 										sanity_ctr = 0
 										while (len(r_ngrams) < how_many and sanity_ctr < insanity_threshold):
 											# not truly random, but hey..
 											r_position = randint(0, t_len - 1)
 											if (r_position + what_grams >= (t_len - 1)):
 												continue
 											# this is the block where we start counting how many times we've
 											# been there
 											++sanity_ctr
 											candidate_ngram = text[r_position:r_position + what_grams]
 											if (candidate_ngram not in r_ngrams):
 												r_ngrams.append(candidate_ngram)
 										print(r_ngrams)
-												chore: add base of pick_ngrams() method

											
										
										
											2021-12-16 05:06:13 +01:00
-												add vim metadata

											
										
										
											2021-12-16 04:35:13 +01:00
-												instantiate da_detector and load jsons

for reference purposes, add the json files

											
										
										
											2021-12-16 04:42:16 +01:00
+								freqs_folder = "./freqs/"
-												pick_ngrams: add param how_many + type checks

also add a test string test_str

											
										
										
											2021-12-17 17:24:04 +01:00
+								test_str = "what freaking ever, nobody cares one bit of a heck"
-												instantiate da_detector and load jsons

for reference purposes, add the json files

											
										
										
											2021-12-16 04:42:16 +01:00
+								detector = da_detector()
-												pick_ngrams: complete the implementation

											
										
										
											2021-12-17 17:32:49 +01:00
+								detector.pick_ngrams(
 									what_grams=3,
 									how_many=10,
 									text=test_str)
 								detector.pick_ngrams(
 									what_grams=2,
 									how_many=10,
 									text=test_str)
-												instantiate da_detector and load jsons

for reference purposes, add the json files

											
										
										
											2021-12-16 04:42:16 +01:00
 								sk_json = detector.parse_freqs("sk.json")
 								cz_json = detector.parse_freqs("cz.json")
 								de_json = detector.parse_freqs("de.json")
 								en_json = detector.parse_freqs("en.json")
 								fr_json = detector.parse_freqs("fr.json")
-												add vim metadata

											
										
										
											2021-12-16 04:35:13 +01:00
+								# vim: ff=unix noexpandtab