This repository has been archived on 2022-02-08. You can view files and clone it, but cannot push or open issues or pull requests.
su_task3-nlp/da_detector.py

108 lines
2.7 KiB
Python
Raw Normal View History

2021-12-16 04:36:46 +01:00
#!/usr/bin/env python3
2021-12-16 02:12:42 +01:00
"""
this file holds a class, which implements a "detector" - language classifier -
that learns from predefined, frequency-analysed sets of ngrams
"""
class da_detector:
    """Language "detector" (classifier) helper built around n-grams.

    The class bundles small utilities: cleaning collections of n-grams,
    loading JSON frequency files and sampling random n-grams from a text.
    """

    def __init__(self, langs_to_check=None):
        """Remember which languages should be considered.

        Args:
            langs_to_check: language codes to check, to be picked from
                ["cz", "sk", "de", "en", "fr"]. Defaults to ["sk", "en"].
        """
        # A fresh list is created per instance; a list literal used as the
        # default value would be shared (and mutable) across all instances.
        if langs_to_check is None:
            langs_to_check = ["sk", "en"]
        self.langs_to_check = langs_to_check

    @staticmethod
    def rm_interpunction(data):
        """Return a list of the n-grams in *data* with punctuation removed.

        Only the ASCII characters in ``string.punctuation`` are stripped.
        Strings are immutable, so a new list is built and *data* itself is
        left untouched.

        Args:
            data: iterable of ``str`` n-grams.

        Returns:
            list of ``str`` in the same order as *data*.
        """
        from string import punctuation

        # build the translation table once instead of once per n-gram
        table = str.maketrans('', '', punctuation)
        return [ngram.translate(table) for ngram in data]

    @staticmethod
    def rm_digits(data):
        """Return a list of the n-grams in *data* with ASCII digits removed.

        Args:
            data: iterable of ``str`` n-grams.

        Returns:
            list of ``str`` in the same order as *data*.
        """
        from string import digits

        # str.translate takes a single table argument in Python 3; the
        # two-argument ``translate(None, digits)`` form raises TypeError.
        table = str.maketrans('', '', digits)
        return [ngram.translate(table) for ngram in data]

    def parse_freqs(self, path: str):
        """Load and decode a JSON frequency file.

        The file name is appended to the module-level ``freqs_folder``
        prefix, which has to be defined before this method is called.

        Args:
            path: file name relative to ``freqs_folder``.

        Returns:
            The decoded JSON document.

        Raises:
            OSError: the file cannot be opened or read.
            json.JSONDecodeError: the file content is not valid JSON.
        """
        import json

        fullpath = freqs_folder + path
        # explicit encoding so that non-ASCII n-grams do not depend on the
        # platform's default locale
        with open(fullpath, 'r', encoding='utf-8') as f:
            return json.load(f)

    def pick_ngrams(self, what_grams: int, how_many: int, text: str):
        """Pick up to *how_many* distinct random n-grams from *text*.

        Sampling stops once enough distinct n-grams were collected or after
        a fixed number of attempts, so fewer than *how_many* items may be
        returned (e.g. when the text does not hold that many distinct
        n-grams). The result is also printed to stdout.

        Args:
            what_grams: n-gram length, 1 to 5 inclusive.
            how_many: number of distinct n-grams requested, at least 1.
            text: source text; must be longer than 10 characters.

        Returns:
            list of distinct n-grams in the order they were drawn.

        Raises:
            TypeError: an argument has the wrong type.
            ValueError: an argument is out of the accepted range.
        """
        from random import randint

        if not isinstance(what_grams, int):
            raise TypeError("what_grams has to be an int")
        if not isinstance(how_many, int):
            raise TypeError("how_many has to be an int")
        if not isinstance(text, str):
            raise TypeError("text has to be a str")

        if what_grams <= 0:
            raise ValueError("this is bogus, give me a number from ℕ")
        elif what_grams > 5:
            raise ValueError("not doing larger-grams than 5")
        if how_many <= 0:
            raise ValueError("how_many ought to be larger than 0")
        # note: a text of exactly 10 characters is rejected as well
        if len(text) <= 10:
            raise ValueError("not doing anything with text shorter than 10 characters")

        # last index at which a full n-gram still fits into the text; drawing
        # only from [0, last_start] makes every n-gram (including the final
        # one) reachable and never yields a truncated slice
        last_start = len(text) - what_grams

        # list of random n-grams (keeps draw order) + set for O(1) dedup
        r_ngrams = []
        seen = set()

        # how many times to attempt to skin the cat; bounding the attempts
        # guarantees termination when the text cannot supply enough distinct
        # n-grams (Python has no ``++`` operator, ``++x`` is a no-op)
        insanity_threshold = 1000
        for _ in range(insanity_threshold):
            if len(r_ngrams) >= how_many:
                break
            # not truly random, but hey..
            r_position = randint(0, last_start)
            candidate_ngram = text[r_position:r_position + what_grams]
            if candidate_ngram not in seen:
                seen.add(candidate_ngram)
                r_ngrams.append(candidate_ngram)

        print(r_ngrams)
        return r_ngrams
2021-12-16 04:35:13 +01:00
# Driver script: exercises the detector on a sample sentence and loads the
# per-language frequency files.

# prefix that da_detector.parse_freqs puts in front of every file name
freqs_folder = "./freqs/"
test_str = "what freaking ever, nobody cares one bit of a heck"

detector = da_detector()

# sample ten trigrams first, then ten bigrams, from the same sentence
detector.pick_ngrams(what_grams=3, how_many=10, text=test_str)
detector.pick_ngrams(what_grams=2, how_many=10, text=test_str)

# frequency tables are loaded in the order sk, cz, de, en, fr
sk_json, cz_json, de_json, en_json, fr_json = (
    detector.parse_freqs(code + ".json")
    for code in ("sk", "cz", "de", "en", "fr")
)
2021-12-16 04:35:13 +01:00
# vim: ff=unix noexpandtab