su_task3-nlp/da_detector.py

#!/usr/bin/env python3
"""
This file holds a class implementing a "detector", i.e. a language classifier
that learns from predefined, frequency-analysed sets of n-grams.
"""


class da_detector:
	def __init__(self, langs_to_check: list = ["sk", "en"]):
		# languages to check, to be picked from ["cz", "sk", "de", "en", "fr"]
		if not isinstance(langs_to_check, list):
			raise TypeError("not a list, bailing")
		if len(langs_to_check) < 2:
			raise ValueError("too few languages specified")
		self.langs_to_check = langs_to_check

	@staticmethod
	def rm_interpunction(data):
		from string import punctuation
		# strip punctuation from every n-gram; build a new list, since
		# reassigning the loop variable would not change the input strings
		return [ngram.translate(str.maketrans('', '', punctuation))
			for ngram in data]

	@staticmethod
	def rm_digits(data):
		from string import digits
		# str.translate(None, digits) is Python 2 syntax; in Python 3 a
		# translation table is needed, and the cleaned copies are returned
		return [ngram.translate(str.maketrans('', '', digits))
			for ngram in data]
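
	# Neither helper is called elsewhere in this file; presumably they are
	# meant to normalise picked n-grams before frequency lookup, e.g.
	# (hypothetical usage, not part of the original flow):
	#   cleaned = da_detector.rm_digits(
	#       da_detector.rm_interpunction(["he1,", "llo", "o, w"]))
	#   # -> ["he", "llo", "o w"]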

	def parse_freqs(self, path: str):
		import json
		# freqs_folder is a module-level setting defined below
		fullpath = freqs_folder + path
		with open(fullpath, 'r') as f:
			j_data = f.read()
		return json.loads(j_data)
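
	# Note: the frequency files themselves are not included here; they are
	# assumed to be JSON objects mapping n-grams to relative frequencies,
	# e.g. {"th": 0.027, "he": 0.023, ...} (an assumption, the real files
	# may be shaped differently).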

	def pick_ngrams(self, what_grams: int, how_many: int, text: str):
		from random import randint
		if not isinstance(what_grams, int):
			raise TypeError("what_grams has to be an int")
		if not isinstance(how_many, int):
			raise TypeError("how_many has to be an int")
		if not isinstance(text, str):
			raise TypeError("text has to be a str")
		if what_grams <= 0:
			raise ValueError("this is bogus, give me a number from 1 to 5")
		elif what_grams > 5:
			raise ValueError("not doing n-grams larger than 5")
		if how_many <= 0:
			raise ValueError("how_many ought to be larger than 0")
		if len(text) <= 10:
			raise ValueError("not doing anything with text shorter than 10 characters")
		t_len = len(text)
		# list of random n-grams
		r_ngrams = []
		# how many times to attempt to skin the cat, dynamically set depending
		# on total length of the subject text examined
		insanity_threshold = t_len * 20
		sanity_ctr = 0
		while len(r_ngrams) < how_many and sanity_ctr < insanity_threshold:
			# count every attempt up front so insanity_threshold actually
			# bounds the loop
			sanity_ctr += 1
			# not truly random, but hey..
			r_position = randint(0, t_len - 1)
			if r_position + what_grams >= t_len - 1:
				continue
			candidate_ngram = text[r_position:r_position + what_grams]
			if candidate_ngram not in r_ngrams:
				r_ngrams.append(candidate_ngram)
		print(r_ngrams)
		return r_ngrams


# folder holding the per-language frequency JSON files
freqs_folder = "./freqs/"

test_str = "what freaking ever, nobody cares one bit of a heck"
detector = da_detector()
detector.pick_ngrams(
	what_grams=3,
	how_many=10,
	text=test_str)
detector.pick_ngrams(
	what_grams=2,
	how_many=10,
	text=test_str)
sk_json = detector.parse_freqs("sk.json")
cz_json = detector.parse_freqs("cz.json")
de_json = detector.parse_freqs("de.json")
en_json = detector.parse_freqs("en.json")
fr_json = detector.parse_freqs("fr.json")
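
# The classification step itself is not implemented in this file. A minimal
# sketch of how it could look, assuming the frequency tables map n-grams to
# numeric relative frequencies (the same assumption noted near parse_freqs):
# score each candidate language by summing the frequencies of the picked
# n-grams and return the best-scoring one. The name guess_language is
# hypothetical and not part of the original class.
def guess_language(picked_ngrams, freq_tables):
	scores = {}
	for lang, table in freq_tables.items():
		# n-grams missing from a table simply contribute nothing
		scores[lang] = sum(table.get(ngram, 0.0) for ngram in picked_ngrams)
	return max(scores, key=scores.get)

# hypothetical usage with the tables loaded above:
# print(guess_language(detector.pick_ngrams(3, 10, test_str),
#                      {"sk": sk_json, "cz": cz_json, "de": de_json,
#                       "en": en_json, "fr": fr_json}))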
# vim: ff=unix noexpandtab