fix(text parsing): actually perform clean-up

This commit is contained in:
surtur 2021-12-20 05:21:00 +01:00
parent e2ca88c247
commit 4ee9cf5af6
Signed by: wanderer
GPG Key ID: 19CE1EC1D9E0486D

@ -20,24 +20,35 @@ class da_detector:
raise ValueError("too few languages specified") raise ValueError("too few languages specified")
self.langs_to_check = ["sk", "en"] self.langs_to_check = ["sk", "en"]
def rm_interpunction(self, data: str) -> str:
    """Return ``data`` with all punctuation characters removed.

    Only the characters listed in ``string.punctuation`` are stripped;
    every other character is kept unchanged.

    Args:
        data: The text to clean.

    Returns:
        A new string without the punctuation characters.
    """
    from string import punctuation

    # A translation table whose third argument lists characters to delete
    # strips them all in a single pass. Errors (e.g. a non-string argument)
    # propagate to the caller unchanged, so no try/except wrapper is needed.
    return data.translate(str.maketrans('', '', punctuation))
def rm_newlines(self, data: str) -> str:
    """Return ``data`` with every line-feed character removed.

    The line feed is deleted, not replaced, so text on adjacent lines is
    joined directly. Only ``'\\n'`` is handled; other characters such as
    ``'\\r'`` are left in place.

    Args:
        data: The text to clean.

    Returns:
        A new string without ``'\\n'`` characters.
    """
    # Errors (e.g. a non-string argument) propagate to the caller unchanged,
    # so no try/except wrapper is needed.
    return data.replace('\n', '')
def rm_digits(self, data: str) -> str:
    """Return ``data`` with the digits ``0``-``9`` removed.

    Only the characters listed in ``string.digits`` are stripped; every
    other character is kept unchanged.

    Args:
        data: The text to clean.

    Returns:
        A new string without the digit characters.
    """
    from string import digits

    # A translation table whose third argument lists characters to delete
    # strips them all in a single pass. Errors (e.g. a non-string argument)
    # propagate to the caller unchanged, so no try/except wrapper is needed.
    return data.translate(str.maketrans('', '', digits))
def text_clean_up(self, data: str):
    """Run the full clean-up pipeline over ``data`` and return the result.

    The steps are applied in a fixed order: punctuation removal, then digit
    removal, then newline removal.

    Args:
        data: The raw text to clean.

    Returns:
        The text after all three clean-up steps.
    """
    without_punctuation = self.rm_interpunction(data)
    without_digits = self.rm_digits(without_punctuation)
    return self.rm_newlines(without_digits)
def parse_freqs(self, path: str): def parse_freqs(self, path: str):
import json import json
@ -72,6 +83,11 @@ class da_detector:
if (len(text) <= 10): if (len(text) <= 10):
raise ValueError("not doing anything with text shorter than 10 characters") raise ValueError("not doing anything with text shorter than 10 characters")
try:
text = self.text_clean_up(text)
except Exception as e:
raise e
t_len = len(text) t_len = len(text)
# list of random n-grams # list of random n-grams
r_ngrams = [] r_ngrams = []