From 4ee9cf5af61fa967e795b6011a9bf8d8bc004982 Mon Sep 17 00:00:00 2001 From: surtur Date: Mon, 20 Dec 2021 05:21:00 +0100 Subject: [PATCH] fix(text parsing): actually perform clean-up --- da_detector.py | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/da_detector.py b/da_detector.py index 5ca86e1..a88ecc8 100755 --- a/da_detector.py +++ b/da_detector.py @@ -20,24 +20,35 @@ class da_detector: raise ValueError("too few languages specified") self.langs_to_check = ["sk", "en"] - def rm_interpunction(data): + def rm_interpunction(self, data): from string import punctuation - for ngram in data: - try: - ngram = ngram.translate(str.maketrans('', '', punctuation)) - except Exception as e: - raise e - return data - - def rm_digits(data): - from string import digits try: - for ngram in data: - ngram = ngram.translate(None, digits) + data = data.translate(str.maketrans('', '', punctuation)) except Exception as e: raise e return data + def rm_newlines(self, data): + try: + data = data.replace('\n', '') + except Exception as e: + raise e + return data + + def rm_digits(self, data): + from string import digits + try: + data = data.translate(str.maketrans('', '', digits)) + except Exception as e: + raise e + return data + + def text_clean_up(self, data: str): + data = self.rm_interpunction(data) + data = self.rm_digits(data) + data = self.rm_newlines(data) + return data + def parse_freqs(self, path: str): import json @@ -72,6 +83,11 @@ class da_detector: if (len(text) <= 10): raise ValueError("not doing anything with text shorter than 10 characters") + try: + text = self.text_clean_up(text) + except Exception as e: + raise e + t_len = len(text) # list of random n-grams r_ngrams = []