fix(text parsing): actually perform clean-up
This commit is contained in:
parent
e2ca88c247
commit
4ee9cf5af6
@ -20,24 +20,35 @@ class da_detector:
|
||||
raise ValueError("too few languages specified")
|
||||
self.langs_to_check = ["sk", "en"]
|
||||
|
||||
def rm_interpunction(self, data):
    """Return ``data`` with all ASCII punctuation characters removed.

    The characters stripped are exactly those in ``string.punctuation``;
    every other character (letters, digits, whitespace, non-ASCII
    punctuation) is kept unchanged.

    Args:
        data: The text to clean; must be a ``str``.

    Returns:
        A new string without the punctuation characters.
    """
    from string import punctuation

    # Translate the whole string in one pass. Iterating over the input and
    # rebinding the loop variable would discard every result, leaving the
    # text untouched.
    return data.translate(str.maketrans('', '', punctuation))
|
||||
|
||||
def rm_digits(data):
|
||||
from string import digits
|
||||
def rm_newlines(self, data):
    """Return ``data`` with every line-feed character removed.

    Only ``'\\n'`` is removed and nothing is inserted in its place, so the
    text on either side of a line break is joined directly. Carriage
    returns (``'\\r'``) are left as they are.

    Args:
        data: The text to clean; must be a ``str``.

    Returns:
        A new string containing no ``'\\n'`` characters.
    """
    return data.replace('\n', '')
|
||||
|
||||
def rm_digits(self, data):
    """Return ``data`` with the ASCII digits ``0``-``9`` removed.

    The characters stripped are exactly those in ``string.digits``; digit
    characters from other scripts are not affected.

    Args:
        data: The text to clean; must be a ``str``.

    Returns:
        A new string without ASCII digit characters.
    """
    from string import digits

    # No try/except here: catching an exception only to re-raise it
    # unchanged adds nothing, so errors simply propagate to the caller.
    return data.translate(str.maketrans('', '', digits))
|
||||
|
||||
def text_clean_up(self, data: str):
    """Run the text through the clean-up steps and return the result.

    The steps are applied in a fixed order, each one receiving the output
    of the previous one: ``rm_interpunction``, then ``rm_digits``, then
    ``rm_newlines``.
    """
    pipeline = (self.rm_interpunction, self.rm_digits, self.rm_newlines)
    for step in pipeline:
        data = step(data)
    return data
|
||||
|
||||
def parse_freqs(self, path: str):
|
||||
import json
|
||||
|
||||
@ -72,6 +83,11 @@ class da_detector:
|
||||
if (len(text) <= 10):
|
||||
raise ValueError("not doing anything with text shorter than 10 characters")
|
||||
|
||||
try:
|
||||
text = self.text_clean_up(text)
|
||||
except Exception as e:
|
||||
raise e
|
||||
|
||||
t_len = len(text)
|
||||
# list of random n-grams
|
||||
r_ngrams = []
|
||||
|
Reference in New Issue
Block a user