fix(text parsing): actually perform clean-up
This commit is contained in:
parent
e2ca88c247
commit
4ee9cf5af6
@ -20,24 +20,35 @@ class da_detector:
|
|||||||
raise ValueError("too few languages specified")
|
raise ValueError("too few languages specified")
|
||||||
self.langs_to_check = ["sk", "en"]
|
self.langs_to_check = ["sk", "en"]
|
||||||
|
|
||||||
def rm_interpunction(data):
|
def rm_interpunction(self, data):
|
||||||
from string import punctuation
|
from string import punctuation
|
||||||
for ngram in data:
|
|
||||||
try:
|
|
||||||
ngram = ngram.translate(str.maketrans('', '', punctuation))
|
|
||||||
except Exception as e:
|
|
||||||
raise e
|
|
||||||
return data
|
|
||||||
|
|
||||||
def rm_digits(data):
|
|
||||||
from string import digits
|
|
||||||
try:
|
try:
|
||||||
for ngram in data:
|
data = data.translate(str.maketrans('', '', punctuation))
|
||||||
ngram = ngram.translate(None, digits)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise e
|
raise e
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
def rm_newlines(self, data):
|
||||||
|
try:
|
||||||
|
data = data.replace('\n', '')
|
||||||
|
except Exception as e:
|
||||||
|
raise e
|
||||||
|
return data
|
||||||
|
|
||||||
|
def rm_digits(self, data):
|
||||||
|
from string import digits
|
||||||
|
try:
|
||||||
|
data = data.translate(str.maketrans('', '', digits))
|
||||||
|
except Exception as e:
|
||||||
|
raise e
|
||||||
|
return data
|
||||||
|
|
||||||
|
def text_clean_up(self, data: str):
|
||||||
|
data = self.rm_interpunction(data)
|
||||||
|
data = self.rm_digits(data)
|
||||||
|
data = self.rm_newlines(data)
|
||||||
|
return data
|
||||||
|
|
||||||
def parse_freqs(self, path: str):
|
def parse_freqs(self, path: str):
|
||||||
import json
|
import json
|
||||||
|
|
||||||
@ -72,6 +83,11 @@ class da_detector:
|
|||||||
if (len(text) <= 10):
|
if (len(text) <= 10):
|
||||||
raise ValueError("not doing anything with text shorter than 10 characters")
|
raise ValueError("not doing anything with text shorter than 10 characters")
|
||||||
|
|
||||||
|
try:
|
||||||
|
text = self.text_clean_up(text)
|
||||||
|
except Exception as e:
|
||||||
|
raise e
|
||||||
|
|
||||||
t_len = len(text)
|
t_len = len(text)
|
||||||
# list of random n-grams
|
# list of random n-grams
|
||||||
r_ngrams = []
|
r_ngrams = []
|
||||||
|
Reference in New Issue
Block a user