27 lines
626 B
Python
27 lines
626 B
Python
"""
|
|
this file holds a class, which implements a "detector" - language classifier -
|
|
that learns from predefined, frequency-analysed sets of ngrams
|
|
"""
|
|
|
|
|
|
class da_detector:
    """Language-detector scaffold with n-gram clean-up helpers.

    The code in this class stores the list of language codes to consider
    and offers two helpers that strip punctuation or digits from a
    collection of n-gram strings.

    Attributes:
        langs_to_check: list of language codes chosen at construction time.
    """

    # Languages used when the caller does not supply a selection.
    DEFAULT_LANGS = ("cz", "sk", "de", "en", "fr")

    def __init__(self, langs_to_check=None):
        """Store the language codes this detector should consider.

        Args:
            langs_to_check: iterable of language codes, or a single code
                given as a string. ``None`` (the default) selects
                ``DEFAULT_LANGS``.
        """
        if langs_to_check is None:
            langs_to_check = self.DEFAULT_LANGS
        elif isinstance(langs_to_check, str):
            # A bare string would otherwise be split into characters.
            langs_to_check = (langs_to_check,)
        # Copy so later changes to the caller's sequence do not leak in.
        self.langs_to_check = list(langs_to_check)

    @staticmethod
    def rm_interpunction(data):
        """Return the n-grams of *data* with ASCII punctuation removed.

        Args:
            data: iterable of strings.

        Returns:
            A new list, in input order, where every character listed in
            ``string.punctuation`` has been deleted from each string.
            Strings that consisted only of punctuation become ``""``.

        Raises:
            TypeError: if *data* is not iterable.
            AttributeError: if an element has no ``translate`` method.
        """
        import string

        # Build the deletion table once instead of once per n-gram.
        table = str.maketrans("", "", string.punctuation)
        return [ngram.translate(table) for ngram in data]

    @staticmethod
    def rm_digits(data):
        """Return the n-grams of *data* with ASCII digits removed.

        Args:
            data: iterable of strings.

        Returns:
            A new list, in input order, where every character listed in
            ``string.digits`` has been deleted from each string.
            Strings that consisted only of digits become ``""``.

        Raises:
            TypeError: if *data* is not iterable.
            AttributeError: if an element has no ``translate`` method.
        """
        from string import digits

        # str.translate takes a single table argument on Python 3; the
        # two-argument ``translate(None, digits)`` form is Python 2 only.
        table = str.maketrans("", "", digits)
        return [ngram.translate(table) for ngram in data]
|