2021-12-16 04:36:46 +01:00
|
|
|
#!/usr/bin/env python3
|
2021-12-16 02:12:42 +01:00
|
|
|
"""
this file holds a class that implements a "detector" (a language classifier)
which learns from predefined, frequency-analysed sets of n-grams
"""
|
|
|
|
|
|
|
|
|
|
|
|
class da_detector:
    """Language "detector" (classifier) working on n-gram frequency data.

    The class keeps the list of languages to check and offers two n-gram
    clean-up helpers plus a loader for JSON frequency files.
    """

    def __init__(self, langs_to_check=None):
        """Remember which languages the detector should consider.

        Args:
            langs_to_check: language codes, to be picked from
                ["cz", "sk", "de", "en", "fr"]. Defaults to ["sk"].
        """
        # Build the default per call: a list literal in the signature would
        # be one shared object, mutated across every default-built instance.
        if langs_to_check is None:
            langs_to_check = ["sk"]
        self.langs_to_check = langs_to_check

    @staticmethod
    def rm_interpunction(data):
        """Return the n-grams of *data* with ASCII punctuation removed.

        Args:
            data: iterable of strings (n-grams).

        Returns:
            A new list holding one cleaned string per input element, in
            the original order. The input itself is not modified.

        Raises:
            AttributeError: if an element has no ``translate`` method.
        """
        from string import punctuation

        # Build the deletion table once instead of once per n-gram.
        table = str.maketrans('', '', punctuation)
        # Strings are immutable, so the cleaned values must be collected;
        # rebinding the loop variable would leave the result untouched.
        return [ngram.translate(table) for ngram in data]

    @staticmethod
    def rm_digits(data):
        """Return the n-grams of *data* with ASCII digits (0-9) removed.

        Args:
            data: iterable of strings (n-grams).

        Returns:
            A new list holding one cleaned string per input element, in
            the original order. The input itself is not modified.

        Raises:
            AttributeError: if an element has no ``translate`` method.
        """
        from string import digits

        # Python 3 ``str.translate`` takes a single mapping argument; the
        # two-argument ``translate(None, chars)`` form is Python 2 only.
        table = str.maketrans('', '', digits)
        return [ngram.translate(table) for ngram in data]

    def parse_freqs(self, path, folder=None):
        """Load one JSON frequency file and return the decoded object.

        Args:
            path: file name relative to the frequency folder.
            folder: directory containing the file. When omitted, the
                module-level ``freqs_folder`` is used.

        Returns:
            Whatever ``json.load`` decodes from the file.

        Raises:
            NameError: if *folder* is omitted and ``freqs_folder`` is not
                defined at module level.
            OSError: if the file cannot be opened or read.
            json.JSONDecodeError: if the file content is not valid JSON.
        """
        import json
        import os

        if folder is None:
            folder = freqs_folder
        fullpath = os.path.join(folder, path)

        # Decode explicitly as UTF-8 rather than the locale default, which
        # varies by platform and can garble non-ASCII n-grams.
        with open(fullpath, 'r', encoding='utf-8') as f:
            return json.load(f)
|
2021-12-16 04:35:13 +01:00
|
|
|
|
|
|
|
|
2021-12-16 04:42:16 +01:00
|
|
|
# folder holding the per-language frequency files; da_detector.parse_freqs
# reads this name as a module-level global
freqs_folder = "./freqs/"

detector = da_detector()

# load every frequency table, in the order sk, cz, de, en, fr
sk_json, cz_json, de_json, en_json, fr_json = [
    detector.parse_freqs(filename)
    for filename in ("sk.json", "cz.json", "de.json", "en.json", "fr.json")
]
|
|
|
|
|
2021-12-16 04:35:13 +01:00
|
|
|
# vim: ff=unix noexpandtab
|