This repository has been archived on 2022-02-08. You can view files and clone it, but cannot push or open issues or pull requests.
su_task3-nlp/da_detector.py
2021-12-16 04:59:27 +01:00

59 lines
1.3 KiB
Python
Executable File

#!/usr/bin/env python3
"""
This file holds a class that implements a "detector" (a language classifier)
that learns from predefined, frequency-analysed sets of n-grams.
"""
class da_detector:
    """Language "detector" (classifier) configured with a list of languages.

    Besides holding the configured languages, the class groups two n-gram
    clean-up helpers and a loader for JSON frequency files.
    """

    def __init__(self, langs_to_check=None):
        """Store the languages this detector should consider.

        Args:
            langs_to_check: language codes to check, to be picked from
                ["cz", "sk", "de", "en", "fr"]. Defaults to ["sk"].
        """
        # A fresh default list per instance: a list literal in the signature
        # would be one object shared (and mutated) across all instances.
        if langs_to_check is None:
            langs_to_check = ["sk"]
        self.langs_to_check = langs_to_check

    @staticmethod
    def rm_interpunction(data):
        """Return the n-grams of *data* with ASCII punctuation removed.

        Args:
            data: iterable of strings (n-grams).

        Returns:
            A new list holding each n-gram, in iteration order, with every
            character of ``string.punctuation`` stripped.
        """
        from string import punctuation

        # Strings are immutable, so the cleaned values have to be collected
        # into a new list; the table is built once for the whole batch.
        strip_table = str.maketrans("", "", punctuation)
        return [ngram.translate(strip_table) for ngram in data]

    @staticmethod
    def rm_digits(data):
        """Return the n-grams of *data* with ASCII digits removed.

        Args:
            data: iterable of strings (n-grams).

        Returns:
            A new list holding each n-gram, in iteration order, with every
            character of ``string.digits`` stripped.
        """
        from string import digits

        # Python 3 str.translate() takes a single mapping argument; the
        # two-argument (None, chars) form only existed on Python 2 str.
        strip_table = str.maketrans("", "", digits)
        return [ngram.translate(strip_table) for ngram in data]

    def parse_freqs(self, path: str):
        """Load and decode a JSON frequency file.

        Args:
            path: file name appended to the module-level ``freqs_folder``.

        Returns:
            The decoded JSON document.

        Raises:
            OSError: the file cannot be opened or read.
            json.JSONDecodeError: the content is not valid JSON.
        """
        import json

        fullpath = freqs_folder + path
        # JSON text is UTF-8 by specification; do not depend on the locale's
        # default encoding when reading it.
        with open(fullpath, "r", encoding="utf-8") as f:
            return json.load(f)
# Folder prefix that da_detector.parse_freqs prepends to each file name.
freqs_folder = "./freqs/"
detector = da_detector()
# Load the five frequency files in a fixed order, one module-level name each.
sk_json, cz_json, de_json, en_json, fr_json = map(
    detector.parse_freqs,
    ("sk.json", "cz.json", "de.json", "en.json", "fr.json"),
)
# vim: ff=unix noexpandtab