pick_ngrams: complete the implementation

This commit is contained in:
surtur 2021-12-17 17:32:49 +01:00
parent b56662dd87
commit 225b683541
Signed by: wanderer
GPG Key ID: 19CE1EC1D9E0486D

@ -46,6 +46,7 @@ class da_detector:
return obj return obj
def pick_ngrams(self, what_grams: int, how_many: int, text: str): def pick_ngrams(self, what_grams: int, how_many: int, text: str):
from random import randint
if not isinstance(what_grams, int): if not isinstance(what_grams, int):
raise TypeError("what_grams has to be an int") raise TypeError("what_grams has to be an int")
if not isinstance(how_many, int): if not isinstance(how_many, int):
@ -62,14 +63,40 @@ class da_detector:
if (len(text) <= 10): if (len(text) <= 10):
raise ValueError("not doing anything with text shorter than 10 characters") raise ValueError("not doing anything with text shorter than 10 characters")
# Sample up to `how_many` distinct n-grams of length `what_grams` from
# random positions in `text`, then print them.
t_len = len(text)
# highest start index that still leaves room for a full n-gram
last_start = t_len - what_grams
# list of random n-grams (distinct, in the order they were drawn)
r_ngrams = []
# upper bound on the number of draws, so the loop terminates even when the
# text holds fewer than `how_many` distinct n-grams
insanity_threshold = 1000
sanity_ctr = 0
while (
    last_start >= 0
    and len(r_ngrams) < how_many
    and sanity_ctr < insanity_threshold
):
    # Python has no `++` operator: `++x` is two unary pluses and leaves x
    # unchanged, so the counter needs an explicit augmented assignment.
    # Every draw is counted, which guarantees termination.
    sanity_ctr += 1
    # not truly random, but good enough here; randint is inclusive on both
    # ends, so every drawn position is a valid n-gram start
    r_position = randint(0, last_start)
    candidate_ngram = text[r_position:r_position + what_grams]
    if candidate_ngram not in r_ngrams:
        r_ngrams.append(candidate_ngram)
print(r_ngrams)
# NOTE(review): statements that appear twice on one line look like the
# unchanged text from both columns of a side-by-side diff view - confirm
# against the real file before running this.
freqs_folder = "./freqs/" freqs_folder = "./freqs/"
test_str = "what freaking ever, nobody cares one bit of a heck" test_str = "what freaking ever, nobody cares one bit of a heck"
detector = da_detector() detector = da_detector()
# Exercise pick_ngrams on the sample sentence: ask for 10 n-grams of length 3,
# then 10 n-grams of length 2.
detector.pick_ngrams(3, 10, test_str) detector.pick_ngrams(
what_grams=3,
how_many=10,
text=test_str)
detector.pick_ngrams(
what_grams=2,
how_many=10,
text=test_str)
# Parse sk.json and cz.json with parse_freqs (presumably per-language
# frequency tables - confirm against parse_freqs).
sk_json = detector.parse_freqs("sk.json") sk_json = detector.parse_freqs("sk.json")
cz_json = detector.parse_freqs("cz.json") cz_json = detector.parse_freqs("cz.json")