add gimme_probabilities and replace_nones

methods for "converting" a list of ngrams into a list of the probabilities
with which these ngrams occur in a given language, replacing "None" values
on the fly with a very low "probability of occurrence" value
This commit is contained in:
surtur 2021-12-20 01:20:22 +01:00
parent 3650d89fe1
commit 789ccd5856
Signed by: wanderer
GPG Key ID: 19CE1EC1D9E0486D

@ -90,6 +90,32 @@ class da_detector:
print(r_ngrams)
return r_ngrams
# return a probability list for a list of ngrams and a given language
def gimme_probabilities(self, lang_probs: dict, ngrams: list):
    """Look up each ngram in ``lang_probs`` and return the resulting list.

    Ngrams missing from ``lang_probs`` come back from the lookup as
    ``None``; the raw list is handed to ``self.replace_nones`` and whatever
    that returns is the result.

    Raises:
        TypeError: if ``lang_probs`` is not a dict or ``ngrams`` is not a
            list (checked in that order).
        ValueError: if ``lang_probs`` or ``ngrams`` is empty (checked in
            that order, after both type checks).
    """
    # type checks come first for both arguments, emptiness checks second
    type_rules = (
        (lang_probs, dict, "lang_probs has to be a dict"),
        (ngrams, list, "ngrams has to be a list"),
    )
    for candidate, wanted_type, complaint in type_rules:
        if not isinstance(candidate, wanted_type):
            raise TypeError(complaint)

    emptiness_rules = (
        (lang_probs, "empty lang_probs dict"),
        (ngrams, "empty ngrams list"),
    )
    for candidate, complaint in emptiness_rules:
        if len(candidate) == 0:
            raise ValueError(complaint)

    # dict.get yields None for unknown ngrams, so this list is still "raw"
    raw_lookups = [lang_probs.get(ngram) for ngram in ngrams]
    return self.replace_nones(raw_lookups)
def replace_nones(
    self, probabilities: list, *, fallback: float = 0.000000123
) -> list:
    """Return a copy of ``probabilities`` with each ``None`` replaced.

    Args:
        probabilities: non-empty list of values; entries that are ``None``
            are the ones substituted.
        fallback: value put in place of every ``None``. Defaults to
            0.000000123, a tiny non-zero stand-in probability.

    Returns:
        A new list of the same length; entries that are not ``None`` are
        passed through unchanged.

    Raises:
        TypeError: if ``probabilities`` is not a list.
        ValueError: if ``probabilities`` is empty.
    """
    if not isinstance(probabilities, list):
        raise TypeError("not a list, bailing")
    if len(probabilities) == 0:
        raise ValueError("empty list, bailing")
    # a good bogus probability is one that is reasonably close to zero;
    # the identity check keeps legitimate falsy values such as 0.0 intact
    return [fallback if p is None else p for p in probabilities]
# sample English sentence; presumably fed to the detector as trial input
# (its use is outside this excerpt -- verify against the rest of the file)
test_str = "what freaking ever, nobody cares one bit of a heck"
# relative path; presumably where the per-language frequency data lives
# (its use is outside this excerpt -- verify against the rest of the file)
freqs_folder = "./freqs/"