add gimme_probabilities and replace_nones
methods for "converting" a list of ngrams into a list of the probabilities with which these ngrams occur in a given language, replacing "None" values on the fly with a very low "probability of occurrence" value
This commit is contained in:
parent
3650d89fe1
commit
789ccd5856
@ -90,6 +90,32 @@ class da_detector:
|
|||||||
print(r_ngrams)
|
print(r_ngrams)
|
||||||
return r_ngrams
|
return r_ngrams
|
||||||
|
|
||||||
|
# return a probability list for a list of ngrams and a given language
|
||||||
|
def gimme_probabilities(self, lang_probs: dict, ngrams: list) -> list:
    """Return the probability of each ngram according to one language table.

    Args:
        lang_probs: Non-empty dict that is queried with each ngram as key;
            the stored value is taken as that ngram's probability.
        ngrams: Non-empty list of ngrams to look up.

    Returns:
        A list with one entry per element of ``ngrams``, in the same order.
        Lookups that yield None (ngram not present in ``lang_probs``) are
        handed to ``self.replace_nones`` for substitution.

    Raises:
        TypeError: If ``lang_probs`` is not a dict or ``ngrams`` is not a
            list.
        ValueError: If ``lang_probs`` or ``ngrams`` is empty.
    """
    if not isinstance(lang_probs, dict):
        raise TypeError("lang_probs has to be a dict")
    if not isinstance(ngrams, list):
        raise TypeError("ngrams has to be a list")

    if not lang_probs:
        raise ValueError("empty lang_probs dict")
    if not ngrams:
        raise ValueError("empty ngrams list")

    # dict.get returns None for ngrams that are not found, hence "uncleansed"
    uncleansed_probabilities = [lang_probs.get(ngram) for ngram in ngrams]
    return self.replace_nones(uncleansed_probabilities)
|
||||||
|
|
||||||
|
def replace_nones(self, probabilities: list, fallback: float = 0.000000123) -> list:
    """Return a new list in which every None entry is replaced by ``fallback``.

    Args:
        probabilities: Non-empty list of values, some of which may be None.
        fallback: Value substituted for each None entry. Defaults to
            0.000000123.

    Returns:
        A new list of the same length and order as ``probabilities``; the
        input list is not modified.

    Raises:
        TypeError: If ``probabilities`` is not a list.
        ValueError: If ``probabilities`` is empty.
    """
    if not isinstance(probabilities, list):
        raise TypeError("not a list, bailing")
    if not probabilities:
        raise ValueError("empty list, bailing")
    # a pretty good bogus probability is one reasonably close to zero
    return [fallback if n is None else n for n in probabilities]
|
||||||
|
|
||||||
|
|
||||||
freqs_folder = "./freqs/"
|
freqs_folder = "./freqs/"
|
||||||
test_str = "what freaking ever, nobody cares one bit of a heck"
|
test_str = "what freaking ever, nobody cares one bit of a heck"
|
||||||
|
Reference in New Issue
Block a user