add gimme_probabilities and replace_nones
methods for "converting" a list of ngrams into a list of the probabilities with which these ngrams occur in a given language, replacing "None" values on the fly with a very low "probability of occurrence" value
This commit is contained in:
parent
3650d89fe1
commit
789ccd5856
@ -90,6 +90,32 @@ class da_detector:
|
||||
print(r_ngrams)
|
||||
return r_ngrams
|
||||
|
||||
# return a probability list for a list of ngrams and a given language
|
||||
def gimme_probabilities(self, lang_probs: dict, ngrams: list) -> list:
    """Return a probability list for a list of ngrams and a given language.

    Every ngram is looked up in ``lang_probs``; the looked-up values are
    then passed through ``self.replace_nones`` so that ngrams missing from
    the dict (looked up as ``None``) end up with a fallback value.

    Args:
        lang_probs: non-empty dict keyed by ngram; the values are presumably
            the per-language probabilities of those ngrams.
        ngrams: non-empty list of ngrams to look up.

    Returns:
        The result of ``self.replace_nones`` applied to the looked-up
        values, in the same order as ``ngrams``.

    Raises:
        TypeError: if ``lang_probs`` is not a dict or ``ngrams`` is not a
            list.
        ValueError: if ``lang_probs`` or ``ngrams`` is empty.
    """
    if not isinstance(lang_probs, dict):
        raise TypeError("lang_probs has to be a dict")
    if not isinstance(ngrams, list):
        raise TypeError("ngrams has to be a list")

    if not lang_probs:
        raise ValueError("empty lang_probs dict")
    if not ngrams:
        raise ValueError("empty ngrams list")

    # dict.get yields None for ngrams that are not in lang_probs, hence
    # "uncleansed": the list may still contain None values at this point.
    uncleansed_probabilities = [lang_probs.get(ngram) for ngram in ngrams]
    return self.replace_nones(uncleansed_probabilities)
|
||||
|
||||
def replace_nones(self, probabilities: list, fallback: float = 0.000000123) -> list:
    """Return a copy of ``probabilities`` with every ``None`` replaced.

    Args:
        probabilities: non-empty list of values, some of which may be
            ``None``.
        fallback: value substituted for each ``None`` entry. The default
            is a bogus probability that is reasonably close to zero.

    Returns:
        A new list of the same length; entries that were ``None`` become
        ``fallback``, all other entries are kept as they are.

    Raises:
        TypeError: if ``probabilities`` is not a list.
        ValueError: if ``probabilities`` is empty.
    """
    if not isinstance(probabilities, list):
        raise TypeError("not a list, bailing")
    if not probabilities:
        raise ValueError("empty list, bailing")
    # Only None is replaced (identity check), so a legitimate 0 or 0.0
    # entry is left untouched.
    return [fallback if p is None else p for p in probabilities]
|
||||
|
||||
|
||||
freqs_folder = "./freqs/"
|
||||
test_str = "what freaking ever, nobody cares one bit of a heck"
|
||||
|
Reference in New Issue
Block a user