This repository has been archived on 2022-02-08. You can view files and clone it, but cannot push or open issues or pull requests.
su_task2/task2.py

174 lines
5.0 KiB
Python
Raw Normal View History

2021-11-04 14:44:40 +01:00
from csv import reader
from random import seed
from random import randrange
from math import sqrt
from math import exp
from math import pi
def read_csv(filename):
    """Load a CSV file into a list of rows, skipping empty lines.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list containing one list of string fields per non-empty row.
    """
    # newline='' hands line-ending handling to the csv module, so quoted
    # fields that contain line breaks are read back unchanged.
    with open(filename, 'r', newline='') as file:
        return [row for row in reader(file) if row]
# convert string column to float
def str_column_to_float(dataset, column):
    """Replace the text in one column of every row with its float value.

    The rows of ``dataset`` are modified in place; surrounding whitespace in
    the field is stripped before conversion. Nothing is returned.
    """
    for record in dataset:
        raw_text = record[column]
        record[column] = float(raw_text.strip())
# convert string column to integer
def str_column_to_int(dataset, column):
    """Encode the distinct values of one column as consecutive integers.

    The rows of ``dataset`` are modified in place and the mapping that was
    applied is printed to stdout.

    Args:
        dataset: List of rows (lists).
        column: Index of the column to encode.

    Returns:
        A dict mapping each original value to the integer that replaced it.
    """
    # IDs are handed out in order of first appearance. Iterating a set here
    # would make the mapping depend on hash order, which for strings changes
    # from one interpreter run to the next.
    lookup = dict()
    for row in dataset:
        lookup.setdefault(row[column], len(lookup))
    print("Class IDs:")
    for value, i in lookup.items():
        print('[%s] => %d' % (value, i))
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup
2021-11-06 23:53:15 +01:00
####################
# Naive Bayes iris #
####################
2021-11-04 14:44:40 +01:00
# split a dataset into n folds
def cross_validation_split(dataset, n_folds):
    """Randomly partition ``dataset`` into ``n_folds`` disjoint folds.

    Every fold holds ``int(len(dataset) / n_folds)`` rows; rows left over when
    the division is not exact are not placed in any fold. ``dataset`` itself
    is not modified.

    Args:
        dataset: List of rows.
        n_folds: Number of folds to create.

    Returns:
        A list of ``n_folds`` lists of rows.
    """
    fold_size = int(len(dataset) / n_folds)
    # Draw from a single working copy so that a row, once picked, cannot be
    # picked again. Copying inside the loop would sample with replacement and
    # let the same row land in several folds (test rows leaking into train).
    remaining = list(dataset)
    dataset_split = list()
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(remaining))
            fold.append(remaining.pop(index))
        dataset_split.append(fold)
    return dataset_split
# calculate accuracy (in per cent)
def accuracy_metric(actual, predicted):
    """Return the percentage of positions where ``predicted`` equals ``actual``.

    The result is a float between 0.0 and 100.0.
    """
    hits = sum(
        1
        for position, truth in enumerate(actual)
        if truth == predicted[position]
    )
    return hits / float(len(actual)) * 100.0
# evaluate using a cross validation split
def evaluate_algorithm(dataset, n_folds, algorithm, *args):
    """Score ``algorithm`` with k-fold cross validation.

    Each fold is held out once as the test set while the remaining folds are
    concatenated into the training set.

    Args:
        dataset: List of rows whose last element is the class label.
        n_folds: Number of folds to split the dataset into.
        algorithm: Callable ``algorithm(train_set, test_set, *args)`` that
            returns one prediction per test row.
        *args: Extra positional arguments forwarded to ``algorithm``.

    Returns:
        A list with one accuracy percentage per fold.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for held_out in folds:
        other_folds = list(folds)
        other_folds.remove(held_out)
        train_set = [row for fold in other_folds for row in fold]

        # Test rows are copies with the label blanked out, so the algorithm
        # cannot see the answer and the original rows stay intact.
        test_set = []
        for row in held_out:
            masked = list(row)
            masked[-1] = None
            test_set.append(masked)

        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in held_out]
        scores.append(accuracy_metric(actual, predicted))
    return scores
# split the dataset by class values, return a dictionary
def separate_by_class(dataset):
    """Group rows by their last element (the class label).

    Returns a dict mapping each label to the list of rows carrying it, with
    rows kept in their original order.
    """
    grouped = {}
    for record in dataset:
        grouped.setdefault(record[-1], []).append(record)
    return grouped
# mean of a list of numbers
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float."""
    total = sum(numbers)
    count = float(len(numbers))
    return total / count
# calculate the standard deviation (sigma) of a list of numbers
def stdev(numbers):
    """Return the sample standard deviation (n - 1 denominator) of ``numbers``.

    Raises:
        ZeroDivisionError: If ``numbers`` has fewer than two elements.
    """
    count = len(numbers)
    # The average is computed once up front; evaluating it inside the
    # comprehension would redo an O(n) sum for every element.
    average = sum(numbers) / float(count)
    variance = sum([(x - average) ** 2 for x in numbers]) / float(count - 1)
    return sqrt(variance)
# summarize - the mean, standard deviation (sigma) and count for each column in a dataset
def summarize_dataset(dataset):
    """Summarize every feature column of ``dataset``.

    Args:
        dataset: List of rows whose last element is the class label.

    Returns:
        A list with one ``(mean, stdev, count)`` tuple per column, excluding
        the last (label) column.
    """
    columns = list(zip(*dataset))
    # The label column is dropped before any statistics are computed, so no
    # work is wasted on it and non-numeric labels do not break the summary.
    return [
        (mean(column), stdev(column), len(column))
        for column in columns[:-1]
    ]
# calculate per-column statistics for the rows of each class
def summarize_by_class(dataset):
    """Build the per-class column statistics for ``dataset``.

    Returns a dict mapping each class label to the list produced by
    ``summarize_dataset`` for the rows of that class.
    """
    grouped = separate_by_class(dataset)
    return {
        class_id: summarize_dataset(rows)
        for class_id, rows in grouped.items()
    }
# Gaussian probability distribution function for x
def gaussian_probability(x, mean, stdev):
    """Evaluate the normal density with the given mean and stdev at ``x``."""
    normalizer = 1 / (sqrt(2 * pi) * stdev)
    exponent = -((x - mean) ** 2 / (2 * stdev ** 2))
    return normalizer * exp(exponent)
2021-11-04 14:44:40 +01:00
# calculate the probabilities of predicting each class for a given row
def class_probability_predictions(summaries, row):
    """Score ``row`` against every class in ``summaries``.

    For each class the score is the class prior (its share of all training
    rows) multiplied by the Gaussian density of each feature value. The
    scores are not normalized.

    Args:
        summaries: Dict mapping class label to a list of
            ``(mean, stdev, count)`` tuples, one per feature.
        row: Sequence of feature values; elements beyond the number of
            summarized features are ignored.

    Returns:
        A dict mapping each class label to its score.
    """
    # Every feature tuple of a class carries the same row count, so the
    # first one is used to get the class size.
    total_rows = sum(stats[0][2] for stats in summaries.values())
    probabilities = {}
    for class_id, class_summaries in summaries.items():
        score = class_summaries[0][2] / float(total_rows)
        for position, (mu, sigma, _) in enumerate(class_summaries):
            score *= gaussian_probability(row[position], mu, sigma)
        probabilities[class_id] = score
    return probabilities
# predict the class for a given row
def predict(summaries, row):
    """Return the class label with the highest score for ``row``.

    When several classes tie, the one encountered first wins. Returns None
    when ``summaries`` contains no classes.
    """
    scores = class_probability_predictions(summaries, row)
    winner = None
    winner_score = -1
    for label in scores:
        score = scores[label]
        # Skip candidates that do not strictly beat the current winner.
        if winner is not None and not score > winner_score:
            continue
        winner, winner_score = label, score
    return winner
# Gaussian naive Bayes: fit on the train rows, then predict every test row
def naive_bayes(train, test):
    """Fit per-class statistics on ``train`` and predict a label per ``test`` row.

    Returns the list of predicted labels, in the order of ``test``.
    """
    model = summarize_by_class(train)
    return [predict(model, record) for record in test]
# run naive bayes on the example dataset
# Fixed seed so the random fold assignment is the same on every run.
seed(1)
filename = 'iris.txt'
dataset = read_csv(filename)
# Every column except the last holds a numeric feature; the last column is
# the class label and gets encoded as an integer.
class_column = len(dataset[0]) - 1
for i in range(class_column):
    str_column_to_float(dataset, i)
str_column_to_int(dataset, class_column)
# Evaluate the classifier with 10-fold cross validation.
n_folds = 10
scores = evaluate_algorithm(dataset, n_folds, naive_bayes)
print('Scores: %s' % scores)
mean_accuracy = sum(scores) / float(len(scores))
print('Mean Accuracy: %.3f%%' % mean_accuracy)
2021-11-06 23:53:15 +01:00
# The class column of `dataset` has already been encoded as integers earlier
# in this script, so it is not encoded again here: a second pass would only
# re-map integers to integers and print a confusing mapping.
# fit model on the full dataset
model = summarize_by_class(dataset)
# define a new record
row = [1.7,0.8,5.3,0.2]
# predict the label
label = predict(model, row)
print('Data=%s, Predicted: %s' % (row, label))