From eb73080dd4ddf126ec87d876f6c585e1f6a5cdc6 Mon Sep 17 00:00:00 2001
From: surtur
Date: Thu, 4 Nov 2021 14:44:40 +0100
Subject: [PATCH] initial commit

---
 iris.txt | 150 +++++++++++++++++++++++++++++++++++++++++++++++++
 task2.py | 166 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 316 insertions(+)
 create mode 100644 iris.txt
 create mode 100644 task2.py

diff --git a/iris.txt b/iris.txt
new file mode 100644
index 0000000..a3490e0
--- /dev/null
+++ b/iris.txt
@@ -0,0 +1,150 @@
+5.1,3.5,1.4,0.2,Iris-setosa
+4.9,3.0,1.4,0.2,Iris-setosa
+4.7,3.2,1.3,0.2,Iris-setosa
+4.6,3.1,1.5,0.2,Iris-setosa
+5.0,3.6,1.4,0.2,Iris-setosa
+5.4,3.9,1.7,0.4,Iris-setosa
+4.6,3.4,1.4,0.3,Iris-setosa
+5.0,3.4,1.5,0.2,Iris-setosa
+4.4,2.9,1.4,0.2,Iris-setosa
+4.9,3.1,1.5,0.1,Iris-setosa
+5.4,3.7,1.5,0.2,Iris-setosa
+4.8,3.4,1.6,0.2,Iris-setosa
+4.8,3.0,1.4,0.1,Iris-setosa
+4.3,3.0,1.1,0.1,Iris-setosa
+5.8,4.0,1.2,0.2,Iris-setosa
+5.7,4.4,1.5,0.4,Iris-setosa
+5.4,3.9,1.3,0.4,Iris-setosa
+5.1,3.5,1.4,0.3,Iris-setosa
+5.7,3.8,1.7,0.3,Iris-setosa
+5.1,3.8,1.5,0.3,Iris-setosa
+5.4,3.4,1.7,0.2,Iris-setosa
+5.1,3.7,1.5,0.4,Iris-setosa
+4.6,3.6,1.0,0.2,Iris-setosa
+5.1,3.3,1.7,0.5,Iris-setosa
+4.8,3.4,1.9,0.2,Iris-setosa
+5.0,3.0,1.6,0.2,Iris-setosa
+5.0,3.4,1.6,0.4,Iris-setosa
+5.2,3.5,1.5,0.2,Iris-setosa
+5.2,3.4,1.4,0.2,Iris-setosa
+4.7,3.2,1.6,0.2,Iris-setosa
+4.8,3.1,1.6,0.2,Iris-setosa
+5.4,3.4,1.5,0.4,Iris-setosa
+5.2,4.1,1.5,0.1,Iris-setosa
+5.5,4.2,1.4,0.2,Iris-setosa
+4.9,3.1,1.5,0.1,Iris-setosa
+5.0,3.2,1.2,0.2,Iris-setosa
+5.5,3.5,1.3,0.2,Iris-setosa
+4.9,3.1,1.5,0.1,Iris-setosa
+4.4,3.0,1.3,0.2,Iris-setosa
+5.1,3.4,1.5,0.2,Iris-setosa
+5.0,3.5,1.3,0.3,Iris-setosa
+4.5,2.3,1.3,0.3,Iris-setosa
+4.4,3.2,1.3,0.2,Iris-setosa
+5.0,3.5,1.6,0.6,Iris-setosa
+5.1,3.8,1.9,0.4,Iris-setosa
+4.8,3.0,1.4,0.3,Iris-setosa
+5.1,3.8,1.6,0.2,Iris-setosa
+4.6,3.2,1.4,0.2,Iris-setosa
+5.3,3.7,1.5,0.2,Iris-setosa
+5.0,3.3,1.4,0.2,Iris-setosa
+7.0,3.2,4.7,1.4,Iris-versicolor
+6.4,3.2,4.5,1.5,Iris-versicolor
+6.9,3.1,4.9,1.5,Iris-versicolor
+5.5,2.3,4.0,1.3,Iris-versicolor
+6.5,2.8,4.6,1.5,Iris-versicolor
+5.7,2.8,4.5,1.3,Iris-versicolor
+6.3,3.3,4.7,1.6,Iris-versicolor
+4.9,2.4,3.3,1.0,Iris-versicolor
+6.6,2.9,4.6,1.3,Iris-versicolor
+5.2,2.7,3.9,1.4,Iris-versicolor
+5.0,2.0,3.5,1.0,Iris-versicolor
+5.9,3.0,4.2,1.5,Iris-versicolor
+6.0,2.2,4.0,1.0,Iris-versicolor
+6.1,2.9,4.7,1.4,Iris-versicolor
+5.6,2.9,3.6,1.3,Iris-versicolor
+6.7,3.1,4.4,1.4,Iris-versicolor
+5.6,3.0,4.5,1.5,Iris-versicolor
+5.8,2.7,4.1,1.0,Iris-versicolor
+6.2,2.2,4.5,1.5,Iris-versicolor
+5.6,2.5,3.9,1.1,Iris-versicolor
+5.9,3.2,4.8,1.8,Iris-versicolor
+6.1,2.8,4.0,1.3,Iris-versicolor
+6.3,2.5,4.9,1.5,Iris-versicolor
+6.1,2.8,4.7,1.2,Iris-versicolor
+6.4,2.9,4.3,1.3,Iris-versicolor
+6.6,3.0,4.4,1.4,Iris-versicolor
+6.8,2.8,4.8,1.4,Iris-versicolor
+6.7,3.0,5.0,1.7,Iris-versicolor
+6.0,2.9,4.5,1.5,Iris-versicolor
+5.7,2.6,3.5,1.0,Iris-versicolor
+5.5,2.4,3.8,1.1,Iris-versicolor
+5.5,2.4,3.7,1.0,Iris-versicolor
+5.8,2.7,3.9,1.2,Iris-versicolor
+6.0,2.7,5.1,1.6,Iris-versicolor
+5.4,3.0,4.5,1.5,Iris-versicolor
+6.0,3.4,4.5,1.6,Iris-versicolor
+6.7,3.1,4.7,1.5,Iris-versicolor
+6.3,2.3,4.4,1.3,Iris-versicolor
+5.6,3.0,4.1,1.3,Iris-versicolor
+5.5,2.5,4.0,1.3,Iris-versicolor
+5.5,2.6,4.4,1.2,Iris-versicolor
+6.1,3.0,4.6,1.4,Iris-versicolor
+5.8,2.6,4.0,1.2,Iris-versicolor
+5.0,2.3,3.3,1.0,Iris-versicolor
+5.6,2.7,4.2,1.3,Iris-versicolor
+5.7,3.0,4.2,1.2,Iris-versicolor
+5.7,2.9,4.2,1.3,Iris-versicolor
+6.2,2.9,4.3,1.3,Iris-versicolor
+5.1,2.5,3.0,1.1,Iris-versicolor
+5.7,2.8,4.1,1.3,Iris-versicolor
+6.3,3.3,6.0,2.5,Iris-virginica
+5.8,2.7,5.1,1.9,Iris-virginica
+7.1,3.0,5.9,2.1,Iris-virginica
+6.3,2.9,5.6,1.8,Iris-virginica
+6.5,3.0,5.8,2.2,Iris-virginica
+7.6,3.0,6.6,2.1,Iris-virginica
+4.9,2.5,4.5,1.7,Iris-virginica
+7.3,2.9,6.3,1.8,Iris-virginica
+6.7,2.5,5.8,1.8,Iris-virginica
+7.2,3.6,6.1,2.5,Iris-virginica
+6.5,3.2,5.1,2.0,Iris-virginica
+6.4,2.7,5.3,1.9,Iris-virginica
+6.8,3.0,5.5,2.1,Iris-virginica
+5.7,2.5,5.0,2.0,Iris-virginica
+5.8,2.8,5.1,2.4,Iris-virginica
+6.4,3.2,5.3,2.3,Iris-virginica
+6.5,3.0,5.5,1.8,Iris-virginica
+7.7,3.8,6.7,2.2,Iris-virginica
+7.7,2.6,6.9,2.3,Iris-virginica
+6.0,2.2,5.0,1.5,Iris-virginica
+6.9,3.2,5.7,2.3,Iris-virginica
+5.6,2.8,4.9,2.0,Iris-virginica
+7.7,2.8,6.7,2.0,Iris-virginica
+6.3,2.7,4.9,1.8,Iris-virginica
+6.7,3.3,5.7,2.1,Iris-virginica
+7.2,3.2,6.0,1.8,Iris-virginica
+6.2,2.8,4.8,1.8,Iris-virginica
+6.1,3.0,4.9,1.8,Iris-virginica
+6.4,2.8,5.6,2.1,Iris-virginica
+7.2,3.0,5.8,1.6,Iris-virginica
+7.4,2.8,6.1,1.9,Iris-virginica
+7.9,3.8,6.4,2.0,Iris-virginica
+6.4,2.8,5.6,2.2,Iris-virginica
+6.3,2.8,5.1,1.5,Iris-virginica
+6.1,2.6,5.6,1.4,Iris-virginica
+7.7,3.0,6.1,2.3,Iris-virginica
+6.3,3.4,5.6,2.4,Iris-virginica
+6.4,3.1,5.5,1.8,Iris-virginica
+6.0,3.0,4.8,1.8,Iris-virginica
+6.9,3.1,5.4,2.1,Iris-virginica
+6.7,3.1,5.6,2.4,Iris-virginica
+6.9,3.1,5.1,2.3,Iris-virginica
+5.8,2.7,5.1,1.9,Iris-virginica
+6.8,3.2,5.9,2.3,Iris-virginica
+6.7,3.3,5.7,2.5,Iris-virginica
+6.7,3.0,5.2,2.3,Iris-virginica
+6.3,2.5,5.0,1.9,Iris-virginica
+6.5,3.0,5.2,2.0,Iris-virginica
+6.2,3.4,5.4,2.3,Iris-virginica
+5.9,3.0,5.1,1.8,Iris-virginica
diff --git a/task2.py b/task2.py
new file mode 100644
index 0000000..68e25d7
--- /dev/null
+++ b/task2.py
@@ -0,0 +1,166 @@
+from csv import reader
+from random import seed
+from random import randrange
+from math import sqrt
+from math import exp
+from math import pi
+
+def read_csv(filename):
+    dataset = list()
+    with open(filename, 'r') as file:
+        csv_reader = reader(file)
+        for row in csv_reader:
+            if not row:
+                continue
+            dataset.append(row)
+    return dataset
+
+# convert string column to float
+def str_column_to_float(dataset, column):
+    for row in dataset:
+        row[column] = float(row[column].strip())
+
+# convert string column to integer
+def str_column_to_int(dataset, column):
+    class_ids = [row[column] for row in dataset]
+    unique = set(class_ids)
+    lookup = dict()
+    print("Class IDs:")
+    for i, value in enumerate(unique):
+        lookup[value] = i
+        print('[%s] => %d' % (value, i))
+    for row in dataset:
+        row[column] = lookup[row[column]]
+    return lookup
+
+
+# split a dataset into n folds
+def cross_validation_split(dataset, n_folds):
+    dataset_split = list()
+    dataset_copy = list(dataset)
+    fold_size = int(len(dataset) / n_folds)
+    for _ in range(n_folds):
+        fold = list()
+        while len(fold) < fold_size:
+            index = randrange(len(dataset_copy))
+            fold.append(dataset_copy.pop(index))
+        dataset_split.append(fold)
+    # print(dataset_split)
+    return dataset_split
+
+# calculate accuracy (in per cent)
+def accuracy_metric(actual, predicted):
+    correct = 0
+    for i in range(len(actual)):
+        if actual[i] == predicted[i]:
+            correct += 1
+    return correct / float(len(actual)) * 100.0
+
+# evaluate using a cross validation split
+def evaluate_algorithm(dataset, n_folds, algorithm, *args):
+    folds = cross_validation_split(dataset, n_folds)
+    scores = list()
+    for fold in folds:
+        train_set = list(folds)
+        train_set.remove(fold)
+        train_set = sum(train_set, [])
+        test_set = list()
+        for row in fold:
+            row_copy = list(row)
+            # get a row from each fold
+            test_set.append(row_copy)
+            # we don't know the class yet
+            row_copy[-1] = None
+        predicted = algorithm(train_set, test_set, *args)
+        actual = [row[-1] for row in fold]
+        # calculate accuracy
+        accuracy = accuracy_metric(actual, predicted)
+        scores.append(accuracy)
+    return scores
+
+# split the dataset by class values, return a dictionary
+def separate_by_class(dataset):
+    separated = dict()
+    for i in range(len(dataset)):
+        vector = dataset[i]
+        class_id = vector[-1]
+        if class_id not in separated:
+            separated[class_id] = list()
+        separated[class_id].append(vector)
+    return separated
+
+# mean of a list of numbers
+def mean(numbers):
+    return sum(numbers)/float(len(numbers))
+
+# calculate the standard deviation (sigma) of a list of numbers
+def stdev(numbers):
+    avg = mean(numbers)
+    variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
+    return sqrt(variance)
+
+# summarize a dataset: the mean, standard deviation (sigma) and count of each column
+def summarize_dataset(dataset):
+    summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
+    del summaries[-1]  # drop the stats computed for the class column
+    return summaries
+
+# calculate per-column statistics for each class in the dataset
+def summarize_by_class(dataset):
+    separated = separate_by_class(dataset)
+    summaries = dict()
+    for class_id, rows in separated.items():
+        summaries[class_id] = summarize_dataset(rows)
+    return summaries
+
+# Gaussian probability distribution function for x
+def gaussian_probability(x, mean, stdev):
+    exponent = exp(-((x-mean)**2 / (2 * stdev**2)))
+    return (1 / (sqrt(2 * pi) * stdev)) * exponent
+
+# calculate the probabilities of predicting each class for a given row
+def class_probability_predictions(summaries, row):
+    total_rows = sum([summaries[label][0][2] for label in summaries])
+    probabilities = dict()
+    for class_id, class_summaries in summaries.items():
+        # start with the class prior (for iris there are 3 classes, 0-2)
+        probabilities[class_id] = summaries[class_id][0][2]/float(total_rows)
+        for i in range(len(class_summaries)):
+            mean, stdev, _ = class_summaries[i]
+            probabilities[class_id] *= gaussian_probability(row[i], mean, stdev)
+    return probabilities
+
+# predict the class for a given row
+def predict(summaries, row):
+    probabilities = class_probability_predictions(summaries, row)
+    best_label, best_probability = None, -1
+    for class_id, probability in probabilities.items():
+        if best_label is None or probability > best_probability:
+            best_probability = probability
+            best_label = class_id
+    return best_label
+
+# Naive Bayes: summarize the training set, then predict each row of the test set
+def naive_bayes(train, test):
+    summarize = summarize_by_class(train)
+    predictions = list()
+    for row in test:
+        output = predict(summarize, row)
+        predictions.append(output)
+    return predictions
+
+# run naive bayes on the example dataset
+seed(1)
+filename = 'iris.txt'
+dataset = read_csv(filename)
+for i in range(len(dataset[0])-1):
+    str_column_to_float(dataset, i)
+
+# convert class column to integers
+str_column_to_int(dataset, len(dataset[0])-1)
+
+# evaluate algorithm
+n_folds = 10
+scores = evaluate_algorithm(dataset, n_folds, naive_bayes)
+print('Scores: %s' % scores)
+print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))
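
Note (not part of the patch): a minimal standalone sketch of the per-class scoring that task2.py performs, using the same Gaussian likelihood formula as gaussian_probability(); the mean/stdev summaries and the example row below are made up purely for illustration.

from math import exp, pi, sqrt

# same density formula as gaussian_probability() in task2.py
def gaussian_pdf(x, mu, sigma):
    return (1.0 / (sqrt(2.0 * pi) * sigma)) * exp(-((x - mu) ** 2) / (2.0 * sigma ** 2))

# at x == mu the density peaks at 1 / (sqrt(2*pi) * sigma)
print(gaussian_pdf(5.0, 5.0, 1.0))  # ~0.3989

# hypothetical score for one class and one 2-feature row, mirroring
# class_probability_predictions(): prior * product of per-feature likelihoods
prior = 50 / 150.0  # e.g. 50 of 150 training rows belong to this class
score = prior * gaussian_pdf(5.1, 5.0, 0.35) * gaussian_pdf(3.5, 3.4, 0.38)
print(score)  # predict() picks the class with the largest such score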