cut and slice weatherAUS dataset for knn

2021-11-11 23:36:11 +01:00 · 2021-11-11 23:36:11 +01:00 · d252909336
commit d252909336
parent 60303aecc9
2 changed files with 142257 additions and 0 deletions
--- a/task2.py
+++ b/task2.py
@ -287,4 +287,67 @@ print('Mean Accuracy: %.3f%%' % (sum(eval_scores)/float(len(eval_scores))))
 row = [2.8,0.7,1.2,0.3]
 # predict the label
 label = predict_classification(dataset, row, num_neighbours)
+print('Data=%s, Predicted: %s' % (row, label))
+
+
+separator()
+print('-- Naive Bayes on weather dataset\n')
+# NOTE(review): only the banner is printed here; no Naive Bayes code follows it in this change.
+
+separator()
+print('-- kNN on weather dataset\n')
+
+# Column names excluded when the weather CSV is loaded further down.
+dontwanna = ['Date', 'Location', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindDir9am', 'WindDir3pm']
+# Disabled `reader`-based loader (commented out); the pandas `read_csv` loading below is used instead:
+# def read_w_csv(filename):
+# 	dataset = list()
+# 	with open(filename, 'r') as file:
+# 		for row in reader(file):
+# 			if row in dontwanna:
+# 				continue
+# 			if not row:
+# 				continue
+# 			dataset.append(row)
+# 	return dataset
+
+filename = 'weatherAUS.csv'
+# dataset = read_w_csv(filename)
+# dataset.drop(columns=dontwanna, inplace=True)
+
+import numpy as np
+import pandas as pd
+# Read just the header row so the wanted columns can be picked by name;
+# the CSV itself is parsed once, below (no throwaway full read first).
+with open(filename, 'r') as f:
+	header = next(reader(f))
+
+# rm unneeded columns: keep every header name that is not listed in `dontwanna`
+dataset = pd.read_csv(filename, usecols=[c for c in header if c not in dontwanna])
+print(dataset.head())
+
+# Encode the Yes/No rain flags as 1/0. The result is assigned back rather than
+# using inplace=True on a column selection, and the `method=...` argument is
+# dropped: `replace` ignores it when a value is given and pandas has deprecated it.
+for flag in ('RainToday', 'RainTomorrow'):
+	dataset[flag] = dataset[flag].replace({'Yes': 1, 'No': 0})
+
+print(dataset.head())
+
+# take the first 20 rows as the working subset
+ds = pd.DataFrame(dataset.head(20))
+for col in ds.columns:
+	ds[col] = ds[col].fillna(ds[col].mean())  # empty fields -> column mean
+print(ds.shape)
+print(ds.head())
+
+num_neighbours = 3
+n_folds = 10
+eval_scores = evaluate_knn_algorithm(ds, k_nearest_neighbours, n_folds, num_neighbours)
+print('Scores: %s' % eval_scores)
+print('Mean Accuracy: %.3f%%' % (sum(eval_scores)/float(len(eval_scores))))
+# classify one hand-written sample row against the imputed subset `ds`
+row = [4,17.5,32.3,1.0,41.0,7.0,20.0,8.0,1,3,5,7,17.8,29.7,0.0,0.2,0]
+# NOTE(review): `ds` is a pandas DataFrame; iterating one yields column labels, not rows - confirm the kNN helpers handle that
+label = predict_classification(ds, row, num_neighbours)
 print('Data=%s, Predicted: %s' % (row, label))
--- a/weatherAUS.csv
+++ b/weatherAUS.csv