cut and slice weatherAUS dataset for knn

This commit is contained in:
surtur 2021-11-11 23:36:11 +01:00
parent 60303aecc9
commit d252909336
Signed by: wanderer
GPG Key ID: 19CE1EC1D9E0486D
2 changed files with 142257 additions and 0 deletions

@ -287,4 +287,67 @@ print('Mean Accuracy: %.3f%%' % (sum(eval_scores)/float(len(eval_scores))))
# Classify one hand-picked sample with the kNN helper and show the result.
row = [2.8, 0.7, 1.2, 0.3]
label = predict_classification(dataset, row, num_neighbours)
print('Data=%s, Predicted: %s' % (row, label))

# Section banners. No Naive Bayes code follows its banner in this part of the
# script, so the kNN banner is printed right after it.
for banner in ('-- Naive Bayes on weather dataset\n',
               '-- kNN on weather dataset\n'):
    separator()
    print(banner)
# Columns of weatherAUS.csv that are excluded from the kNN feature set.
dontwanna = ['Date', 'Location', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindDir9am', 'WindDir3pm']
filename = 'weatherAUS.csv'
import numpy as np
import pandas as pd

# Read only the header row so the columns to keep can be selected by name.
with open(filename, 'r') as f:
    header = next(reader(f))

# rm unneeded columns: load the CSV exactly once, keeping every column that is
# not listed in `dontwanna` (single source of truth for the exclusion list).
wanted_columns = [name for name in header if name not in dontwanna]
dataset = pd.read_csv(filename, usecols=wanted_columns)
print(dataset.head())
# we need proper numeric 0/1 values instead of the 'Yes'/'No' strings.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on a column selection: that chained in-place
# form does not update the parent frame when pandas Copy-on-Write is active.
# The deprecated `method` keyword is dropped as well; it has no effect when
# a replacement value is given.
for flag_column in ('RainToday', 'RainTomorrow'):
    dataset[flag_column] = dataset[flag_column].replace({'Yes': 1, 'No': 0})
print(dataset.head())
# Work on the first 20 rows of the dataset only.
ds = pd.DataFrame(dataset.head(20))

# get rid of empty fields: every missing value is filled with the mean of
# its own column.
for col in ds.columns:
    column_mean = np.mean(ds[col])
    ds[col] = ds[col].fillna(column_mean)

print(ds.shape)
print(ds.head())
# kNN settings for the weather subset: neighbours consulted per prediction
# and number of cross-validation folds.
num_neighbours = 3
n_folds = 10

# NOTE(review): `ds` is a pandas DataFrame, while the kNN helpers are defined
# outside this view -- confirm they accept a DataFrame (iterating one yields
# column labels, not rows).
eval_scores = evaluate_knn_algorithm(ds, k_nearest_neighbours, n_folds, num_neighbours)
print('Scores: %s' % eval_scores)
mean_accuracy = sum(eval_scores) / float(len(eval_scores))
print('Mean Accuracy: %.3f%%' % mean_accuracy)

# predict stuff: classify one hand-written weather record
row = [4, 17.5, 32.3, 1.0, 41.0, 7.0, 20.0, 8.0, 1, 3, 5, 7, 17.8, 29.7, 0.0, 0.2, 0]
label = predict_classification(ds, row, num_neighbours)
print('Data=%s, Predicted: %s' % (row, label))

142194
weatherAUS.csv Normal file

File diff suppressed because it is too large Load Diff