cut and slice weatherAUS dataset for knn
This commit is contained in:
parent
60303aecc9
commit
d252909336
63
task2.py
63
task2.py
@ -288,3 +288,66 @@ row = [2.8,0.7,1.2,0.3]
|
||||
# predict the label
|
||||
label = predict_classification(dataset, row, num_neighbours)
|
||||
print('Data=%s, Predicted: %s' % (row, label))
|
||||
|
||||
|
||||
separator()
|
||||
print('-- Naive Bayes on weather dataset\n')
|
||||
|
||||
|
||||
separator()
|
||||
print('-- kNN on weather dataset\n')
|
||||
|
||||
dontwanna = ['Date', 'Location', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindDir9am', 'WindDir3pm']
|
||||
|
||||
# def read_w_csv(filename):
|
||||
# dataset = list()
|
||||
# with open(filename, 'r') as file:
|
||||
# for row in reader(file):
|
||||
# if row in dontwanna:
|
||||
# continue
|
||||
# if not row:
|
||||
# continue
|
||||
# dataset.append(row)
|
||||
# return dataset
|
||||
|
||||
filename = 'weatherAUS.csv'
|
||||
|
||||
# dataset = read_w_csv(filename)
|
||||
# dataset.drop(columns=dontwanna, inplace=True)
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
# Load the weather CSV exactly once, leaving out the columns named in
# `dontwanna`. A callable `usecols` is evaluated against every column name, so
# there is no need to pre-read the header or to read the whole file first and
# throw that result away.
dataset = pd.read_csv(filename, usecols=lambda column: column not in dontwanna)
|
||||
print(dataset.head())
|
||||
|
||||
# we need proper boolean values
|
||||
# Encode the Yes/No rain flags as 1/0. The result is assigned back to the
# column rather than using replace(inplace=True) on a column selection, which
# does not update the parent DataFrame when pandas copy-on-write is active.
# The stray `method=...` argument (a literal Ellipsis) is dropped; that
# keyword is deprecated in recent pandas releases.
for flag_column in ('RainToday', 'RainTomorrow'):
    dataset[flag_column] = dataset[flag_column].replace({'Yes': 1, 'No': 0})
|
||||
|
||||
print(dataset.head())
|
||||
|
||||
ds = pd.DataFrame(dataset.head(20))
|
||||
# get rid of empty fields (fill them up with column means)
|
||||
for col in ds.columns:
    # Replace missing entries with the mean of the column's present values
    # (Series.mean skips NaN by default).
    ds[col] = ds[col].fillna(ds[col].mean())
|
||||
print(ds.shape)
|
||||
print(ds.head())
|
||||
|
||||
num_neighbours = 3
|
||||
n_folds = 10
|
||||
# NOTE(review): `ds` is a pandas DataFrame here. Iterating a DataFrame yields
# its column labels, not its rows, so confirm that evaluate_knn_algorithm (not
# visible in this hunk) accepts a DataFrame rather than a list of row lists.
eval_scores = evaluate_knn_algorithm(ds, k_nearest_neighbours, n_folds, num_neighbours)
|
||||
print('Scores: %s' % eval_scores)
|
||||
print('Mean Accuracy: %.3f%%' % (sum(eval_scores)/float(len(eval_scores))))
|
||||
# predict stuff
|
||||
row = [4,17.5,32.3,1.0,41.0,7.0,20.0,8.0,1,3,5,7,17.8,29.7,0.0,0.2,0]
|
||||
# predict the label
|
||||
label = predict_classification(ds, row, num_neighbours)
|
||||
print('Data=%s, Predicted: %s' % (row, label))
|
142194
weatherAUS.csv
Normal file
142194
weatherAUS.csv
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user