183 lines
3.6 KiB
Go
183 lines
3.6 KiB
Go
package main
|
|
|
|
import (
|
|
"encoding/csv"
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"strconv"
|
|
"time"
|
|
|
|
"github.com/sjwhitworth/golearn/base"
|
|
"github.com/sjwhitworth/golearn/trees"
|
|
"golang.org/x/exp/rand"
|
|
"gonum.org/v1/gonum/stat/distuv"
|
|
)
|
|
|
|
// Package-level settings shared by data generation and score reporting.
var (
	// randomDataSize is the number of randomly distributed ("normal") rows
	// that are generated; main treats the first randomDataSize predictions
	// as belonging to these rows.
	randomDataSize = 10001

	// outliers is the number of extreme rows appended after the random rows.
	outliers = 15

	// fname is the CSV file the generated data is written to and then
	// parsed back from.
	fname = "data.csv"
)
|
|
|
|
func main() {
|
|
data, err := loadData()
|
|
if err != nil {
|
|
log.Fatalln(err)
|
|
}
|
|
|
|
log.Printf(
|
|
"Train Isolation Forest model (nTrees: %d, maxDepth: %d, subSpace: %d) on the data\n",
|
|
1000, 1000, 7500,
|
|
)
|
|
|
|
predictions := train(data)
|
|
|
|
log.Println("Calculate avg/min scores")
|
|
|
|
avg := 0.0
|
|
minScore := 1.0
|
|
|
|
for i := 0; i < randomDataSize; i++ {
|
|
tmp := predictions[i]
|
|
|
|
avg += tmp
|
|
|
|
if tmp < minScore {
|
|
minScore = tmp
|
|
}
|
|
}
|
|
|
|
fmt.Printf("\tAverage anomaly score for normal data: %f\n",
|
|
avg/float64(randomDataSize),
|
|
)
|
|
fmt.Printf("\tMinimum anomaly score for normal data: %f\n",
|
|
minScore,
|
|
)
|
|
|
|
// these values should be much higher as comapred to the scores for normal
|
|
// data.
|
|
fmt.Println("\tAnomaly scores for outliers are:")
|
|
for i := randomDataSize; i < (randomDataSize + outliers); i++ {
|
|
fmt.Print("\t")
|
|
fmt.Println(predictions[i])
|
|
}
|
|
|
|
log.Println("we're done")
|
|
}
|
|
|
|
func train(data *base.DenseInstances) []float64 {
|
|
// get a new Isolation Forest with 700 trees, max depth 700 and each tree
|
|
// using 7500 datapoints.
|
|
forest := trees.NewIsolationForest(1000, 1000, 7500)
|
|
|
|
// fit the isolation forest to the data.
|
|
forest.Fit(data)
|
|
|
|
// return predictions.
|
|
return forest.Predict(data)
|
|
}
|
|
|
|
func loadData() (*base.DenseInstances, error) {
|
|
// generate and save random data, along with known outlier values.
|
|
err := prepData(fname)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
data, err := base.ParseCSVToInstances(fname, false)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return data, nil
|
|
}
|
|
|
|
// prepData generates and saves random data (along with some known outliers) to
|
|
// a file in CSV format.
|
|
func prepData(path string) error {
|
|
log.Println("generating data")
|
|
data := genData(true, randomDataSize, -1.0, 1.0)
|
|
log.Println("generating data - done")
|
|
|
|
log.Printf("saving data to file at '%s'\n", path)
|
|
f, err := os.Create(path)
|
|
if err != nil {
|
|
log.Printf("could not save data to file at '%s'\n", path)
|
|
return err
|
|
}
|
|
|
|
defer f.Close()
|
|
|
|
w := csv.NewWriter(f)
|
|
|
|
defer w.Flush()
|
|
|
|
log.Println("writing data")
|
|
|
|
err = w.WriteAll(data)
|
|
if err != nil {
|
|
log.Println("error writing data")
|
|
return err
|
|
}
|
|
|
|
log.Println("writing data - done")
|
|
|
|
return nil
|
|
}
|
|
|
|
// genData generates new random data with either normal or uniform
|
|
// distribution. if normal is set, normal distribution is set with sigma and mu
|
|
// values corresponding to the standard normal distribution and min/max values
|
|
// are ignored.
|
|
func genData(normal bool, size int, min, max float64) [][]string {
|
|
col1 := make([]float64, size)
|
|
col2 := make([]float64, size)
|
|
|
|
switch {
|
|
case !normal:
|
|
uniform := &distuv.Uniform{
|
|
Min: min,
|
|
Max: max,
|
|
Src: rand.NewSource(uint64(
|
|
time.Now().UnixNano(),
|
|
)),
|
|
}
|
|
|
|
for i := 0; i < size; i++ {
|
|
col1[i] = uniform.Rand()
|
|
col2[i] = uniform.Rand()
|
|
}
|
|
|
|
case normal:
|
|
stdnorm := &distuv.Normal{
|
|
Sigma: 1,
|
|
Mu: 0,
|
|
Src: rand.NewSource(uint64(
|
|
time.Now().UnixNano(),
|
|
)),
|
|
}
|
|
|
|
for i := 0; i < size; i++ {
|
|
col1[i] = stdnorm.Rand()
|
|
col2[i] = stdnorm.Rand()
|
|
}
|
|
}
|
|
|
|
for i := 0; i < outliers; i++ {
|
|
col1 = append(col1, float64(rand.Int63()))
|
|
col2 = append(col2, float64(rand.Int63()))
|
|
}
|
|
|
|
data := make([][]string, 0, size+outliers)
|
|
|
|
for i := 0; i < size+outliers; i++ {
|
|
r1 := strconv.FormatFloat(col1[i], 'f', -1, 64)
|
|
r2 := strconv.FormatFloat(col2[i], 'f', -1, 64)
|
|
|
|
data = append(data, []string{r1, r2})
|
|
}
|
|
|
|
return data
|
|
}
|