2023-05-14 16:47:01 +02:00
|
|
|
package main
|
|
|
|
|
2023-05-14 19:55:21 +02:00
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
"log"
|
|
|
|
"os"
|
|
|
|
|
|
|
|
"github.com/sjwhitworth/golearn/base"
|
|
|
|
"github.com/sjwhitworth/golearn/trees"
|
|
|
|
)
|
|
|
|
|
|
|
|
// Package-level settings shared by the functions in this file.
var (
	// fname is the CSV file that loadData hands to prepData and then parses.
	fname = "data.csv"

	// outliers is how many predictions, located directly after the first
	// randomDataSize ones, main reports as scores for outliers.
	outliers = 15

	// randomDataSize is how many leading predictions main treats as scores
	// for normal data when computing the average and minimum.
	randomDataSize = 10001
)
|
|
|
|
|
2023-05-14 16:47:01 +02:00
|
|
|
func main() {
|
2023-05-14 19:55:21 +02:00
|
|
|
data, err := loadData()
|
|
|
|
if err != nil {
|
|
|
|
log.Fatalln(err)
|
|
|
|
}
|
|
|
|
|
2023-05-14 20:59:29 +02:00
|
|
|
log.Printf(
|
|
|
|
"Train Isolation Forest model (nTrees: %d, maxDepth: %d, subSpace: %d) on the data\n",
|
|
|
|
1000, 1000, 7500,
|
|
|
|
)
|
|
|
|
|
2023-05-14 19:55:21 +02:00
|
|
|
predictions := train(data)
|
2023-05-14 20:59:29 +02:00
|
|
|
|
|
|
|
log.Println("Calculate avg/min scores")
|
|
|
|
|
2023-05-14 19:55:21 +02:00
|
|
|
avg := 0.0
|
|
|
|
minScore := 1.0
|
|
|
|
|
|
|
|
for i := 0; i < randomDataSize; i++ {
|
|
|
|
tmp := predictions[i]
|
|
|
|
|
|
|
|
avg += tmp
|
|
|
|
|
|
|
|
if tmp < minScore {
|
|
|
|
minScore = tmp
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-05-14 21:03:45 +02:00
|
|
|
fmt.Fprintf(os.Stderr, "\tAverage anomaly score for normal data: %f\n",
|
2023-05-14 19:55:21 +02:00
|
|
|
avg/float64(randomDataSize),
|
|
|
|
)
|
2023-05-14 21:03:45 +02:00
|
|
|
fmt.Fprintf(os.Stderr, "\tMinimum anomaly score for normal data: %f\n",
|
2023-05-14 19:55:21 +02:00
|
|
|
minScore,
|
|
|
|
)
|
|
|
|
|
|
|
|
// these values should be much higher as comapred to the scores for normal
|
|
|
|
// data.
|
2023-05-14 21:03:45 +02:00
|
|
|
fmt.Fprintln(os.Stderr, "\tAnomaly scores for outliers are:")
|
|
|
|
|
2023-05-14 19:55:21 +02:00
|
|
|
for i := randomDataSize; i < (randomDataSize + outliers); i++ {
|
2023-05-14 21:03:45 +02:00
|
|
|
fmt.Fprint(os.Stderr, "\t")
|
|
|
|
fmt.Fprintln(os.Stderr, predictions[i])
|
2023-05-14 19:55:21 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
log.Println("we're done")
|
|
|
|
}
|
|
|
|
|
|
|
|
func train(data *base.DenseInstances) []float64 {
|
|
|
|
// get a new Isolation Forest with 700 trees, max depth 700 and each tree
|
|
|
|
// using 7500 datapoints.
|
|
|
|
forest := trees.NewIsolationForest(1000, 1000, 7500)
|
|
|
|
|
|
|
|
// fit the isolation forest to the data.
|
|
|
|
forest.Fit(data)
|
|
|
|
|
|
|
|
// return predictions.
|
|
|
|
return forest.Predict(data)
|
|
|
|
}
|
|
|
|
|
|
|
|
func loadData() (*base.DenseInstances, error) {
|
|
|
|
// generate and save random data, along with known outlier values.
|
|
|
|
err := prepData(fname)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
data, err := base.ParseCSVToInstances(fname, false)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
return data, nil
|
|
|
|
}
|