1-NN with SAX + MINDIST

This example presents a comparison between k-Nearest Neighbor runs with k=1.
It compares the use of:

* MINDIST (see [1]) on SAX representations of the data.
* Euclidean distance on the raw values of the time series.

The comparison is based on the test-set error rate (i.e. 1 - accuracy) and the execution time, measured on several benchmark datasets from the UCR/UEA archive.

[1] Lin, Jessica, et al. “Experiencing SAX: a novel symbolic representation
of time series.” Data Mining and Knowledge Discovery 15.2 (2007): 107-144.
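
As a quick illustration of what the SAX + MINDIST side of the comparison computes, the sketch below builds SAX words for two toy series and compares MINDIST against the Euclidean distance on the raw values. It is not part of the benchmark script; it assumes tslearn's SymbolicAggregateApproximation (with its distance_sax method) and the random_walks toy-data generator.

import numpy
from tslearn.generators import random_walks
from tslearn.piecewise import SymbolicAggregateApproximation
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

# Two z-normalized toy series of length 128
X = TimeSeriesScalerMeanVariance().fit_transform(random_walks(n_ts=2, sz=128))

# Discretize each series into 16 segments over a 10-symbol alphabet
sax = SymbolicAggregateApproximation(n_segments=16, alphabet_size_avg=10)
X_sax = sax.fit_transform(X)

# MINDIST between the two SAX words vs. Euclidean distance on raw values;
# MINDIST is designed to lower-bound the true Euclidean distance
d_mindist = sax.distance_sax(X_sax[0], X_sax[1])
d_eucl = numpy.linalg.norm(X[0].ravel() - X[1].ravel())
print(d_mindist, d_eucl)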

|      dataset       | sax error  |  sax time  | eucl error | eucl time  |
--------------------------------------------------------------------------
|    SyntheticControl|        0.03|     3.48727|        0.12|     1.03348|
|            GunPoint|     0.20667|     1.88121|     0.08667|     0.73752|
|            FaceFour|     0.14773|     2.17096|     0.21591|     0.90353|
|          Lightning2|     0.19672|     3.92335|      0.2459|     1.71236|
|          Lightning7|     0.46575|     2.47131|     0.42466|     1.06485|
|              ECG200|        0.12|     1.26332|        0.12|     0.50789|
|               Plane|     0.04762|     1.89187|      0.0381|     0.74948|
|                 Car|        0.35|     3.61388|     0.26667|     1.54936|
|                Beef|     0.53333|     1.61011|     0.33333|     0.65551|
|              Coffee|     0.46429|     0.87851|         0.0|     0.37575|
|            OliveOil|     0.83333|     1.79912|     0.13333|     0.77401|
--------------------------------------------------------------------------

# Author: Gilles Vandewiele
# License: BSD 3 clause

import warnings
import time

import numpy

from tslearn.datasets import UCR_UEA_datasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.neighbors import KNeighborsTimeSeriesClassifier

from sklearn.base import clone
from sklearn.metrics import accuracy_score


warnings.filterwarnings('ignore')


def print_table(accuracies, times):
    """Utility function to pretty print the obtained accuracies"""
    header_str = '|'
    header_str += '{:^20}|'.format('dataset')
    columns = ['sax error', 'sax time', 'eucl error', 'eucl time']
    for col in columns:
        header_str += '{:^12}|'.format(col)
    print(header_str)
    print('-'*(len(columns) * 13 + 22))

    for dataset in accuracies:
        acc_sax, acc_euclidean = accuracies[dataset]
        time_sax, time_euclidean = times[dataset]
        sax_error = numpy.around(1 - acc_sax, 5)
        eucl_error = numpy.around(1 - acc_euclidean, 5)
        time_sax = numpy.around(time_sax, 5)
        time_euclidean = numpy.around(time_euclidean, 5)
        s = '|'
        s += '{:>20}|'.format(dataset)
        s += '{:>12}|'.format(sax_error)
        s += '{:>12}|'.format(time_sax)
        s += '{:>12}|'.format(eucl_error)
        s += '{:>12}|'.format(time_euclidean)
        print(s.strip())

    print('-'*(len(columns) * 13 + 22))


# Set seed
numpy.random.seed(0)

# Define the datasets along with the number of SAX segments (w) for each
data_loader = UCR_UEA_datasets()
datasets = [
    ('SyntheticControl', 16),
    ('GunPoint', 64),
    ('FaceFour', 128),
    ('Lightning2', 256),
    ('Lightning7', 128),
    ('ECG200', 32),
    ('Plane', 64),
    ('Car', 256),
    ('Beef', 128),
    ('Coffee', 128),
    ('OliveOil', 256)
]

# We will compare the accuracies & execution times of 1-NN using:
# (i) MINDIST on SAX representations, and
# (ii) Euclidean distance on raw values
knn_sax = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric='sax')
knn_eucl = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric='euclidean')

accuracies = {}
times = {}
for dataset, w in datasets:
    X_train, y_train, X_test, y_test = data_loader.load_dataset(dataset)

    # Rescale each time series to zero mean and unit variance. Scaling is
    # done independently for every series, so the test set can be
    # transformed on its own.
    ts_scaler = TimeSeriesScalerMeanVariance()
    X_train = ts_scaler.fit_transform(X_train)
    X_test = ts_scaler.fit_transform(X_test)

    # Fit 1-NN using SAX representation & MINDIST
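    # metric_params is forwarded to the SAX transform: 'n_segments' is the
    # word length w chosen per dataset above, 'alphabet_size_avg' the number
    # of symbols used to discretize each segment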
    metric_params = {'n_segments': w, 'alphabet_size_avg': 10}
    knn_sax = clone(knn_sax).set_params(metric_params=metric_params)
    start = time.time()
    knn_sax.fit(X_train, y_train)
    acc_sax = accuracy_score(y_test, knn_sax.predict(X_test))
    time_sax = time.time() - start

    # Fit 1-NN using Euclidean distance on raw values
    start = time.time()
    knn_eucl.fit(X_train, y_train)
    acc_euclidean = accuracy_score(y_test, knn_eucl.predict(X_test))
    time_euclidean = time.time() - start

    accuracies[dataset] = (acc_sax, acc_euclidean)
    times[dataset] = (time_sax, time_euclidean)

print_table(accuracies, times)

Total running time of the script: (0 minutes 58.611 seconds)
