!pip install dtaidistance
!pip install cesium
Successfully installed cesium-0.9.12 gatspy-0.3
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 10]

from math import sqrt

from datetime import datetime
import pandas as pd
import numpy as np
import pdb


from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform

from sklearn.metrics.pairwise import pairwise_distances
from sklearn import preprocessing
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import homogeneity_score, completeness_score
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics.cluster import homogeneity_score

from dtaidistance import dtw

from collections import Counter

from scipy.stats import pearsonr

The data

words = pd.read_csv('https://raw.githubusercontent.com/AileenNielsen/TimeSeriesAnalysisWithPython/master/data/50words_TEST.csv',
                    header = None)
words.rename(columns = {0:'word'}, inplace = True) 
words.head()
   word        1        2        3  ...      269      270
0     4 -0.89094 -0.86099 -0.82438  ... -0.93018 -0.93939
1    12 -0.78346 -0.68562 -0.58409  ... -0.98050 -0.99178
2    13 -1.32560 -1.28430 -1.21970  ... -1.19800 -1.27000
3    23 -1.09370 -1.04200 -0.99840  ... -1.05090 -1.12830
4     4 -0.90138 -0.85228 -0.80196  ... -1.04180 -1.04030

5 rows × 271 columns


words.word[1]
12
plt.subplot(3, 2, 1)
plt.plot(words.iloc[1, 1:-1])
plt.title("Sample Projection Word " + str(words.word[1]), fontweight = 'bold', y = 0.8, fontsize = 14)
plt.subplot(3, 2, 2)
plt.hist(words.iloc[1, 1:-1], 10)
plt.title("Histogram of Projection Word " + str(words.word[1]), fontweight = 'bold', y = 0.8, fontsize = 14)
plt.subplot(3, 2, 3)
plt.plot(words.iloc[3, 1:-1])
plt.title("Sample Projection Word " + str(words.word[3]), fontweight = 'bold', y = 0.8, fontsize = 14)
plt.subplot(3, 2, 4)
plt.hist(words.iloc[3, 1:-1], 10)
plt.title("Histogram of Projection Word " + str(words.word[3]), fontweight = 'bold', y = 0.8, fontsize = 14)
plt.subplot(3, 2, 5)
plt.plot(words.iloc[5, 1:-1])
plt.title("Sample Projection Word " + str(words.word[5]), fontweight = 'bold', y = 0.8, fontsize = 14)
plt.subplot(3, 2, 6)
plt.hist(words.iloc[5, 1:-1], 10)
plt.title("Histogram of Projection Word " + str(words.word[5]), fontweight = 'bold', y = 0.8, fontsize = 14)
plt.suptitle("Sample word projections and histograms of the projections", fontsize = 18)
Text(0.5, 0.98, 'Sample word projections and histograms of the projections')
## We can also consider the 2d histogram of a word
x = np.array([])
y = np.array([])

w = 23
selected_words = words[words.word == w]
selected_words.shape

for idx, row in selected_words.iterrows():
    y = np.hstack([y, row[1:271]])
    x = np.hstack([x, np.array(range(270))])
    
fig, ax = plt.subplots()
hist = ax.hist2d(x, y, bins = 50)
plt.xlabel("Time", fontsize = 18)
plt.ylabel("Value", fontsize = 18)
Text(0, 0.5, 'Value')

Generate some features

words.shape
(455, 271)
words_features = words.iloc[:, 1:271]

Create some features from original time series

times  = []
values = []
for idx, row in words_features.iterrows():
    values.append(row.values)
    times.append(np.array([i for i in range(row.values.shape[0])]))
len(values)
455
from cesium import featurize

features_to_use = ["amplitude",
                   "percent_beyond_1_std",
                   "percent_close_to_median",
                   ]
featurized_words = featurize.featurize_time_series(times=times,
                                              values=values,
                                              errors=None,
                                              features_to_use=features_to_use,
                                              scheduler = None)
## cesium returns MultiIndex (feature, channel) columns; round-trip through
## CSV and drop the channel level to flatten them
featurized_words.to_csv('./featurized_words.csv')
featurized_words = pd.read_csv("./featurized_words.csv", header =  [0, 1])
featurized_words.columns = featurized_words.columns.droplevel(-1)
featurized_words.head()
feature amplitude percent_beyond_1_std percent_close_to_median
0 0 1.674555 0.188889 0.451852
1 1 1.990520 0.118519 0.259259
2 2 2.903650 0.114815 0.637037
3 3 2.515050 0.211111 0.562963
4 4 1.966150 0.181481 0.533333
featurized_words.shape
(455, 4)
plt.hist(featurized_words.percent_beyond_1_std)
(array([ 48., 146., 100.,  45.,  41.,  29.,  23.,  12.,   8.,   3.]),
 array([0.05925926, 0.10666667, 0.15407407, 0.20148148, 0.24888889,
        0.2962963 , 0.3437037 , 0.39111111, 0.43851852, 0.48592593,
        0.53333333]),
 <a list of 10 Patch objects>)

Create some features from histogram

times = []
values = []
for idx, row in words_features.iterrows():
    values.append(np.histogram(row.values, bins=10, range=(-2.5, 5.0))[0] + .0001) ## cesium seems not to handle 0s
    times.append(np.array([i for i in range(10)])) ## one time step per histogram bin
features_to_use = ["amplitude",
                   "percent_close_to_median",
                  "skew"
                  ]
featurized_hists = featurize.featurize_time_series(times=times,
                                              values=values,
                                              errors=None,
                                              features_to_use=features_to_use,
                                              scheduler = None)
featurized_hists.to_csv('./featurized_hists.csv')
featurized_hists = pd.read_csv("./featurized_hists.csv", header = [0, 1])
featurized_hists.columns = featurized_hists.columns.droplevel(-1)
featurized_hists.head()
feature amplitude percent_close_to_median skew
0 0 88.0 0.444444 2.262655
1 1 61.0 0.666667 1.285343
2 2 70.0 0.666667 1.683031
3 3 67.0 0.555556 1.724109
4 4 75.0 0.777778 1.902513
features = pd.concat([featurized_words.reset_index(drop=True), featurized_hists], axis=1)
features.head()
feature amplitude percent_beyond_1_std percent_close_to_median feature amplitude percent_close_to_median skew
0 0 1.674555 0.188889 0.451852 0 88.0 0.444444 2.262655
1 1 1.990520 0.118519 0.259259 1 61.0 0.666667 1.285343
2 2 2.903650 0.114815 0.637037 2 70.0 0.666667 1.683031
3 3 2.515050 0.211111 0.562963 3 67.0 0.555556 1.724109
4 4 1.966150 0.181481 0.533333 4 75.0 0.777778 1.902513
words.shape
(455, 271)
## we also add a hand-built feature, the location of the peak, to account more for shape
feats = np.zeros( (words.shape[0], 1), dtype = np.float32)
for i in range(words.shape[0]):
    vals = words.iloc[i, 1:271].values
    feats[i, 0] = np.argmax(vals) ## index of the (first) maximum value
feats.shape
(455, 1)
features.shape
(455, 8)
features['peak_location'] = feats
features.head()
feature amplitude percent_beyond_1_std percent_close_to_median feature amplitude percent_close_to_median skew peak_location
0 0 1.674555 0.188889 0.451852 0 88.0 0.444444 2.262655 186.0
1 1 1.990520 0.118519 0.259259 1 61.0 0.666667 1.285343 93.0
2 2 2.903650 0.114815 0.637037 2 70.0 0.666667 1.683031 69.0
3 3 2.515050 0.211111 0.562963 3 67.0 0.555556 1.724109 235.0
4 4 1.966150 0.181481 0.533333 4 75.0 0.777778 1.902513 174.0
## scale the six numeric feature columns, skipping the two 'feature' index columns
feature_values = preprocessing.scale(features.iloc[:, [1, 2, 3, 5, 6, 7]])
clustering = AgglomerativeClustering(n_clusters=50, linkage='ward')
clustering.fit(feature_values)
words['feature_label'] = clustering.labels_
words['feature_label'] = words.feature_label.astype('category')
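As an aside (added here, not part of the original analysis), the linkage and dendrogram imports near the top can visualize the hierarchy behind this same ward clustering; the plot parameters below are illustrative:

Z = linkage(feature_values, method='ward')
plt.figure()
dendrogram(Z, truncate_mode='lastp', p=50, no_labels=True) ## collapse to the final 50 merges
plt.title("Ward linkage dendrogram of the scaled features")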
## how many distinct cluster labels each word maps to
results = words.groupby('word')['feature_label'].agg(num_clustering_labels=lambda x: len(set(x)),
                                                    num_word_samples=lambda x: len(x),
                                                    most_common_label=lambda x: Counter(x).most_common(1)[0][0])
results.head()
num_clustering_labels num_word_samples most_common_label
word
1 19 57 38
2 19 42 29
3 16 28 37
4 17 34 17
5 13 25 8
## and conversely, how many distinct words fall in each feature label
results_feats = words.groupby('feature_label')['word'].agg(num_words=lambda x: len(set(x)),
                                                           num_feat_samples=lambda x: len(x),
                                                           most_common_word=lambda x: Counter(x).most_common(1)[0][0])
## note below that cluster 38's most common word is word 1, matching the table above
results_feats
num_words num_feat_samples most_common_word
feature_label
0 9 16 2
1 5 5 13
2 7 8 16
3 8 9 1
4 10 20 1
5 4 8 2
6 5 6 15
7 6 6 7
8 6 17 5
9 10 21 1
10 10 13 3
11 7 13 6
12 6 9 1
13 7 9 2
14 3 5 4
15 11 13 6
16 7 14 12
17 8 18 4
18 6 8 39
19 5 6 15
20 6 9 7
21 7 11 1
22 7 10 20
23 5 9 12
24 5 8 13
25 1 2 4
26 8 21 1
27 6 6 9
28 6 6 8
29 4 9 2
30 4 8 2
31 7 8 19
32 3 4 26
33 7 7 7
34 4 4 3
35 4 6 15
36 6 10 3
37 7 12 3
38 8 19 1
39 6 6 22
40 1 2 20
41 2 2 23
42 2 3 2
43 4 8 1
44 4 5 33
45 4 4 3
46 7 10 11
47 6 13 14
48 3 6 2
49 3 3 9
## see definitions in the user manual: https://scikit-learn.org/stable/modules/clustering.html#homogeneity-completeness
homogeneity_score(words.word, words.feature_label)
0.5085776345414814
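Completeness, imported above, is the complementary score and is worth checking alongside homogeneity; a quick additional check (not run in the original, so no output is shown):

completeness_score(words.word, words.feature_label)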

Dynamic Time Warping Distance Definition
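Dynamic time warping compares two series by warping the time axis rather than matching points index by index: it fills a cumulative-cost matrix D with D[i, j] = (ts1[i] - ts2[j])**2 + min(D[i-1, j], D[i, j-1], D[i-1, j-1]) and reports sqrt(D[-1, -1]) as the distance, so similar shapes that are shifted or stretched in time still come out close. This is the recurrence implemented in the exercise further below.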

ts1 = np.sin(np.linspace(1, 10))
ts2 = np.sin(2 * np.linspace(1, 10))
ts3 = np.zeros((50,)) 
plt.plot(ts1)
plt.plot(ts2)
plt.plot(ts3)
[<matplotlib.lines.Line2D at 0x7f94eea30410>]

Exercise: calculate the Euclidean distance between respective pairs of time series from the 3 time series above

np.sqrt(np.sum(np.square(ts1 - ts2)))
7.3897194681883756
np.sqrt(np.sum(np.square(ts1 - ts3)))
4.999710697636168
np.sqrt(np.sum(np.square(ts2 - ts3)))
4.935018175874873
np.linspace(1,10).shape
(50,)
np.random.seed(215202)
ts3_noise = np.random.random(ts3.shape)
ts3 = np.zeros((50,)) + ts3_noise ## replace the flat line with uniform noise
pearsonr(ts1, ts2)
(-0.10087714894729656, 0.485772279067736)
pearsonr(ts1, ts3)
(0.19217749375608117, 0.1812125015007657)
pearsonr(ts2, ts3 + np.random.random(ts3.shape))
(0.18666336432272013, 0.19429849424053683)
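Pearson correlation compares series point by point and ignores alignment along the time axis; for instance (an added illustration, not in the original), a series can correlate poorly even with a time-shifted copy of itself:

pearsonr(ts1, np.roll(ts1, 10))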

Exercise: use what we discussed about dynamic programming to code a DTW function

X = words.iloc[:, 1:271].values
def distDTW(ts1, ts2):
    ## cumulative-cost matrix, with an extra row/column of boundary conditions
    DTW       = np.full((len(ts1) + 1, len(ts2) + 1), 0, dtype = np.float32)
    DTW[:, 0] = np.inf
    DTW[0, :] = np.inf
    DTW[0, 0] = 0

    for i in range(1, len(ts1) + 1):
        for j in range(1, len(ts2) + 1):
            idx1 = i - 1 
            idx2 = j - 1
            
            ## pointwise cost plus the cheapest preceding alignment
            dist               = (ts1[idx1] - ts2[idx2])**2
            min_preceding_dist = min(DTW[i-1, j], DTW[i, j-1], DTW[i-1, j-1])

            DTW[i, j] = dist + min_preceding_dist

    return sqrt(DTW[len(ts1), len(ts2)])
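A quick sanity check (an added aside, not part of the original exercise): a series aligned with itself follows the zero-cost diagonal path, so its self-distance should be exactly 0. Note that the double loop makes each call O(len(ts1) * len(ts2)).

assert distDTW(ts1, ts1) == 0.0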

Exercise: does DTW fix the problem noted above, where the two sine curves were farther from each other in Euclidean distance than either was from the straight line?

distDTW(ts1, ts2)
3.7609101849567645
distDTW(ts1, ts3)
3.718500850249092
distDTW(ts2, ts3)
4.163253276127519
distDTW(X[0], X[1])
7.777960024318239
## compare against the optimized implementation: https://github.com/wannesm/dtaidistance
dtw.distance(X[0], X[1])
7.777960164340302
p = pairwise_distances(X, metric = distDTW)
with open("pairwise_word_distances.npy", "wb") as f:
    np.save(f, p)
p = np.load("pairwise_word_distances.npy")
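Computing p with a pure-Python metric means n * (n - 1) / 2 calls to distDTW and takes a while; dtaidistance also ships an optimized distance-matrix routine. A minimal sketch, assuming dtw.distance_matrix_fast is available in the installed version (some versions fill only one triangle, leaving the rest at inf, hence the symmetrization):

series = [np.array(row, dtype=np.double) for row in X]
p_fast = dtw.distance_matrix_fast(series)
p_fast = np.minimum(p_fast, p_fast.T) ## mirror the computed triangle
np.fill_diagonal(p_fast, 0)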

Exercise: Try clustering based on dynamic time warping distances

## We will use hierarchical clustering as a distance-agnostic methodology;
## 'average' linkage works with non-Euclidean distance matrices, unlike 'ward'
clustering = AgglomerativeClustering(linkage='average', n_clusters=50, affinity = 'precomputed') 
labels = clustering.fit_predict(p)
len(words.word)
455
len(labels)
455

Exercise: How did the clustering perform?

print(homogeneity_score(words.word, labels))
print(completeness_score(words.word, labels))
0.6650054500077724
0.771348780383979
# quoting: https://scikit-learn.org/stable/modules/clustering.html#homogeneity-completeness
# homogeneity: each cluster contains only members of a single class.
# completeness: all members of a given class are assigned to the same cluster.
res = contingency_matrix(labels, words.word)
## note the difficulty of assessing this visually given the imbalanced dataset
plt.imshow(res)
<matplotlib.image.AxesImage at 0x7f94eeaa4790>
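Given the class imbalance, permutation-invariant summary scores are a useful complement to eyeballing the contingency matrix; a short addition (not run in the original, so no output is shown):

from sklearn.metrics import adjusted_rand_score, v_measure_score
print(adjusted_rand_score(words.word, labels))
print(v_measure_score(words.word, labels))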