Chapter 9: Machine Learning for Time Series (Clustering)
Covers cesium and other packages.
!pip install dtaidistance
!pip install cesium
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 10]
from math import sqrt
from datetime import datetime
import pandas as pd
import numpy as np
import pdb
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
from sklearn.metrics.pairwise import pairwise_distances
from sklearn import preprocessing
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import homogeneity_score, completeness_score
from sklearn.metrics.cluster import contingency_matrix
from dtaidistance import dtw
from collections import Counter
from scipy.stats import pearsonr
words = pd.read_csv('https://raw.githubusercontent.com/AileenNielsen/TimeSeriesAnalysisWithPython/master/data/50words_TEST.csv',
                    header = None)
words.rename(columns = {0:'word'}, inplace = True)
words.head()
words.word[1]
plt.subplot(3, 2, 1)
plt.plot(words.iloc[1, 1:-1])
plt.title("Sample Projection Word " + str(words.word[1]), fontweight = 'bold', y = 0.8, fontsize = 14)
plt.subplot(3, 2, 2)
plt.hist(words.iloc[1, 1:-1], 10)
plt.title("Histogram of Projection Word " + str(words.word[1]), fontweight = 'bold', y = 0.8, fontsize = 14)
plt.subplot(3, 2, 3)
plt.plot(words.iloc[3, 1:-1])
plt.title("Sample Projection Word " + str(words.word[3]), fontweight = 'bold', y = 0.8, fontsize = 14)
plt.subplot(3, 2, 4)
plt.hist(words.iloc[3, 1:-1], 10)
plt.title("Histogram of Projection Word " + str(words.word[3]), fontweight = 'bold', y = 0.8, fontsize = 14)
plt.subplot(3, 2, 5)
plt.plot(words.iloc[5, 1:-1])
plt.title("Sample Projection Word " + str(words.word[11]), fontweight = 'bold', y = 0.8, fontsize = 14)
plt.subplot(3, 2, 6)
plt.hist(words.iloc[5, 1:-1], 10)
plt.title("Histogram of Projection Word " + str(words.word[11]), fontweight = 'bold', y = 0.8, fontsize = 14)
plt.suptitle("Sample word projections and histograms of the projections", fontsize = 18)
## We can also consider the 2d histogram of a word
x = np.array([])
y = np.array([])
w = 23
selected_words = words[words.word == w]
selected_words.shape
for idx, row in selected_words.iterrows():
    y = np.hstack([y, row[1:271]])
    x = np.hstack([x, np.array(range(270))])
fig, ax = plt.subplots()
hist = ax.hist2d(x, y, bins = 50)
plt.xlabel("Time", fontsize = 18)
plt.ylabel("Value", fontsize = 18)
words.shape
words_features = words.iloc[:, 1:271]
times = []
values = []
for idx, row in words_features.iterrows():
    values.append(row.values)
    times.append(np.array([i for i in range(row.values.shape[0])]))
len(values)
from cesium import featurize
features_to_use = ["amplitude",
"percent_beyond_1_std",
"percent_close_to_median",
]
featurized_words = featurize.featurize_time_series(times=times,
values=values,
errors=None,
features_to_use=features_to_use,
scheduler = None)
featurized_words.to_csv('./featurized_words.csv')
featurized_words = pd.read_csv("./featurized_words.csv", header = [0, 1])
featurized_words.columns = featurized_words.columns.droplevel(-1)
featurized_words.head()
featurized_words.shape
plt.hist(featurized_words.percent_beyond_1_std)
times = []
values = []
for idx, row in words_features.iterrows():
    values.append(np.histogram(row.values, bins=10, range=(-2.5, 5.0))[0] + .0001) ## add a small constant; cesium seems not to handle all-zero bins
    times.append(np.array([i for i in range(10)])) ## 10 time indices to match the 10 histogram bins
features_to_use = ["amplitude",
"percent_close_to_median",
"skew"
]
featurized_hists = featurize.featurize_time_series(times=times,
values=values,
errors=None,
features_to_use=features_to_use,
scheduler = None)
featurized_hists.to_csv('./featurized_hists.csv')
featurized_hists = pd.read_csv("./featurized_hists.csv", header = [0, 1])
featurized_hists.columns = featurized_hists.columns.droplevel(-1)
featurized_hists.head()
features = pd.concat([featurized_words.reset_index(drop=True), featurized_hists], axis=1)
features.head()
words.shape
## we also add some of our own features again, to account more for shape
feats = np.zeros( (words.shape[0], 1), dtype = np.float32)
for i in range(words.shape[0]):
    vals = words.iloc[i, 1:271].values
    feats[i, 0] = np.where(vals == np.max(vals))[0][0]
feats.shape
features.shape
features['peak_location'] = feats
features.head()
feature_values = preprocessing.scale(features.iloc[:, [1, 2, 3, 5, 6, 7]])
clustering = AgglomerativeClustering(n_clusters=50, linkage='ward')
clustering.fit(feature_values)
words['feature_label'] = clustering.labels_
words['feature_label'] = words.feature_label.astype('category')
## how many distinct cluster labels each word was assigned
results = words.groupby('word')['feature_label'].agg(num_clustering_labels=lambda x: len(set(x)),
                                                     num_word_samples=lambda x: len(x),
                                                     most_common_label=lambda x: Counter(x).most_common(1)[0][0])
results.head()
## how many distinct words fall into each cluster
results_feats = words.groupby('feature_label')['word'].agg(num_words=lambda x: len(set(x)),
                                                           num_feat_samples=lambda x: len(x),
                                                           most_common_word=lambda x: Counter(x).most_common(1)[0][0])
results_feats
## note that word 1 = most common in cluster 38
homogeneity_score(words.word, words.feature_label)
## see definitions in user manual: https://scikit-learn.org/stable/modules/clustering.html#homogeneity-completeness
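## A toy illustration (not in the original notebook) of how the two scores differ:
## splitting one true class across two clusters keeps homogeneity perfect but lowers completeness.
toy_true = [0, 0, 1, 1]
toy_pred = [0, 1, 2, 2]  ## class 0 is split across clusters 0 and 1
print(homogeneity_score(toy_true, toy_pred))   ## 1.0: every cluster contains members of only one class
print(completeness_score(toy_true, toy_pred))  ## < 1.0: members of class 0 are spread over two clusters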
ts1 = np.sin(np.linspace(1, 10))
ts2 = np.sin(2 * np.linspace(1, 10))
ts3 = np.zeros((50,))
plt.plot(ts1)
plt.plot(ts2)
plt.plot(ts3)
np.sqrt(np.sum(np.square(ts1 - ts2)))
np.sqrt(np.sum(np.square(ts1 - ts3)))
np.sqrt(np.sum(np.square(ts2 - ts3)))
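## Note: by Euclidean distance the flat zero series is "closer" to ts1 than the
## frequency-doubled sine is, even though ts1 and ts2 share the same sinusoidal shape.
## This motivates shape-aware measures such as DTW below.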
np.linspace(1,10).shape
np.random.seed(215202)
ts3_noise = np.random.random(ts3.shape)
ts3 = np.zeros((50,))
ts3 = ts3 + ts3_noise
pearsonr(ts1, ts2)
pearsonr(ts1, ts3)
pearsonr(ts2, ts3 + np.random.random(ts3.shape))
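## Pearson correlation ignores overall scale and offset, but it compares the series
## point by point, so it cannot compensate for time shifts or warping; DTW below can.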
X = words.iloc[:, 1:271].values
def distDTW(ts1, ts2):
    ## dynamic-programming table: cell (i, j) holds the cost of the best warping
    ## path aligning the first i points of ts1 with the first j points of ts2
    DTW = np.full((len(ts1) + 1, len(ts2) + 1), 0, dtype = np.float32)
    DTW[:, 0] = np.inf
    DTW[0, :] = np.inf
    DTW[0, 0] = 0

    for i in range(1, len(ts1) + 1):
        for j in range(1, len(ts2) + 1):
            idx1 = i - 1
            idx2 = j - 1
            ## local squared difference plus the cheapest admissible predecessor
            dist = (ts1[idx1] - ts2[idx2])**2
            min_preceding_dist = min(DTW[i-1, j], DTW[i, j-1], DTW[i-1, j-1])
            DTW[i, j] = dist + min_preceding_dist

    return sqrt(DTW[len(ts1), len(ts2)])
distDTW(ts1, ts2)
distDTW(ts1, ts3)
distDTW(ts2, ts3)
distDTW(X[0], X[1])
dtw.distance(X[0], X[1])
## worth checking out: https://github.com/wannesm/dtaidistance
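## A minimal sketch (not from the original notebook) of the same pairwise computation
## with dtaidistance's built-in routine; distance_matrix_fast assumes the package's C
## extension is available (dtw.distance_matrix is the pure-Python fallback), and the
## series should be double-precision arrays. Depending on the library version the
## result may come back as an upper-triangular matrix.
series_list = [np.asarray(row, dtype=np.double) for row in X]
p_dtai = dtw.distance_matrix_fast(series_list)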
p = pairwise_distances(X, metric = distDTW)
with open("pairwise_word_distances.npy", "wb") as f:
np.save(f, p)
p = np.load("pairwise_word_distances.npy")
## We will use hierarchical clustering as a distance agnostic methodology
clustering = AgglomerativeClustering(linkage='average', n_clusters=50, affinity = 'precomputed')  ## in newer scikit-learn the 'affinity' argument is named 'metric'
## 'average' linkage is good for non Euclidean distance metrics
labels = clustering.fit_predict(p)
len(words.word)
len(labels)
print(homogeneity_score(words.word, labels))
print(completeness_score(words.word, labels))
# quoting: https://scikit-learn.org/stable/modules/clustering.html#homogeneity-completeness
# homogeneity: each cluster contains only members of a single class.
# completeness: all members of a given class are assigned to the same cluster.
res = contingency_matrix(labels, words.word)
## note difficulties in assessing this given imbalanced dataset
plt.imshow(res)
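## A small follow-up sketch (not in the original notebook): because the classes are
## imbalanced, normalizing each column (true word) to proportions makes the
## contingency matrix easier to read.
res_normalized = res / res.sum(axis=0, keepdims=True)
plt.figure()
plt.imshow(res_normalized)
plt.xlabel("True word")
plt.ylabel("Cluster label")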