Minimax Linkage

There is no minimax linkage implemented in scipy.cluster.hierarchy as of Feb 2021. There is a standalone package pyprotoclust, possibly the only python package available for minimax linkage, but a simple pip install doesn’t always work.

Below is a brute force implementation.

There is a nearest neighbor chain algorithm which is used in scipy for complete linkage. Minimax linkage can be implemented in the same way.

The update rule of minimax linkage is

\begin{align*} d(G_1 \cup G_2, H) = \min_{i\in G_1\cup G_2 \cup H} \max_{j\in G_1\cup G_2 \cup H} d_{ij}. \end{align*}

Brute Force

[1]:

%%time

import math
import numpy as np
import matplotlib.pyplot as plt
from itertools import combinations
from pandas import DataFrame
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist

def minimax_linkage(dist):
    n = int((np.sqrt(8*len(dist) + 1) + 1)/2)

    def d(i, j): return dist[n*i+j-((i+2)*(i+1))//2] if i<j else (0 if i==j else d(j, i))
    def r(i, G): return max(d(i, j) for j in G)

    Z = []
    clusters = {i: set([i]) for i in range(n)}
    for i in range(n-1):
        min_d = math.inf
        for (idxG, G), (idxH, H) in combinations(clusters.items(), 2):
            x, dminimax = min(((x, r(x, G|H)) for x in G|H), key=lambda pt_max_d: pt_max_d[1])
            if dminimax < min_d:
                min_d = dminimax
                to_merge = [idxG, idxH, dminimax, len(G|H), x]
        Z.append(to_merge)
        idxG, idxH, _, _, _ = to_merge
        clusters[n+i] = clusters.pop(idxG) | clusters.pop(idxH)

    return np.array(Z)

X = [[0, 0], [0, 1], [1, 0], [0, 4], [0, 3], [1, 4], [4, 0], [3, 0], [4, 1], [4, 4], [3, 4], [4, 3]]
Z = minimax_linkage(pdist(X))
# Z = hierarchy.complete(pdist(X))

fig, ax = plt.subplots(1, 1, figsize=(5, 8))
hierarchy.dendrogram(Z[:, :4], ax=ax, orientation='left')
ax.set(title='Dendrogram with Minimax Linkage')
plt.show()

DataFrame(Z, columns=['x', 'y', 'dist', 'num_pts', 'prototype']).astype({'x': int, 'y': int, 'num_pts': int, 'prototype': int})

CPU times: user 792 ms, sys: 194 ms, total: 986 ms
Wall time: 999 ms

[1]:

	x	y	dist	num_pts	prototype
0	0	1	1.000000	2	0
1	2	12	1.000000	3	0
2	3	4	1.000000	2	3
3	5	14	1.000000	3	3
4	6	7	1.000000	2	6
5	8	16	1.000000	3	6
6	9	10	1.000000	2	9
7	11	18	1.000000	3	9
8	13	15	3.162278	6	1
9	17	19	3.162278	6	8
10	20	21	5.000000	12	1

Nearest-Neighbor Chain Algorithm 

Part of the code from SciPy.

[2]:

import math
import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist


class LinkageUnionFind:
    """Structure for fast cluster labeling in unsorted dendrogram."""
#     cdef int[:] parent
#     cdef int[:] size
#     cdef int next_label

    def __init__(self, n):
        self.parent = np.arange(2 * n - 1)
        self.next_label = n
        self.size = np.ones(2 * n - 1)

    def merge(self, x, y):
        self.parent[x] = self.next_label
        self.parent[y] = self.next_label
        size = self.size[x] + self.size[y]
        self.size[self.next_label] = size
        self.next_label += 1
        return size

    def find(self, x):
        p = x

        while self.parent[x] != x:
            x = self.parent[x]

        while self.parent[p] != x:
            p, self.parent[p] = self.parent[p], x

        return x


def label(Z, n):
    """Correctly label clusters in unsorted dendrogram."""
    uf = LinkageUnionFind(n)
    for i in range(n - 1):
        x, y = int(Z[i, 0]), int(Z[i, 1])
        x_root, y_root = uf.find(x), uf.find(y)
        if x_root < y_root:
            Z[i, 0], Z[i, 1] = x_root, y_root
        else:
            Z[i, 0], Z[i, 1] = y_root, x_root
        Z[i, 3] = uf.merge(x_root, y_root)



def condensed_index(n, i, j):
    """
    Calculate the condensed index of element (i, j) in an n x n condensed
    matrix.
    """
    if i < j:
        return int(round(n * i - (i * (i + 1) / 2) + (j - i - 1)))
    elif i > j:
        return int(round(n * j - (j * (j + 1) / 2) + (i - j - 1)))


# def nn_chain(dists, n, method):
def minimax(dists):
    """Perform hierarchy clustering using nearest-neighbor chain algorithm.
    Parameters
    ----------
    dists : ndarray
        A condensed matrix stores the pairwise distances of the observations.
    Returns
    -------
    Z : ndarray, shape (n - 1, 4)
        Computed linkage matrix.
    """
    n = int((np.sqrt(8*len(dists) + 1) + 1)/2)

    Z_arr = np.empty((n - 1, 5))
    Z = Z_arr

    D = dists.copy()  # Distances between clusters.
    size = np.ones(n, dtype=np.intc)  # Sizes of clusters.

    indices = [set([i]) for i in range(n)]

#     new_dist = linkage_methods[method]

    # Variables to store neighbors chain.
    cluster_chain = np.ndarray(n, dtype=np.intc)
    chain_length = 0

#     cdef int i, j, k, x, y, nx, ny, ni
#     cdef double dist, current_min

    for k in range(n - 1):
        if chain_length == 0:
            chain_length = 1
            for i in range(n):
                if size[i] > 0:
                    cluster_chain[0] = i
                    break

        # Go through chain of neighbors until two mutual neighbors are found.
        while True:
            x = cluster_chain[chain_length - 1]

            # We want to prefer the previous element in the chain as the
            # minimum, to avoid potentially going in cycles.
            if chain_length > 1:
                y = cluster_chain[chain_length - 2]
                current_min = D[condensed_index(n, x, y)]
            else:
                current_min = np.inf   # NPY_INFINITYF

            for i in range(n):
                if size[i] == 0 or x == i:
                    continue

                dist = D[condensed_index(n, x, i)]
                if dist < current_min:
                    current_min = dist
                    y = i

            if chain_length > 1 and y == cluster_chain[chain_length - 2]:
                break

            cluster_chain[chain_length] = y
            chain_length += 1

        # Merge clusters x and y and pop them from stack.
        chain_length -= 2

        # This is a convention used in fastcluster.
        if x > y:
            x, y = y, x

        # get the original numbers of points in clusters x and y
        nx = size[x]
        ny = size[y]

        # merge x and y. Cluster x will be dropped, and y will be replaced with the new cluster
        indices[y] |= indices[x]
        indices[x] = set()

        prototype, minmax = min(((j, max(dists[condensed_index(n, j, k)] if j!=k else 0 for k in indices[y])) for j in indices[y]), key=lambda pt_max: pt_max[1])

        # Record the new node.
        Z[k, 0] = x
        Z[k, 1] = y
        Z[k, 2] = current_min
        Z[k, 3] = nx + ny
        Z[k, 4] = prototype
        size[x] = 0  # Cluster x will be dropped.
        size[y] = nx + ny  # Cluster y will be replaced with the new cluster

        # Update the distance matrix.
        for i in range(n):
            ni = size[i]
            if ni == 0 or i == y:
                continue

            # D[condensed_index(n, i, y)] = max(D[condensed_index(n, i, x)], D[condensed_index(n, i, y)])  # complete linkage

            all_indices = indices[y] | indices[i]
            D[condensed_index(n, i, y)] = min(max(dists[condensed_index(n, j, k)] if j!=k else 0 for k in all_indices) for j in all_indices)  # minimax linkage, 實測先抄到 np.array 再取 minmax 反而更慢

            # 要 implement minimax 需要知道 x y 裡各有哪些原本資料點的 index，
            # 原本的 distance matrix 要從 dists 裡拿，因為 D 裡的值會一直被覆蓋過去

    # Sort Z by cluster distances.
    order = np.argsort(Z_arr[:, 2], kind='mergesort')
    Z_arr = Z_arr[order]

    # Find correct cluster labels inplace.
    label(Z_arr, n)

    return Z_arr


X = [[0, 0], [0, 1], [1, 0], [0, 4], [0, 3], [1, 4], [4, 0], [3, 0], [4, 1], [4, 4], [3, 4], [4, 3]]
Z = minimax(pdist(X))

fig, ax = plt.subplots(1, 1, figsize=(5, 8))
hierarchy.dendrogram(Z[:, :4], ax=ax, orientation='left')
ax.set(title='Dendrogram with Complete Linkage')
plt.show()

DataFrame(Z, columns=['x', 'y', 'dist', 'num_pts', 'prototype']).astype({'x': int, 'y': int, 'num_pts': int, 'prototype': int})

[2]:

	x	y	dist	num_pts	prototype
0	0	1	1.000000	2	0
1	2	12	1.000000	3	0
2	3	4	1.000000	2	3
3	6	7	1.000000	2	6
4	5	14	1.000000	3	3
5	8	15	1.000000	3	6
6	9	10	1.000000	2	9
7	11	18	1.000000	3	9
8	13	16	3.162278	6	1
9	17	19	3.162278	6	8
10	20	21	5.000000	12	1

Performance

用舊的 code 跑出來的（不計算 prototype）

[11]:

%%timeit

# using np.max and np.min

import numpy as np

n = 100
np.random.seed(0)
X = np.random.rand(n, 2)
Z = minimax(pdist(X))

874 ms ± 92.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

[16]:

%%timeit

# using python built-in

import numpy as np

n = 100

np.random.seed(0)
X = np.random.rand(n, 2)
Z = minimax(pdist(X))

504 ms ± 23.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

[2]:

%%time

import time
import numpy as np
from pandas import DataFrame

np.random.seed(0)
n_pts = np.arange(100, 501, 20)
elapsed_time = []

def test_run(n):
    X = np.random.rand(n, 2)
    Z = minimax(pdist(X))

for n in n_pts:
    t = time.process_time()
    test_run(n)
    elapsed_time.append(time.process_time() - t)

df = DataFrame({'n': n_pts, 'time': elapsed_time}).set_index('n')
df.plot(style='.-')
pass

CPU times: user 2min 32s, sys: 127 ms, total: 2min 32s
Wall time: 2min 33s

[3]:

df

[3]:

	time
n
100	0.618942
120	0.868634
140	1.165988
160	1.584192
180	2.195476
200	2.550525
220	3.055468
240	3.760503
260	4.438697
280	5.170326
300	6.014018
320	7.545441
340	7.918529
360	9.657819
380	10.111645
400	10.983310
420	12.148820
440	13.400139
460	15.084997
480	16.067308
500	17.771972

[5]:

from pandas import DataFrame

n = len(X)

DataFrame([[condensed_index(12, i, j) if i>j else '' for j in range(12)] for i in range(12)])

[5]:

	0	1	2	3	4	5	6	7	8	9	10
0
1	0
2	1	11
3	2	12	21
4	3	13	22	30
5	4	14	23	31	38
6	5	15	24	32	39	45
7	6	16	25	33	40	46	51
8	7	17	26	34	41	47	52	56
9	8	18	27	35	42	48	53	57	60
10	9	19	28	36	43	49	54	58	61	63
11	10	20	29	37	44	50	55	59	62	64	65

Demonstration

顏色會亂跳

[11]:

from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import complete, single, dendrogram, fcluster, median
from pandas import DataFrame
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
from ipywidgets import interact

data = [[0, 0], [0, 1], [1, 0], [0, 4], [0, 3], [1, 4], [4, 0], [3, 0], [4, 1], [4, 4], [3, 4], [4, 3]]
dist = pdist(data)

Z = complete(dist)
df = DataFrame(data, columns=['x', 'y'])

@interact(cut=(0, 5, 0.2))
def f(cut=1.8):
    df['hue'] = [cm.tab20(idx) for idx in fcluster(Z, t=cut, criterion='distance')]
    fx, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 5))

    dendrogram(Z, ax=ax1, link_color_func=lambda _: 'black')
    ax1.axhline(y=cut, c='r')

    sns.scatterplot(x='x', y='y', data=df, ax=ax2, hue='hue', legend=None, s=200)
    ax2.set(aspect=1, xlim=(-1, 5), ylim=(-1, 5))
    for i, (x, y) in enumerate(data):
        ax2.text(x+0.2, y+0.2, i)

    plt.show()

[26]:

pip install pyminimax

Requirement already satisfied: pyminimax in /srv/conda/envs/notebook/lib/python3.7/site-packages (0.1.2)
Requirement already satisfied: numpy>=1.6.5 in /srv/conda/envs/notebook/lib/python3.7/site-packages (from pyminimax) (1.21.1)
Requirement already satisfied: scipy>=1.6.0 in /srv/conda/envs/notebook/lib/python3.7/site-packages (from pyminimax) (1.7.1)
Note: you may need to restart the kernel to use updated packages.

[12]:

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas import DataFrame
from pyminimax import minimax, fcluster_prototype
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import dendrogram
from ipywidgets import interact
from matplotlib import cm

np.random.seed(0)
X = np.random.rand(20, 2)

Z = minimax(pdist(X), return_prototype=True)

plt.figure(figsize=(10, 4))
dendrogram(Z[:, :4])
plt.show()

max_dist = max(Z[:, 2])

@interact(cut=(0, max_dist+0.1, 0.05))
def f(cut=0.35):
    clust_proto = fcluster_prototype(Z, t=cut, criterion='distance')
    protos = np.unique(clust_proto, axis=0).T[1]

    fig = plt.figure(figsize=(6, 6))
    df = DataFrame(np.concatenate([X, clust_proto], axis=1), columns=['x', 'y', 'clust', 'proto'])
    df['hue'] = df['clust'].map(int).map(cm.tab20)
#     ax = sns.scatterplot(data=df, x='x', y='y', hue='hue', legend=None)
    ax = sns.scatterplot(data=df, x='x', y='y', hue='clust', legend=None)
    ax.set(xlim=(-0.5, 1.5), ylim=(-0.5, 1.5), aspect=1)

    for proto in protos:
        circle = plt.Circle(X[proto], cut, edgecolor='g', facecolor='none', clip_on=False)
        ax.add_patch(circle)
    plt.show()

f(cut=0.3)
f(cut=0.35)

[12]:

DataFrame(Z)

[12]:

	0	1	2	3	4
0	1.0	18.0	0.072653	2.0	1.0
1	2.0	16.0	0.084000	2.0	16.0
2	12.0	17.0	0.101950	2.0	17.0
3	0.0	11.0	0.109071	2.0	0.0
4	3.0	23.0	0.113781	3.0	11.0
5	10.0	19.0	0.122410	2.0	10.0
6	14.0	20.0	0.153313	3.0	1.0
7	8.0	13.0	0.166485	2.0	8.0
8	21.0	26.0	0.167219	5.0	16.0
9	6.0	24.0	0.180002	4.0	11.0
10	15.0	29.0	0.197024	5.0	11.0
11	5.0	28.0	0.205629	6.0	1.0
12	9.0	25.0	0.212615	3.0	10.0
13	22.0	27.0	0.216212	4.0	8.0
14	4.0	32.0	0.299043	4.0	19.0
15	30.0	31.0	0.306124	11.0	0.0
16	34.0	35.0	0.422040	15.0	18.0
17	7.0	33.0	0.533073	5.0	17.0
18	36.0	37.0	0.616415	20.0	16.0

[16]:

DataFrame(X)

[16]:

	0	1
0	0.548814	0.715189
1	0.602763	0.544883
2	0.423655	0.645894
3	0.437587	0.891773
4	0.963663	0.383442
5	0.791725	0.528895
6	0.568045	0.925597
7	0.071036	0.087129
8	0.020218	0.832620
9	0.778157	0.870012
10	0.978618	0.799159
11	0.461479	0.780529
12	0.118274	0.639921
13	0.143353	0.944669
14	0.521848	0.414662
15	0.264556	0.774234
16	0.456150	0.568434
17	0.018790	0.617635
18	0.612096	0.616934
19	0.943748	0.681820

[31]:

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas import DataFrame
from pyminimax import minimax, fcluster_prototype
from scipy.spatial.distance import pdist

np.random.seed(0)
X = np.random.rand(20, 2)
Z = minimax(pdist(X), return_prototype=True)

cuts = [0.1, 0.25, 0.3, 0.35, 0.6, 0.7]

fig, axs = plt.subplots(3, 2, figsize=(10, 15))

for ax, cut in zip(axs.ravel(), cuts):
    clust_proto = fcluster_prototype(Z, t=cut, criterion='distance')

    df = DataFrame(np.concatenate([X, clust_proto], axis=1), columns=['x', 'y', 'clust', 'proto'])
    sns.scatterplot(data=df, x='x', y='y', hue='clust', legend=None, ax=ax)

    ax.set(xlim=(-0.5, 1.5), ylim=(-0.5, 1.5), aspect=1, title=f'Threshold {cut}')
    protos = np.unique(df['proto'].map(int).values)
    for proto in protos:
        circle = plt.Circle(X[proto], cut, edgecolor='g', facecolor='none', clip_on=False)
        ax.add_patch(circle)

fig.tight_layout()
plt.show()

[1]:

import matplotlib.pyplot as plt
from sklearn import datasets

digits = datasets.load_digits()

_, axes = plt.subplots(nrows=1, ncols=4, figsize=(10, 3))
for ax, image, label in zip(axes, digits.images, digits.target):        # 原來 zip 裡各 list 長度可以不一致！for loop 應該是取最短的
    ax.set_axis_off()
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    ax.set_title('Training: %i' % label)

[2]:

from pyminimax import minimax, fcluster_prototype
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import dendrogram
from sklearn import datasets
import matplotlib.pyplot as plt

df = datasets.load_digits(as_frame=True)['frame']

df147 = df[(df['target']==1) | (df['target']==4) | (df['target']==7)]
X = df147.drop('target', axis=1).values

[5]:

%%time

Z = minimax(pdist(X), return_prototype=True)

plt.figure(figsize=(10, 4))
dendrogram(Z[:, :4])
plt.show()

CPU times: user 35.3 s, sys: 899 ms, total: 36.2 s
Wall time: 36.5 s

[9]:

from pandas import DataFrame

DataFrame(Z[-3:, :], columns=['x', 'y', 'distance', 'n_pts', 'prototype'])

[9]:

	x	y	distance	n_pts	prototype
0	1072.0	1078.0	50.338852	182.0	135.0
1	1077.0	1080.0	54.488531	360.0	137.0
2	1079.0	1081.0	57.122675	542.0	22.0

[10]:

clust, proto = fcluster_prototype(Z, t=52, criterion='distance').T

[11]:

res = df147.assign(clust=clust, proto=proto)
df = res[['target', 'clust', 'proto']].sort_values(by='target')
df

[11]:

	target	clust	proto
1	1	3	135
517	1	3	135
527	1	3	135
537	1	3	135
1264	1	3	135
...	...	...	...
1381	7	1	341
429	7	1	341
1046	7	1	341
472	7	1	341
467	7	1	341

542 rows × 3 columns

[12]:

import seaborn as sns

sns.heatmap(df[['target', 'clust']])

[12]:

<AxesSubplot:>

[13]:

import numpy as np

protos = np.unique(df['proto'])
protos

[13]:

array([135, 341, 464], dtype=int32)

[61]:

fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(10, 3))

for ax, proto in zip(axs, protos):
    ax.set_axis_off()
    image = df147.drop('target', axis=1).iloc[proto].values.reshape(8, 8)
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')

Minimax Linkage

Brute Force

Nearest-Neighbor Chain Algorithm

Performance

Demonstration

Nearest-Neighbor Chain Algorithm 