"""Python Implementation of different k-Means Seeding Algorithms
Inspiration and Algorithm from
> Fast and Provably Good Seedings for k-Means
> Olivier Bachem, Mario Lucic, S. Hamed Hassani and Andreas Krause
> In Neural Information Processing Systems (NIPS), 2016.
Usage:
>>> import afkmc2
>>> X = np.array with d-dimensional data
>>> seeds = afkmc2.afkmc2(X, 3)
Use seeds in sklearn KMeans:
>>> from sklearn.cluster import KMeans
>>> model = KMeans(3, init=seeds).fit(X)
>>> print(model.cluster_centers_)
"""
import numpy as np
def kmpp(X, k):
    """
    | KMeans++ Seeding as described by Arthur and Vassilvitskii (2007)
    | Runtime :code:`O(nkd)`

    :param X: Datapoints. Shape: (n, d)
    :type X: np.ndarray
    :param k: Number of cluster centers.
    :type k: int
    :return: Cluster centers for seeding. Shape: (k, d)
    :rtype: np.ndarray

    :Example:
    :code:`seeds = afkmc2.kmpp(X, 3)`
    """
    X = X.astype(np.float64, copy=False)
    centers = np.zeros((k, X.shape[1]), dtype=np.float64)
    # Sample the first center uniformly at random.
    centers[0, :] = X[np.random.choice(X.shape[0]), :]
    for i in range(1, k):
        # Squared distance from every point to its closest chosen center,
        # vectorized over all n points instead of a Python double loop.
        diffs = X[:, np.newaxis, :] - centers[np.newaxis, :i, :]
        D2 = np.min(np.sum(diffs * diffs, axis=2), axis=1)
        # Sample the next center with probability proportional to D2,
        # so points far from all current centers are favored.
        probs = D2 / np.sum(D2)
        cumprobs = probs.cumsum()
        r = np.random.uniform()
        ind = np.where(cumprobs >= r)[0][0]
        # Store chosen center
        centers[i] = X[ind, :]
    return centers
def kmc2(X, k, m=200):
    """
    | KMC^2 Seeding as described by Bachem, Lucic, Hassani and Krause (2016)
    | Runtime :code:`O(mk^2d)`

    :param X: Datapoints. Shape: (n, d)
    :type X: np.ndarray
    :param k: Number of cluster centers.
    :type k: int
    :param m: Length of Markov Chain. Default 200
    :type m: int
    :return: Cluster centers for seeding. Shape: (k, d)
    :rtype: np.ndarray

    :Example:
    :code:`seeds = afkmc2.kmc2(X, 3)`
    """
    X = X.astype(np.float64, copy=False)
    n = X.shape[0]
    centers = np.zeros((k, X.shape[1]), dtype=np.float64)
    # Sample the first center uniformly at random.
    centers[0, :] = X[np.random.choice(n), :]

    def _min_d2(idx, i):
        """Squared distance from X[idx] to its nearest of the first i centers."""
        diff = centers[:i] - X[idx, :]
        return np.min(np.sum(diff * diff, axis=1))

    # k-1 iterations, one Markov chain per new center.
    for i in range(1, k):
        # Sample the initial state of the Markov chain uniformly.
        x = np.random.choice(n)
        dx2 = _min_d2(x, i)
        # m-1 Metropolis-Hastings steps.  The step counter is deliberately
        # anonymous; the original reused `j` here AND in the inner distance
        # comprehensions, which shadowed the chain index.
        for _ in range(1, m):
            y = np.random.choice(n)
            dy2 = _min_d2(y, i)
            # Accept candidate with probability min(1, dy2/dx2); when
            # dx2 == 0 the current state has zero mass, so always move.
            if dx2 == 0 or dy2 / dx2 > np.random.uniform():
                x = y
                dx2 = dy2
        # Store the chain's final state as the new center.
        centers[i] = X[x, :]
    return centers
def afkmc2(X, k, m=200):
    """
    | AFKMC^2 Seeding as described by Bachem, Lucic, Hassani and Krause (2016)
    | Runtime :code:`O(nd + mk^2d)`

    :param X: Datapoints. Shape: (n, d)
    :type X: np.ndarray
    :param k: Number of cluster centers.
    :type k: int
    :param m: Length of Markov Chain. Default 200
    :type m: int
    :return: Cluster centers for seeding. Shape: (k, d)
    :rtype: np.ndarray

    :Example:
    :code:`seeds = afkmc2.afkmc2(X, 3)`
    """
    X = X.astype(np.float64, copy=False)
    n = X.shape[0]
    centers = np.zeros((k, X.shape[1]), dtype=np.float64)
    # Sample the first center uniformly at random.
    centers[0, :] = X[np.random.choice(n), :]
    # Assumption-free proposal distribution, O(n): half the mass follows
    # squared distance to the first center, half is uniform.  Computed as
    # a proper ndarray (the original divided a Python list by a NumPy
    # scalar, which only worked via accidental broadcasting).
    d2 = np.sum((X - centers[0, :]) ** 2, axis=1)
    q = d2 / (2 * np.sum(d2)) + 1 / (2.0 * n)

    def _min_d2(idx, i):
        """Squared distance from X[idx] to its nearest of the first i centers."""
        diff = centers[:i] - X[idx, :]
        return np.min(np.sum(diff * diff, axis=1))

    # k-1 iterations, one Markov chain per new center.
    for i in range(1, k):
        # Sample the initial state of the Markov chain from q.
        x = np.random.choice(n, p=q)
        dx2 = _min_d2(x, i)
        # m-1 Metropolis-Hastings steps.
        for _ in range(1, m):
            y = np.random.choice(n, p=q)
            dy2 = _min_d2(y, i)
            # Acceptance ratio corrected for the non-uniform proposal q;
            # a zero denominator means the current state has zero mass.
            if dx2 * q[y] == 0 or (dy2 * q[x]) / (dx2 * q[y]) > np.random.uniform():
                x = y
                dx2 = dy2
        # Store the chain's final state as the new center.
        centers[i] = X[x, :]
    return centers
def afkmc2_c(X, k, m=200):
    """
    | Cached AFKMC^2 Seeding based on AFKMC
    | as described by Bachem, Lucic, Hassani and Krause (2016)
    | Caching addition to prevent duplicate work for small datasets
    | Additional space cost during execution :code:`O(nk)`
    | Runtime :code:`O(nd + mk^2d)`

    :param X: Datapoints. Shape: (n, d)
    :type X: np.ndarray
    :param k: Number of cluster centers.
    :type k: int
    :param m: Length of Markov Chain. Default 200
    :type m: int
    :return: Cluster centers for seeding. Shape: (k, d)
    :rtype: np.ndarray

    :Example:
    :code:`seeds = afkmc2.afkmc2_c(X, 3)`
    """
    X = X.astype(np.float64, copy=False)
    n = X.shape[0]
    centers = np.zeros((k, X.shape[1]), dtype=np.float64)
    # Sample the first center uniformly at random.
    centers[0, :] = X[np.random.choice(n), :]
    # Distance memoization table; -1.0 marks "not yet computed".
    d_store = np.full((n, k), -1.0, dtype=np.float64)

    def distance(i, j):
        """Squared distance between X[i, :] and centers[j, :], cached in d_store.

        Valid only after centers[j] has been fixed, since entries are
        never invalidated.
        """
        if d_store[i, j] < 0:
            diff = X[i, :] - centers[j, :]
            d_store[i, j] = np.dot(diff, diff)
        return d_store[i, j]

    # Assumption-free proposal distribution, O(n): half distance-based,
    # half uniform.  Built as an ndarray (the original divided a Python
    # list by a NumPy scalar, which only worked via broadcasting).
    d2 = np.array([distance(i, 0) for i in range(n)])
    q = d2 / (2 * np.sum(d2)) + 1 / (2.0 * n)
    # k-1 iterations, one Markov chain per new center.
    for i in range(1, k):
        # Sample the initial state of the Markov chain from q.
        x = np.random.choice(n, p=q)
        # Shortest cached distance to any already-fixed center.  The center
        # index is `c` here; the original reused `j` for both the chain step
        # and these comprehensions, shadowing the loop variable.
        dx2 = min(distance(x, c) for c in range(i))
        # m-1 Metropolis-Hastings steps.
        for _ in range(1, m):
            y = np.random.choice(n, p=q)
            dy2 = min(distance(y, c) for c in range(i))
            # Acceptance ratio corrected for the non-uniform proposal q;
            # a zero denominator means the current state has zero mass.
            if dx2 * q[y] == 0 or (dy2 * q[x]) / (dx2 * q[y]) > np.random.uniform():
                x = y
                dx2 = dy2
        # Store the chain's final state as the new center.
        centers[i] = X[x, :]
    return centers