"""Python Implementation of different k-Means Seeding Algorithms
Inspiration and Algorithm from
> Fast and Provably Good Seedings for k-Means
> Olivier Bachem, Mario Lucic, S. Hamed Hassani and Andreas Krause
> In Neural Information Processing Systems (NIPS), 2016.
Usage:
>>> import afkmc2
>>> X = np.array with d-dimensional data
>>> seeds = afkmc2.afkmc2(X, 3)
Use seeds in sklearn KMeans:
>>> from sklearn.cluster import KMeans
>>> model = KMeans(3, init=seeds).fit(X)
>>> print(model.cluster_centers_)
"""
import numpy as np
def kmpp(X, k):
    """
    | KMeans++ Seeding as described by Arthur and Vassilvitskii (2007)
    | Runtime :code:`O(nkd)`

    :param X: Datapoints. Shape: (n, d)
    :type X: np.ndarray
    :param k: Number of cluster centers.
    :type k: int
    :return: Cluster centers for seeding. Shape: (k, d)
    :rtype: np.ndarray

    :Example:
    :code:`seeds = afkmc2.kmpp(X, 3)`
    """
    X = X.astype(np.float64, copy=False)
    centers = np.zeros((k, X.shape[1]), dtype=np.float64)
    # Sample the first center uniformly at random.
    centers[0, :] = X[np.random.choice(X.shape[0]), :]
    for i in range(1, k):
        # Squared distance from every point to its closest chosen center,
        # vectorized over all n points instead of a Python double loop.
        diffs = X[:, np.newaxis, :] - centers[np.newaxis, :i, :]
        D2 = np.min(np.sum(diffs * diffs, axis=2), axis=1)
        # Sample the next center with probability proportional to D2,
        # so points far from all current centers are favored.
        probs = D2 / np.sum(D2)
        cumprobs = probs.cumsum()
        r = np.random.uniform()
        ind = np.where(cumprobs >= r)[0][0]
        # Store chosen center
        centers[i] = X[ind, :]
    return centers
def kmc2(X, k, m=200):
    """
    | KMC^2 Seeding as described by Bachem, Lucic, Hassani and Krause (2016)
    | Runtime :code:`O(mk^2d)`

    :param X: Datapoints. Shape: (n, d)
    :type X: np.ndarray
    :param k: Number of cluster centers.
    :type k: int
    :param m: Length of Markov Chain. Default 200
    :type m: int
    :return: Cluster centers for seeding. Shape: (k, d)
    :rtype: np.ndarray

    :Example:
    :code:`seeds = afkmc2.kmc2(X, 3)`
    """
    X = X.astype(np.float64, copy=False)
    n = X.shape[0]
    centers = np.zeros((k, X.shape[1]), dtype=np.float64)
    # Sample the first center uniformly at random.
    centers[0, :] = X[np.random.choice(n), :]

    def _min_d2(idx, i):
        """Squared distance from X[idx] to its nearest of the first i centers."""
        diff = centers[:i] - X[idx, :]
        return np.min(np.sum(diff * diff, axis=1))

    # k-1 iterations, one Markov chain per new center.
    for i in range(1, k):
        # Sample the initial state of the Markov chain uniformly.
        x = np.random.choice(n)
        dx2 = _min_d2(x, i)
        # m-1 Metropolis-Hastings steps.  The step counter is deliberately
        # anonymous; the original reused `j` here AND in the inner distance
        # comprehensions, which shadowed the chain index.
        for _ in range(1, m):
            y = np.random.choice(n)
            dy2 = _min_d2(y, i)
            # Accept candidate with probability min(1, dy2/dx2); when
            # dx2 == 0 the current state has zero mass, so always move.
            if dx2 == 0 or dy2 / dx2 > np.random.uniform():
                x = y
                dx2 = dy2
        # Store the chain's final state as the new center.
        centers[i] = X[x, :]
    return centers
def afkmc2(X, k, m=200):
    """
    | AFKMC^2 Seeding as described by Bachem, Lucic, Hassani and Krause (2016)
    | Runtime :code:`O(nd + mk^2d)`

    :param X: Datapoints. Shape: (n, d)
    :type X: np.ndarray
    :param k: Number of cluster centers.
    :type k: int
    :param m: Length of Markov Chain. Default 200
    :type m: int
    :return: Cluster centers for seeding. Shape: (k, d)
    :rtype: np.ndarray

    :Example:
    :code:`seeds = afkmc2.afkmc2(X, 3)`
    """
    X = X.astype(np.float64, copy=False)
    n = X.shape[0]
    centers = np.zeros((k, X.shape[1]), dtype=np.float64)
    # Sample the first center uniformly at random.
    centers[0, :] = X[np.random.choice(n), :]
    # Assumption-free proposal distribution, O(n): half the mass follows
    # squared distance to the first center, half is uniform.  Computed as
    # a proper ndarray (the original divided a Python list by a NumPy
    # scalar, which only worked via accidental broadcasting).
    d2 = np.sum((X - centers[0, :]) ** 2, axis=1)
    q = d2 / (2 * np.sum(d2)) + 1 / (2.0 * n)

    def _min_d2(idx, i):
        """Squared distance from X[idx] to its nearest of the first i centers."""
        diff = centers[:i] - X[idx, :]
        return np.min(np.sum(diff * diff, axis=1))

    # k-1 iterations, one Markov chain per new center.
    for i in range(1, k):
        # Sample the initial state of the Markov chain from q.
        x = np.random.choice(n, p=q)
        dx2 = _min_d2(x, i)
        # m-1 Metropolis-Hastings steps.
        for _ in range(1, m):
            y = np.random.choice(n, p=q)
            dy2 = _min_d2(y, i)
            # Acceptance ratio corrected for the non-uniform proposal q;
            # a zero denominator means the current state has zero mass.
            if dx2 * q[y] == 0 or (dy2 * q[x]) / (dx2 * q[y]) > np.random.uniform():
                x = y
                dx2 = dy2
        # Store the chain's final state as the new center.
        centers[i] = X[x, :]
    return centers
def afkmc2_c(X, k, m=200):
    """
    | Cached AFKMC^2 Seeding based on AFKMC
    | as described by Bachem, Lucic, Hassani and Krause (2016)
    | Caching addition to prevent duplicate work for small datasets
    | Additional space cost during execution :code:`O(nk)`
    | Runtime :code:`O(nd + mk^2d)`

    :param X: Datapoints. Shape: (n, d)
    :type X: np.ndarray
    :param k: Number of cluster centers.
    :type k: int
    :param m: Length of Markov Chain. Default 200
    :type m: int
    :return: Cluster centers for seeding. Shape: (k, d)
    :rtype: np.ndarray

    :Example:
    :code:`seeds = afkmc2.afkmc2_c(X, 3)`
    """
    X = X.astype(np.float64, copy=False)
    n = X.shape[0]
    centers = np.zeros((k, X.shape[1]), dtype=np.float64)
    # Sample the first center uniformly at random.
    centers[0, :] = X[np.random.choice(n), :]
    # Distance memoization table; -1.0 marks "not yet computed".
    d_store = np.full((n, k), -1.0, dtype=np.float64)

    def distance(i, j):
        """Squared distance between X[i, :] and centers[j, :], cached in d_store.

        Valid only after centers[j] has been fixed, since entries are
        never invalidated.
        """
        if d_store[i, j] < 0:
            diff = X[i, :] - centers[j, :]
            d_store[i, j] = np.dot(diff, diff)
        return d_store[i, j]

    # Assumption-free proposal distribution, O(n): half distance-based,
    # half uniform.  Built as an ndarray (the original divided a Python
    # list by a NumPy scalar, which only worked via broadcasting).
    d2 = np.array([distance(i, 0) for i in range(n)])
    q = d2 / (2 * np.sum(d2)) + 1 / (2.0 * n)
    # k-1 iterations, one Markov chain per new center.
    for i in range(1, k):
        # Sample the initial state of the Markov chain from q.
        x = np.random.choice(n, p=q)
        # Shortest cached distance to any already-fixed center.  The center
        # index is `c` here; the original reused `j` for both the chain step
        # and these comprehensions, shadowing the loop variable.
        dx2 = min(distance(x, c) for c in range(i))
        # m-1 Metropolis-Hastings steps.
        for _ in range(1, m):
            y = np.random.choice(n, p=q)
            dy2 = min(distance(y, c) for c in range(i))
            # Acceptance ratio corrected for the non-uniform proposal q;
            # a zero denominator means the current state has zero mass.
            if dx2 * q[y] == 0 or (dy2 * q[x]) / (dx2 * q[y]) > np.random.uniform():
                x = y
                dx2 = dy2
        # Store the chain's final state as the new center.
        centers[i] = X[x, :]
    return centers