/**********************************************
localsearch.cxx
Copyright 2001 by Adam Meyerson and Liadan O'Callaghan
This code is provided for examination by anyone who may be interested in
implementing local search clustering algorithms; it is not intended to be
run, but to be used as a starting point for anyone who wants to write
his or her own implementation.
In particular: We, the authors, do not provide a license for the use
of this code. If you use this code instead of writing your own, we
don't want to know. We do not claim that it works, and will not
maintain it or provide support to those who use it. Use of this code
is at the risk of the user.
*****
Paper reference:
Streaming-Data Algorithms for High-Quality Clustering,
by Liadan O'Callaghan, Nina Mishra, Adam Meyerson,
Sudipto Guha, and Rajeev Motwani,
ICDE 2002.
*****
This local search code computes a continuous k-median clustering of a
given point set, where k is within a user-specified range.
To execute:
localsearch k1 k2 n d infile samplesize outfile
k1 and k2 are lower and upper bounds on the number of medians you want
to find.
n is the number of points in infile, the file of points to cluster.
d is the dimensionality of the points in infile, which will be taken as
points in real space with the L2 norm.
If samplesize is set to some s > 0, a sample of s points will be taken
uniformly at random with replacement from infile, and the clustering will
be computed just on this sample. Setting samplesize to 0 or less will
result in a clustering based on all n points.
infile must contain one point per line, with each point given in the
format:
w x1 x2 x3 ... xd
where w is the weight of the point and its d coordinates are x1 through
xd.
outfile will be created if it does not exist, and overwritten otherwise.
It will contain the medians found by localsearch, and it will be in the
same format as infile: the weight of a median will correspond to the total
weight of all points assigned to it.
There must be a file called 'seeds' in the directory in which localsearch
is running; this file must contain an integer. localsearch will use this
integer to seed its random number generator, and will replace it with a
new integer each time it is run.
***********************************/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#define MAXNAMESIZE 1024 // max filename length
#define SEED 1
/* increase this to reduce probability of random error */
/* increasing it also ups running time of "speedy" part of the code */
/* SP = 1 seems to be fine */
#define SP 1 // number of repetitions of speedy must be >=1
/* higher ITER --> more likely to get correct # of centers */
/* higher ITER also scales the running time almost linearly */
#define ITER 3 // iterate ITER* k log k times; ITER >= 1
/* this structure represents a point */
/* these will be passed around to avoid copying coordinates */
typedef struct {
float weight;
float *coord;
long assign; /* number of point where this one is assigned */
float cost; /* cost of that assignment, weight*distance */
} Point;
/* this is the array of points */
typedef struct {
long num; /* number of points; may not be N if this is a sample */
int dim; /* dimensionality */
Point *p; /* the array itself */
} Points;
void sample(Points *points, Points *sampl, long size);
void sample2(Points *points, long size);
int isIdentical(float *i, float *j, int D)
// tells whether two points of D dimensions are identical
{
int a = 0;
int equal = 1;
while (equal && a < D) {
if (i[a] != j[a]) equal = 0;
else a++;
}
if (equal) return 1;
else return 0;
}
void getSeed(int *s)
{
/* read in random seed from file called "seeds" */
FILE *fpSeeds;
fpSeeds = fopen("seeds", "r");
if (fpSeeds == NULL) {
cerr << "CREATE A FILE CALLED seeds CONTAINING AN INTEGER!\n";
exit(1);
}
fscanf(fpSeeds, "%d", s);
fclose(fpSeeds);
// cout << "Using Random Seed: " << *s << endl;
srand48(*s);
fpSeeds = fopen("seeds", "w");
fprintf(fpSeeds, "%d\n", lrand48());
fclose(fpSeeds);
}
/* read the points from filename into the array points */
void readpoints(char *filename, Points *points)
{
long i, ii;
ifstream ifile;
ifile.open(filename);
points->p = (Point *)malloc(points->num*sizeof(Point));
for(i=0;inum;i++) {
points->p[i].coord = (float *)malloc(points->dim*sizeof(float));
ifile >> points->p[i].weight;
for (ii=0;iidim;ii++)
ifile >> points->p[i].coord[ii];
}
}
/* read the command line to get kmin, kmax, point array */
int readcommands(int argc, char **argv, long *kmin, long *kmax,
Points *points, Points *sampl, long *samplesize,
char **outfile)
{
long i;
if (argc<8) {
cerr << "usage: " << argv[0] << " k1 k2 n d infile samplesize outfile"
<< endl;
cerr << "where n = # of points, d = # of dimensions;" << endl;
cerr << "(note k1 < k2)" << endl;
cerr << "Sampling is with replacement whenever samplesize > 0." << endl;
cerr << "If samplesize <= 0, algorithm is run on whole dataset." << endl;
return(0);
}
*kmin = atoi(argv[1]);
*kmax = atoi(argv[2]);
strcpy(*outfile, argv[7]);
points->num = atoi(argv[3]);
points->dim = atoi(argv[4]);
readpoints(argv[5], points);
*samplesize = atoi(argv[6]);
if (*samplesize > 0) {
/* this creates the Points set sampl, frees the memory for points */
sample2(points, *samplesize);
}
return(1);
}
/* build a sample of given size, points drawn uniformly at random */
/* with replacement */
void sample(Points *points, Points *sampl, long size)
{
long i, j;
long randpoint;
sampl->num = size;
sampl->dim = points->dim;
sampl->p = (Point *)malloc(sizeof(Point)*size);
for( i = 0; i < size; i++ ) {
sampl->p[i].coord = (float *)malloc(sampl->dim * sizeof(float));
}
for ( i = 0; i < size; i++ ) {
// pick a point from points, uniformly at random
randpoint = lrand48() % points->num;
// copy that point into the ith location in the sample
for ( j = 0; j < sampl->dim; j++ ) {
sampl->p[i].coord[j] = points->p[randpoint].coord[j];
}
sampl->p[i].weight = points->p[randpoint].weight;
}
for ( i = 0; i < points->num; i++ ) {
free(points->p[i].coord);
}
free(points->p);
points->p = sampl->p;
points->num = sampl->num;
}
/* comparator for floating point numbers */
static int floatcomp(const void *i, const void *j)
{
float a, b;
a = *(float *)(i);
b = *(float *)(j);
if (a > b) return (1);
if (a < b) return (-1);
return(0);
}
/* select a sample with total WEIGHT equal to size */
/* this is sampling of size points with replacement */
/* multiple selections increase the effective weight of a point */
/* thus, the total number of points in the sample is <= size */
/* this procedure will free old points structure while running */
void sample2(Points *points, long size)
{
long i, ii, iii;
float totalweight;
float *weightarray;
float mytotal;
Points *sampl;
sampl = (Points *)malloc(sizeof(Points));
totalweight=0.0;
weightarray = (float *)malloc(sizeof(float)*size);
sampl->p=(Point *)malloc(sizeof(Point)*size);
for (i=0;inum;i++)
totalweight += points->p[i].weight;
/* determine random weights between 0 and totalweight */
for (i=0;i totalweight)
fprintf(stderr, "Random Number Error!\n");
}
/* sort these weights from smallest to largest */
qsort(weightarray, size, sizeof(float), floatcomp);
mytotal=0.0; ii=0; iii=0;
/* look through all the points, deciding how many times to sample them */
for (i=0;inum;i++) {
mytotal += points->p[i].weight;
/* check whether current point was sampled */
if ((ii weightarray[ii])) {
sampl->p[iii].weight = 1.0;
sampl->p[iii].coord = points->p[i].coord;
ii++;
/* check if this point was chosen again due to high weight */
while ((ii < size)&&(mytotal > weightarray[ii])) {
sampl->p[iii].weight += 1.0;
ii++;
}
iii++;
}
/* this point will not appear in the sample */
else {
free(points->p[i].coord);
}
}
sampl->num = iii;
sampl->dim = points->dim;
free(points->p);
/* copy sampl over to points */
points->num = sampl->num;
points->dim = sampl->dim;
points->p = sampl->p;
}
/* shuffle points into random order */
void shuffle(Points *points)
{
long i, j;
Point temp;
for (i=0;inum-1;i++) {
j=(lrand48()%(points->num - i)) + i;
temp = points->p[i];
points->p[i] = points->p[j];
points->p[j] = temp;
}
}
/* shuffle an array of integers */
void intshuffle(int *intarray, int length)
{
long i, j;
int temp;
for (i=0;inum);
/* create center at first point, send it to itself */
centers[0] = 0; *k=1;
points->p[0].assign=0; points->p[0].cost = 0.0;
totalcost = z;
for (i=1;inum;i++) {
/* find closest center */
closest=0; mindist=dist(points->p[i],points->p[0],points->dim);
for (ii=1;ii<*k;ii++) {
distance=dist(points->p[i], points->p[centers[ii]],points->dim);
if (distance < mindist) {
mindist=distance; closest=ii;
}
}
/* decide whether to open new facility */
if (((float)lrand48()/(float)INT_MAX)<(mindist*points->p[i].weight/z)) {
centers[(*k)++] = i;
points->p[i].assign=i; points->p[i].cost = 0.0;
totalcost+=z;
}
else {
points->p[i].assign=centers[closest];
points->p[i].cost = mindist*points->p[i].weight;
totalcost += points->p[i].cost;
}
}
#ifdef PRINTINFO
fprintf(stderr, "Speedy opened %d facilities for total cost %lf\n",
*k, totalcost);
fprintf(stderr, "Distance Cost %lf\n", totalcost - z*(*k));
#endif
free(centers);
return(totalcost);
}
/* run speedy on the points, return total cost of solution */
float speedy2(Points *points, float z, long *k, long klim)
{
long i, ii, closest;
long *centers;
float mindist, distance;
float totalcost;
#ifdef PRINTINFO
fprintf(stderr, "Speedy: facility cost %lf\n", z);
#endif
centers=(long *)malloc(sizeof(long)*points->num);
/* create center at first point, send it to itself */
centers[0] = 0; *k=1;
points->p[0].assign=0; points->p[0].cost = 0.0;
totalcost = z;
for (i=1;inum;i++) {
/* find closest center */
closest=0; mindist=dist(points->p[i],points->p[0],points->dim);
for (ii=1;ii<*k;ii++) {
distance=dist(points->p[i], points->p[centers[ii]],points->dim);
if (distance < mindist) {
mindist=distance; closest=ii;
}
}
/* decide whether to open new facility */
if (((float)lrand48()/(float)INT_MAX)<(mindist*points->p[i].weight/z)) {
centers[(*k)++] = i;
points->p[i].assign=i; points->p[i].cost = 0.0;
totalcost+=z;
}
else {
points->p[i].assign=centers[closest];
points->p[i].cost = mindist*points->p[i].weight;
totalcost += points->p[i].cost;
}
if (*k > klim) {
free(centers);
return(totalcost);
}
}
#ifdef PRINTINFO
fprintf(stderr, "Speedy opened %d facilities for total cost %lf\n",
*k, totalcost);
fprintf(stderr, "Distance Cost %lf\n", totalcost - z*(*k));
#endif
free(centers);
return(totalcost);
}
/* For a given point x, find the cost of the following operation:
* -- open a facility at x if there isn't already one there,
* -- for points y such that the assignment distance of y exceeds dist(y, x),
* make y a member of x,
* -- for facilities y such that reassigning y and all its members to x
* would save cost, realize this closing and reassignment.
*
* If the cost of this operation is negative (i.e., if this entire operation
* saves cost), perform this operation and return the amount of cost saved;
* otherwise, do nothing.
*/
/* numcenters will be updated to reflect the new number of centers */
/* z is the facility cost, x is the number of this point in the array
points */
float gain(long x, Points *points, float z, long *numcenters)
{
int i;
int number_of_points = points->num;
int number_of_centers_to_close = 0;
// for each point y, will y switch membership to x?
int *switch_membership = (int *) malloc(sizeof(int) * number_of_points);
for ( i = 0; i < number_of_points; i++ ) {
switch_membership[i] = 0;
}
// for each point y, if y is a median, how much will it save by closing
// and shifting itself and its members to x?
float *lower = (float *) malloc(sizeof(int) * number_of_points);
for ( i = 0; i < number_of_points; i++ ) {
lower[i] = 0;
}
// We will calculate the cost of opening a median at x
float cost_of_opening_x;
if ( points->p[x].assign == x ) {
// x is already a median; opening it doesn't cost an additional z
cost_of_opening_x = 0;
}
else {
cost_of_opening_x = z;
}
// every median would save ("lower" its cost by) z if it could close
// and reassign itself and its members to x
for ( i = 0; i < number_of_points; i++ ) {
lower[points->p[i].assign] = z;
}
for ( i = 0; i < number_of_points; i++ ) {
float x_cost = dist(points->p[i], points->p[x], points->dim)
* points->p[i].weight;
float current_cost =
dist(points->p[i], points->p[points->p[i].assign], points->dim)
* points->p[i].weight;
if ( x_cost < current_cost ) {
// point i would save cost just by switching to x
// (note that i cannot be a median,
// or else dist(p[i], p[x]) would be 0)
switch_membership[i] = 1;
cost_of_opening_x += x_cost - current_cost;
} else {
// cost of assigning i to x is at least current assignment cost of i
// consider the savings that i's **current** median would realize
// if we reassigned that median and all its members to x;
// note we've already accounted for the fact that the median
// would save z by closing; now we have to subtract from the savings
// the extra cost of reassigning that median and its members
lower[points->p[i].assign] += current_cost - x_cost;
}
}
// at this time, we can calculate the cost of opening a center
// at x; if it is negative, we'll go through with opening it
for ( i = 0; i < number_of_points; i++ ) {
if ( lower[i] > 0 ) {
// i is a median, and
// if we were to open x (which we still may not) we'd close i
// note, we'll ignore the following quantity unless we do open x
++number_of_centers_to_close;
cost_of_opening_x -= lower[i];
}
}
// now, check whether opening x would save cost; if so, do it,
// and otherwise do nothing
if ( cost_of_opening_x < 0 ) {
// we'd save money by opening x; we'll do it
for ( i = 0; i < number_of_points; i++ ) {
if ( switch_membership[i] || lower[points->p[i].assign] > 0 ) {
// Either i's median (which may be i itself) is closing,
// or i is closer to x than to its current median
points->p[i].cost = points->p[i].weight *
dist(points->p[i], points->p[x], points->dim);
points->p[i].assign = x;
}
}
*numcenters = *numcenters + 1 - number_of_centers_to_close;
}
else {
cost_of_opening_x = 0; // the value we'll return
}
free(switch_membership);
free(lower);
return -cost_of_opening_x;
}
/* facility location on the points using local search */
/* z is the facility cost, returns the total cost and # of centers */
/* assumes we are seeded with a reasonable solution */
/* cost should represent this solution's cost */
/* halt if there is < e improvement after iter calls to gain */
/* feasible is an array of numfeasible points which may be centers */
float FL(Points *points, int *feasible, int numfeasible,
float z, long *k, double cost, long iter, float e)
{
long i;
long x;
double change;
long numberOfPoints;
#ifdef PRINTINFO
fprintf(stderr, "%d centers, cost %lf, total distance %lf\n",
*k, cost, cost - z*(*k));
#endif
change = -cost;
/* continue until we run iter iterations without improvement */
/* stop instead if improvement is less than e */
while (change/cost < -1.0*e) {
change = 0.0;
numberOfPoints = points->num;
/* randomize order in which centers are considered */
intshuffle(feasible, numfeasible);
for (i=0;i= numfeasible)
x = i%numfeasible;
else
x = lrand48()%numfeasible;
change += gain(feasible[x], points, z, k);
}
cost += change;
#ifdef PRINTINFO
fprintf(stderr, "%d centers, cost %lf, total distance %lf\n",
*k, cost, cost - z*(*k));
#endif
}
return(cost);
}
/* set feasible to be an array of feasible centers */
/* return number of feasible centers */
int selectfeasible(Points *points, int **feasible, int kmin)
{
int i, ii;
int numfeasible;
float *feasweight;
float totalweight;
float mytotal;
numfeasible = points->num;
if (numfeasible > (ITER*kmin*log(kmin)))
numfeasible = (int)(ITER*kmin*log(kmin));
*feasible = (int *)malloc(numfeasible*sizeof(int));
/* not many points, all will be feasible */
if (numfeasible == points->num) {
for (i=0;inum;i++)
(*feasible)[i] = i;
}
else {
/* choose feasible points at random */
totalweight = 0.0;
for (i=0;inum;i++)
totalweight+=points->p[i].weight;
feasweight = (float *)malloc(numfeasible*sizeof(float));
for (i=0;inum;i++) {
mytotal+=points->p[i].weight;
while ((ii < numfeasible)&&(mytotal > feasweight[ii]))
(*feasible)[ii++] = i;
}
if (ii != numfeasible) fprintf(stderr, "Error in Feasible Centers\n");
free(feasweight);
}
return(numfeasible);
}
/* compute approximate kmedian on the points */
float kmedian(Points *points, long kmin, long kmax)
{
int i;
float cost;
float lastcost;
float hiz, loz, z;
long k;
int flag;
int *feasible;
int numfeasible;
hiz = loz = 0.0;
long numberOfPoints = points->num;
long ptDimension = points->dim;
#ifdef PRINTINFO
fprintf(stderr, "Starting Kmedian procedure\n");
fprintf(stderr, "%d points in %d dimensions\n", numberOfPoints, ptDimension);
#endif
for (i=0;ip[i], points->p[0],
ptDimension)*points->p[i].weight;
loz=0.0; z = (hiz+loz)/2.0;
shuffle(points);
cost = speedy2(points, 0.001, &k, kmax);
if (k <= kmax) {
return(cost);
}
shuffle(points);
cost = speedy(points, z, &k);
#ifdef PRINTINFO
fprintf(stderr, "Finished first call to speedy\n");
#endif
i=0;
/* NEW: Check whether more centers than points! */
if (points->num <= kmax) {
/* just return all points as facilities */
for (i=0;inum;i++) {
points->p[i].assign = i;
points->p[i].cost = 0;
return(0.0);
}
}
/* give speedy SP chances to get at least kmin/2 facilities */
while ((k < kmin)&&(i= SP) {hiz=z; z=(hiz+loz)/2.0; i=0;}
shuffle(points);
cost = speedy(points, z, &k);
i++;
}
/* now we begin the binary search for real */
/* must designate some points as feasible centers */
/* this creates more consistancy between FL runs */
/* helps to guarantee correct # of centers at the end */
numfeasible = selectfeasible(points, &feasible, kmin);
while(1) {
#ifdef PRINTINFO
fprintf(stderr, "loz = %lf, hiz = %lf\n", loz, hiz);
fprintf(stderr, "Running Local Search...\n");
#endif
/* first get a rough estimate on the FL solution */
lastcost = cost;
cost = FL(points, feasible, numfeasible,
z, &k, cost, (long)(ITER*kmax*log(kmax)), 0.1);
/* if number of centers seems good, try a more accurate FL */
if (((k <= (1.1)*kmax)&&(k >= (0.9)*kmin))||
((k <= kmax+2)&&(k >= kmin-2))) {
#ifdef PRINTINFO
fprintf(stderr, "Trying a more accurate local search...\n");
#endif
/* may need to run a little longer here before halting without
improvement */
cost = FL(points, feasible, numfeasible,
z, &k, cost, (long)(ITER*kmax*log(kmax)), 0.001);
}
if (k > kmax) {
/* facilities too cheap */
/* increase facility cost and up the cost accordingly */
loz = z; z = (hiz+loz)/2.0;
cost += (z-loz)*k;
}
if (k < kmin) {
/* facilities too expensive */
/* decrease facility cost and reduce the cost accordingly */
hiz = z; z = (hiz+loz)/2.0;
cost += (z-hiz)*k;
}
/* if k is good, return the result */
/* if we're stuck, just give up and return what we have */
if (((k <= kmax)&&(k >= kmin))||((loz >= (0.999)*hiz)))
{ free(feasible); return(cost);}
}
}
/* compute the means for the k clusters */
float contcenters(Points *points)
{
long i, ii;
float relweight;
float cost;
for (i=0;inum;i++) {
/* compute relative weight of this point to the cluster */
if (points->p[i].assign != i) {
relweight=points->p[points->p[i].assign].weight + points->p[i].weight;
relweight = points->p[i].weight/relweight;
for (ii=0;iidim;ii++) {
points->p[points->p[i].assign].coord[ii]*=1.0-relweight;
points->p[points->p[i].assign].coord[ii]+=
points->p[i].coord[ii]*relweight;
}
points->p[points->p[i].assign].weight += points->p[i].weight;
}
}
cost = 0.0;
for (i=0;inum;i++)
cost += dist(points->p[i], points->p[points->p[i].assign],
points->dim)*points->p[i].weight;
return(cost);
}
/* print the centers to a file of a given name */
void outcenters(Points *points, char *outname)
{
long i, ii;
long k;
int *is_a_median = (int *) malloc(points->num * sizeof(int));
for ( i = 0; i < points->num; i++) {
is_a_median[i] = 0;
}
/* mark the centers */
for ( i = 0; i < points->num; i++ ) {
is_a_median[points->p[i].assign] = 1;
}
k=0;
/* count how many centers */
for ( i = 0; i < points->num; i++ ) {
if ( is_a_median[i] ) {
k++;
}
}
FILE *fpOut = fopen(outname, "w");
/* output the centers */
for ( i = 0; i < points->num; i++ ) {
if ( is_a_median[i] ) {
fprintf(fpOut, "%d", (int)points->p[i].weight);
for ( ii = 0; ii < points->dim; ii++ ) {
fprintf(fpOut, " %lf", points->p[i].coord[ii]);
}
fprintf(fpOut, "\n");
}
}
fclose(fpOut);
free(is_a_median);
}
main(int argc, char **argv)
{
char *outfilename = new char[MAXNAMESIZE];
long kmin, kmax, k, dimension, N, samplesize;
Points points, sampl;
// Note: 00.12.11: (Liadan) Note that I've changed command line:
// It's now a.out k1 k2 N d inputfile samplesize
int seed;
if (SEED)
getSeed(&seed);
if ( readcommands(argc, argv, &kmin, &kmax, &points, &sampl, &samplesize,
&outfilename) == 0 ) {
exit(0);
}
dimension = points.dim;
kmedian(&points, kmin, kmax);
contcenters(&points);
outcenters(&points, outfilename);
}