1 /* =============================================================================
5 * =============================================================================
9 * Takes as input a file:
10 * ascii file: containing 1 data point per line
11 * binary file: first int is the number of objects
12 * 2nd int is the no. of features of each object
14 * This example performs a fuzzy c-means clustering on the data. Fuzzy clustering
15 * is performed using min to max clusters and the clustering that gets the best
16 * score according to a compactness and separation criterion are returned.
22 * ECE Department Northwestern University
23 * email: wkliao@ece.northwestern.edu
29 * Northwestern University
34 * Port to Java version
36 * University of California, Irvine
38 * =============================================================================
40 * ------------------------------------------------------------------------
42 * For the license of kmeans, please see kmeans/LICENSE.kmeans
44 * ------------------------------------------------------------------------
46 * Unless otherwise noted, the following license applies to STAMP files:
48 * Copyright (c) 2007, Stanford University
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are
55 * * Redistributions of source code must retain the above copyright
56 * notice, this list of conditions and the following disclaimer.
58 * * Redistributions in binary form must reproduce the above copyright
59 * notice, this list of conditions and the following disclaimer in
60 * the documentation and/or other materials provided with the
63 * * Neither the name of Stanford University nor the names of its
64 * contributors may be used to endorse or promote products derived
65 * from this software without specific prior written permission.
67 * THIS SOFTWARE IS PROVIDED BY STANFORD UNIVERSITY ``AS IS'' AND ANY
68 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
69 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
70 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL STANFORD UNIVERSITY BE LIABLE
71 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
72 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
73 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
74 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
75 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
76 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
77 * THE POSSIBILITY OF SUCH DAMAGE.
79 * =============================================================================
82 public class KMeans extends Thread {
84 * User input for max clusters
89 * User input for min clusters
94 * Check for Binary file
99 * Using zscore transformation for cluster center
100 * deviating from distribution's mean
102 int use_zscore_transform;
105 * Input file name used for clustering
110 * Total number of threads
115 * threshold until which kmeans cluster continues
125 * Global arguments for threads
130 * Output: Number of best clusters
135 * Output: Cluster centers
137 float[][] cluster_centres;
142 float[][] attributes;
152 use_zscore_transform = 1;
153 threshold = (float) 0.001;
158 public KMeans(int threadid, GlobalArgs g_args) {
159 this.threadid = threadid;
160 this.g_args = g_args;
164 public KMeans(int threadid, GlobalArgs g_args, int nthreads, int use_zscore_transform,
165 int max_nclusters, int min_nclusters, float threshold,
166 float[][] attributes, int numObjects, int numAttributes) {
167 this.threadid = threadid;
168 this.g_args = g_args;
169 //this.nthreads = nthreads;
170 //this.use_zscore_transform = use_zscore_transform;
171 //this.max_nclusters = max_nclusters;
172 //this.min_nclusters = min_nclusters;
173 //this.threshold = threshold;
174 //this.attributes = attributes;
175 //this.numObjects = numObjects;
176 //this.numAttributes = numAttributes;
180 Barrier barr = new Barrier("128.195.136.162");
182 GlobalArgs tmp_g_args;
186 short[] offsets = new short[4];
187 offsets[0] = getoffset{GlobalArgs, clusters};
188 offsets[1] = (short) 0;
189 offsets[2] = (short) 0;
190 offsets[3] = (short) min_nclusters;
191 System.rangePrefetch(g_args, offsets);
193 //Add Manual prefetch for
194 //args.clusters[0-> nclusters]
196 Barrier.enterBarrier(barr);
197 Normal.work(id, tmp_g_args);
198 Barrier.enterBarrier(barr);
202 /* =============================================================================
204 * =============================================================================
206 public static void main(String[] args) {
208 int MAX_LINE_LENGTH = 1000000; /* max input is 400000 one digit input + spaces */
209 int[] mid = new int[8];
210 mid[0] = (128<<24)|(195<<16)|(136<<8)|162; //dc-1.calit2
211 mid[1] = (128<<24)|(195<<16)|(136<<8)|163; //dc-2.calit2
212 mid[2] = (128<<24)|(195<<16)|(136<<8)|164; //dc-3.calit2
213 mid[3] = (128<<24)|(195<<16)|(136<<8)|165; //dc-4.calit2
214 mid[4] = (128<<24)|(195<<16)|(136<<8)|166; //dc-5.calit2
215 mid[5] = (128<<24)|(195<<16)|(136<<8)|167; //dc-6.calit2
216 mid[6] = (128<<24)|(195<<16)|(136<<8)|168; //dc-7.calit2
217 mid[7] = (128<<24)|(195<<16)|(136<<8)|169; //dc-8.calit2
221 * Read options fron the command prompt
225 KMeans.parseCmdLine(args, kms);
226 nthreads = kms.nthreads;
227 System.out.println("nthreads= " + kms.nthreads);
229 /* Initiate Barriers */
230 BarrierServer mybarr;
233 mybarr = global new BarrierServer(nthreads);
235 mybarr.start(mid[0]);
238 if (kms.max_nclusters < kms.min_nclusters) {
239 System.out.println("Error: max_clusters must be >= min_clusters\n");
244 float[][] attributes;
245 int numAttributes = 0;
249 * From the input file, get the numAttributes (columns in txt file) and numObjects (rows in txt file)
251 if (kms.isBinaryFile == 1) {
252 System.out.println("TODO: Unimplemented Binary file option\n");
256 FileInputStream inputFile = new FileInputStream(kms.filename);
257 byte b[] = new byte[MAX_LINE_LENGTH];
259 while ((n = inputFile.read(b)) != 0) {
260 for (int i = 0; i < n; i++) {
266 inputFile = new FileInputStream(kms.filename);
268 if((line = inputFile.readLine()) != null) {
270 boolean prevWhiteSpace = true;
271 while(index < line.length()) {
272 char c = line.charAt(index++);
273 boolean currWhiteSpace = Character.isWhitespace(c);
274 if(prevWhiteSpace && !currWhiteSpace){
277 prevWhiteSpace = currWhiteSpace;
282 /* Ignore the first attribute: numAttributes = 1; */
283 numAttributes = numAttributes - 1;
285 /* Allocate new shared objects and read attributes of all objects */
288 buf = global new float[numObjects][numAttributes];
289 attributes = global new float[numObjects][numAttributes];
292 KMeans.readFromFile(inputFile, kms.filename, tmp_buf, MAX_LINE_LENGTH);
293 System.out.println("Finished Reading from file ......");
296 * The core of the clustering
299 int len = kms.max_nclusters - kms.min_nclusters + 1;
306 km = global new KMeans[nthreads];
307 g_args = global new GlobalArgs();
308 g_args.nthreads = nthreads;
309 for(int x = 0; x < numObjects; x++) {
310 for(int y = 0; y < numAttributes; y++) {
311 attributes[x][y] = buf[x][y];
317 /* Create and Start Threads */
318 for(int i = 1; i<nthreads; i++) {
319 km[i] = global new KMeans(i, g_args, nthreads, kms.use_zscore_transform,
320 kms.max_nclusters, kms.min_nclusters, kms.threshold, attributes, numObjects, numAttributes);
326 boolean waitfordone=true;
334 for(int i = 1; i<nthreads; i++) {
341 System.out.println("Finished Starting threads......");
343 for (int i = 0; i < nloops; i++) {
345 // Since zscore transform may perform in cluster() which modifies the
346 // contents of attributes[][], we need to re-store the originals
348 float[][] tmp_attributes;
349 GlobalArgs tmp_g_args;
351 for(int x = 0; x < numObjects; x++) {
352 for(int y = 0; y < numAttributes; y++) {
353 attributes[x][y] = buf[x][y];
356 tmp_attributes = attributes;
361 Cluster.cluster_exec(nthreads,
364 tmp_attributes, // [numObjects][numAttributes]
365 kms, //main class that holds users inputs from command prompt and output arrays that need to be filled
366 tmp_g_args); // Global arguments common to all threads
369 System.out.println("Printing output......");
370 System.out.println("Best_nclusters= " + kms.best_nclusters);
372 /* Output: the coordinates of the cluster centres */
376 for (int i = 0; i < kms.best_nclusters; i++) {
377 System.out.print(i + " ");
378 for (int j = 0; j < numAttributes; j++) {
379 System.out.print(kms.cluster_centres[i][j] + " ");
381 System.out.println("\n");
386 System.out.println("Finished......\n");
390 public static void parseCmdLine(String args[], KMeans km) {
393 while (i < args.length && args[i].startsWith("-")) {
396 if(arg.equals("-m")) {
397 if(i < args.length) {
398 km.max_nclusters = new Integer(args[i++]).intValue();
400 } else if(arg.equals("-n")) {
401 if(i < args.length) {
402 km.min_nclusters = new Integer(args[i++]).intValue();
404 } else if(arg.equals("-t")) {
405 if(i < args.length) {
406 km.threshold = (float) Double.parseDouble(args[i++]);
408 } else if(arg.equals("-i")) {
409 if(i < args.length) {
410 km.filename = args[i++];
412 } else if(arg.equals("-b")) {
413 if(i < args.length) {
414 km.isBinaryFile = new Integer(args[i++]).intValue();
416 } else if(arg.equals("-z")) {
417 km.use_zscore_transform=0;
418 } else if(arg.equals("-nthreads")) {
419 if(i < args.length) {
420 km.nthreads = new Integer(args[i++]).intValue();
422 } else if(arg.equals("-h")) {
426 if(km.nthreads == 0 || km.filename == null) {
432 * The usage routine which describes the program options.
434 public void usage() {
435 System.out.println("usage: ./kmeans -m <max_clusters> -n <min_clusters> -t <threshold> -i <filename> -nthreads <threads>\n");
436 System.out.println( " -i filename: file containing data to be clustered\n");
437 System.out.println( " -b input file is in binary format\n");
438 System.out.println( " -m max_clusters: maximum number of clusters allowed\n");
439 System.out.println( " -n min_clusters: minimum number of clusters allowed\n");
440 System.out.println( " -z : don't zscore transform data\n");
441 System.out.println( " -t threshold : threshold value\n");
442 System.out.println( " -nthreads : number of threads\n");
447 * Read attributes from the input file into an array
449 public static void readFromFile(FileInputStream inputFile, String filename, float[][] buf, int MAX_LINE_LENGTH) {
450 inputFile = new FileInputStream(filename);
454 byte b[] = new byte[MAX_LINE_LENGTH];
456 byte oldbytes[]=null;
460 while ((n = inputFile.read(b)) != 0) {
463 if (oldbytes!=null) {
474 byte newbytes[]= new byte[x+oldbytes.length];
475 boolean isnumber=false;
476 for(int ii=0;ii<oldbytes.length;ii++) {
477 if (oldbytes[ii]>='0'&&oldbytes[ii]<='9')
479 newbytes[ii]=oldbytes[ii];
481 for(int ii=0;ii<x;ii++) {
482 if (b[ii]>='0'&&b[ii]<='9')
484 newbytes[ii+oldbytes.length]=b[ii];
487 x++; //skip past space
490 buf[i][j]=(float)Double.parseDouble(new String(newbytes, 0, newbytes.length));
504 boolean isnumber=false;
506 if ((b[y]>='0')&&(b[y]<='9'))
516 //need to continue for another read
517 oldbytes=new byte[y-x];
518 for(int ii=0;ii<(y-x);ii++)
519 oldbytes[ii]=b[ii+x];
522 //otherwise x is beginning of character string, y is end
525 buf[i][j]=(float)Double.parseDouble(new String(b,x,y-x));
530 i++;//skip to next line
531 j = -1;//don't store line number
532 x=y;//skip to end of number
533 x++;//skip past return
535 x=y;//skip to end of number
536 x++;//skip past space
544 /* =============================================================================
548 * =============================================================================