package rs.fon.is.weka.clustering;

import java.text.DecimalFormat;
import java.util.Enumeration;

import weka.clusterers.ClusterEvaluation;
import weka.clusterers.EM;
import weka.clusterers.FilteredClusterer;
import weka.core.Attribute;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.unsupervised.attribute.Remove;

/**
 * Example of Expectation Maximization (EM) clustering with Weka.
 *
 * <p>The program loads a dataset, prints its attributes, builds an EM clusterer
 * wrapped in a {@link FilteredClusterer} (so that selected attributes are ignored
 * during clustering while the dataset itself stays untouched), prints the
 * evaluation summary, and finally prints the per-cluster membership probabilities
 * of the first few instances.
 */
public class EMClusteringExample {

	/** Dataset that is loaded when no path is given on the command line. */
	private static final String DEFAULT_FILE_NAME = "data/wholesale_horeca_customers.csv";

	/** Maximum number of instances listed in the membership-probability table. */
	private static final int MAX_PREVIEW_INSTANCES = 10;

	/** Not meant to be instantiated; the class only hosts static entry points. */
	private EMClusteringExample() {
	}

	/**
	 * Runs the EM clustering example.
	 *
	 * @param args optional; if present, {@code args[0]} is used as the path of the
	 *             dataset to load instead of the default one
	 * @throws Exception if the data cannot be loaded, or if building or evaluating
	 *                   the clusterer fails
	 */
	public static void main(String[] args) throws Exception {

		// load the data; the default file is used unless a path is supplied
		String path = (args != null && args.length > 0) ? args[0] : DEFAULT_FILE_NAME;
		DataSource loader = new DataSource(path);
		Instances data = loader.getDataSet();

		// check the attributes in the dataset
		System.out.println("Attributes in the dataset:");
		printAttributeData(data);

		// create a filter that specifies which attributes are NOT to be used for
		// clustering; the filter is not applied right away - it is handed to a
		// FilteredClusterer instead, so the original dataset is filtered only for
		// the purpose of clustering and otherwise remains intact
		Remove removeFilter = new Remove();
		// drop the first two attributes (indices 0 and 1); the original example
		// describes them as the nominal attributes of the dataset
		removeFilter.setAttributeIndicesArray(new int[] {0, 1});
		removeFilter.setInputFormat(data);

		// create the Expectation Maximization (EM) clustering model;
		// the number of clusters is the one that proved to be a good solution in kMeans
		int k = 4;
		// EM relies on random number generation, so the seed is fixed in order to
		// always initialize the generation process in the same way
		int seed = 16;
		EM emClusterer = new EM();
		emClusterer.setSeed(seed);
		emClusterer.setNumClusters(k);
		emClusterer.setMaxIterations(30);

		// create a FilteredClusterer that combines the filter and the EM model
		FilteredClusterer filteredClust = new FilteredClusterer();
		filteredClust.setClusterer(emClusterer);
		filteredClust.setFilter(removeFilter);
		filteredClust.buildClusterer(data);

		// evaluate the clustering model
		ClusterEvaluation eval = new ClusterEvaluation();
		eval.setClusterer(filteredClust);
		eval.evaluateClusterer(data);

		// print clustering results
		System.out.println("\nClustering results\n");
		System.out.println(eval.clusterResultsToString());

		// print cluster assignments for the first instances;
		// this is to see the probabilistic nature of the EM clusterer
		printMembershipTable(filteredClust, data, MAX_PREVIEW_INSTANCES);
	}

	/**
	 * Prints, for the leading instances of the dataset, the probability of belonging
	 * to each cluster followed by the cluster the instance is assigned to.
	 *
	 * @param clusterer an already built clusterer
	 * @param dataset   the instances to take the rows from
	 * @param maxRows   upper bound on the number of printed rows; fewer rows are
	 *                  printed when the dataset is smaller
	 * @throws Exception if the clusterer fails to cluster an instance
	 */
	private static void printMembershipTable(FilteredClusterer clusterer, Instances dataset, int maxRows)
			throws Exception {
		System.out.println("\nCluster membership probabilities of the first 10 instances");

		// build the header from the actual number of clusters in the model, so the
		// columns stay aligned with the distribution even if the cluster count changes
		int numClusters = clusterer.numberOfClusters();
		StringBuilder header = new StringBuilder();
		for (int c = 1; c <= numClusters; c++) {
			header.append("Cluster ").append(c).append('\t');
		}
		header.append("Assigned to");
		System.out.println(header);
		System.out.println("------------------------------------------------------------------------------------");

		// a single formatter is enough; there is no need to create one per table cell
		DecimalFormat probabilityFormat = new DecimalFormat();

		// never read past the end of the dataset
		int rows = Math.min(maxRows, dataset.numInstances());
		for (int i = 0; i < rows; i++) {
			int clust = clusterer.clusterInstance(dataset.get(i));
			double[] clustDist = clusterer.distributionForInstance(dataset.get(i));
			StringBuilder row = new StringBuilder();
			for (double probability : clustDist) {
				row.append(probabilityFormat.format(probability)).append("\t\t");
			}
			// clusters are indexed from 0 internally, but labelled from 1 in the output
			row.append("Cluster ").append(clust + 1);
			System.out.println(row);
		}
	}

	/**
	 * Prints the name and the type of every attribute returned by
	 * {@link Instances#enumerateAttributes()}, one attribute per line.
	 *
	 * @param dataset the dataset whose attributes are listed
	 */
	private static void printAttributeData(Instances dataset) {
		Enumeration<Attribute> attributes = dataset.enumerateAttributes();
		while (attributes.hasMoreElements()) {
			Attribute a = attributes.nextElement();
			System.out.println("- " + a.name() + ": " + Attribute.typeToString(a));
		}
	}
}
