/*
 * Authors: 
 * Uros Krcadinac - uros@krcadinac.com
 * Nikola Milikic - nikola.milikic@gmail.com
 * 
 * GOOD OLD AI Research Lab - http://goodoldai.org
 * Intelligent Systems (FON, University of Belgrade) - http://is.fon.rs/
 * 
 * November, 2013.
 * 
 * The method used in this code is partially based on these very useful resources for text mining in Weka:
 * http://jmgomezhidalgo.blogspot.com.es/2013/04/a-simple-text-classifier-in-java-with.html
 * http://jmgomezhidalgo.blogspot.com.es/2013/05/language-identification-as-text.html
 * http://jmgomezhidalgo.blogspot.com.es/2013/06/baseline-sentiment-analysis-with-weka.html
 * 
 * (Note: if you plan to use the code from these sites, please use Weka 3.7.
 * Our code uses Weka 3.6. There are some differences in terms of API.)
 * 
 */
package rs.fon.is.weka.textmining.li;

import weka.attributeSelection.InfoGainAttributeEval;
import weka.attributeSelection.Ranker;
import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;
import weka.classifiers.bayes.NaiveBayes;
import weka.classifiers.meta.FilteredClassifier;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.SerializationHelper;
import weka.core.converters.ConverterUtils.DataSource;
import weka.core.tokenizers.WordTokenizer;
import weka.filters.Filter;
import weka.filters.MultiFilter;
import weka.filters.supervised.attribute.AttributeSelection;
import weka.filters.unsupervised.attribute.StringToWordVector;

/**
 * Language identification as a text classification problem, built on Weka.
 *
 * On construction the object either loads a serialized {@link FilteredClassifier}
 * from {@code classifierPath} or, when that fails, trains a new one from the ARFF
 * file {@code arffFileName} and serializes it to {@code classifierPath}.
 * Texts can then be classified with {@link #classifyText(String)} or
 * {@link #checkTextInstace(String)}.
 */
public class LanguageIndentification {

	/*
	 * Possible values of the "language_class" attribute used when a single text is
	 * classified. NOTE(review): these labels and their order are hard-coded here and
	 * are assumed to match the class attribute of the training ARFF file - confirm
	 * against the data set actually used.
	 */
	private static final String[] LANGUAGE_LABELS = { "EN", "FR", "SP" };

	/*
	 * Weka classifier for text classification.
	 * Stays null when it could neither be loaded nor built.
	 */
	private FilteredClassifier filteredClassifier;

	/*
	 * Path of the file the Weka classifier is serialized to / deserialized from
	 * (a file in the folder "classifiers").
	 */
	private final String classifierPath;

	/*
	 * Path to the ARFF file with the initial data
	 */
	private final String arffFileName;

	/**
	 * This constructor either:
	 * (1) loads a classifier from a file (if there is an appropriate file in the "classifiers" folder); or
	 * (2) builds a classifier from scratch (if there is no such file).
	 *
	 * @param classifierPath path of the serialized classifier file
	 * @param arffFileName   path of the ARFF file with the training data
	 */
	public LanguageIndentification(String classifierPath, String arffFileName) {
		this.classifierPath = classifierPath;
		this.arffFileName = arffFileName;

		loadOrBuildClassifier();
	}

	/**
	 * Loads the classifier from {@code classifierPath}; if that fails for any reason,
	 * builds a new classifier and serializes it to that file.
	 *
	 * A failure while building is reported on standard error and not rethrown, so the
	 * returned classifier may be {@code null} (or a previously loaded/trained one).
	 *
	 * @return the classifier currently held by this object, possibly {@code null}
	 */
	public Classifier loadOrBuildClassifier() {
		try {
			filteredClassifier = (FilteredClassifier) SerializationHelper.read(classifierPath);
		} catch (Exception ignored) {
			/*
			 * Deliberately ignored: a failed read (typically because the file does not
			 * exist yet) is the trigger for building the classifier from scratch.
			 */
			try {
				buildClassifier();
			} catch (Exception buildFailure) {
				System.err.println("Could not build the classifier from '" + arffFileName + "':");
				buildFailure.printStackTrace();
			}
		}
		return filteredClassifier;
	}

	/**
	 * The most important method for building a classifier: loads the training data,
	 * sets up the filters, trains a Naive Bayes classifier on the filtered data,
	 * serializes the result to {@code classifierPath} and prints an evaluation
	 * on the training set.
	 *
	 * The classifier field is only replaced after training has succeeded, so a
	 * failed training never leaves a half-initialized classifier behind.
	 *
	 * @throws Exception if the data cannot be loaded, the classifier cannot be
	 *                   trained or serialized, or the evaluation fails
	 */
	public void buildClassifier() throws Exception {

		/*
		 * Loading training data
		 */
		DataSource loader = new DataSource(arffFileName);
		Instances trainingData = loader.getDataSet();

		/*
		 *  Setting the index of the class attribute.
		 *  In our case, the first attribute "language_class" is the class attribute.
		 */
		trainingData.setClassIndex(0);

		/*
		 * Creating MultiFilter, because we use two filters:
		 * 1. StringToWordVector filter, and
		 * 2. AttributeSelection filter.
		 */
		Filter[] filters = new Filter[2];
		filters[0] = createWordVectorFilter(trainingData);
		filters[1] = createAttributeSelectionFilter();

		MultiFilter multiFilter = new MultiFilter();
		multiFilter.setFilters(filters);

		/*
		 * Now we are creating the classifier based on these filtered data.
		 * In this code, you may choose between the J48 classifier and the Naive Bayes classifier.
		 */

		/*
		 * If you want to use the J48 classifier, uncomment the following line:
		 */
		//	J48 classifier = new J48();

		/*
		 * Creating the Naive Bayes classifier
		 */
		NaiveBayes classifier = new NaiveBayes();

		/*
		 * Setting the supervised discretization to true, because our attributes (tokens)
		 * have numeric values, which need to be discretized.
		 */
		classifier.setUseSupervisedDiscretization(true);

		/*
		 * The new classifier is kept in a local variable until it has been trained.
		 */
		FilteredClassifier trainedClassifier = new FilteredClassifier();
		trainedClassifier.setClassifier(classifier);

		/*
		 * Adding the filter.
		 */
		trainedClassifier.setFilter(multiFilter);

		/*
		 * Adding data (this is where the training happens).
		 */
		trainedClassifier.buildClassifier(trainingData);

		/*
		 * Training succeeded: from now on this object uses the new classifier,
		 * even if serializing or evaluating it below fails.
		 */
		filteredClassifier = trainedClassifier;

		/*
		 * Serializing filtered classifier into a file in the "classifiers" folder
		 */
		SerializationHelper.write(classifierPath, trainedClassifier);

		/*
		 * Evaluating the classifier on the training set.
		 * You may try creating a distinct test set and evaluating the classifier on it.
		 */
		Evaluation eval = new Evaluation(trainingData);
		eval.evaluateModel(trainedClassifier, trainingData);

		/*
		 * Printing the evaluation summary with the confusion matrix
		 */
		// System.out.println(filteredClassifier);
		System.out.println(eval.toSummaryString());
		System.out.println(eval.toMatrixString());
	}

	/*
	 * Creates the StringToWordVector filter, the most important text mining function in WEKA.
	 * It converts String attributes (original text data) into a set of attributes
	 * representing word occurrence (depending on the tokenizer) information
	 * from the text contained in the strings. The set of words (attributes)
	 * is determined by the first batch filtered (typically training data).
	 */
	private StringToWordVector createWordVectorFilter(Instances trainingData) throws Exception {
		StringToWordVector textToWordFilter = new StringToWordVector();

		/*
		 *  WEKA provides several tokenizers, intended to break the original texts into tokens
		 *  according to a number of rules. The most simple tokenizer is the weka.core.tokenizers.WordTokenizer,
		 *  which splits the string into tokens by using a list of separators
		 *  that can be set by clicking on the tokenizer name.
		 *  It is a nice idea to give a look at the texts we have before setting up
		 *  the list of separating characters. In our case, we will use the default list
		 *  of separating characters: \r\n\t\.,;:'"()?!
		 */
		textToWordFilter.setTokenizer(new WordTokenizer());

		/*
		 * We set this option to keep as many words as we can,
		 * to include the full vocabulary of the dataset.
		 * You can change this value to, let's say, 1 million,
		 * but it is likely that the training process will last much longer.
		 */
		textToWordFilter.setWordsToKeep(10000);

		/*
		 * We set this option to True in order to make the filter collect word tokens
		 * over the classes as a whole. This should be the standard setting
		 * in nearly all text classification problems.
		 */
		textToWordFilter.setDoNotOperateOnPerClassBasis(true);

		/*
		 *  We set this option to True because we are interested in the words
		 *  independently of using upper or lower case.
		 *  In other problems, like e.g. when processing Social Networks text,
		 *  keeping the capitalization may be critical for getting a good accuracy.
		 */
		textToWordFilter.setLowerCaseTokens(true);

		/*
		 * Adding training data. By Weka convention the input format is set only
		 * after all options of the filter have been set.
		 */
		textToWordFilter.setInputFormat(trainingData);

		return textToWordFilter;
	}

	/*
	 * Creates the second filter. Most of the tokens (words) we received as an output
	 * of the StringToWordVector filter will be useless for Language Identification.
	 * Thus we make a more precise analysis of the tokens by using Attribute Selection
	 * in conjunction with some kind of quality metric, like Information Gain.
	 */
	private AttributeSelection createAttributeSelectionFilter() {
		/*
		 * Before we create AttributeSelection filter, we need to set the Ranker algorithm.
		 * We set the threshold for keeping attributes as 0.0 in the weka.attributeSelection.Ranker
		 * search method. This means that we will keep only those attributes with
		 * Information Gain score over 0, and they will be sorted according to their score as well.
		 */
		Ranker ranker = new Ranker();
		ranker.setThreshold(0.0);

		AttributeSelection asFilter = new AttributeSelection();

		/*
		 * We use the Information Gain algorithm for evaluating tokens.
		 * More info: http://en.wikipedia.org/wiki/Information_gain_in_decision_trees
		 */
		asFilter.setEvaluator(new InfoGainAttributeEval());

		/*
		 * Setting the ranker with the 0.0 threshold.
		 */
		asFilter.setSearch(ranker);

		return asFilter;
	}

	/**
	 * A method for classifying a particular text: prints the predicted class label
	 * to standard output.
	 *
	 * @param text the text to classify
	 * @throws Exception if the text cannot be classified
	 *                   (see {@link #classifyText(String)})
	 */
	public void checkTextInstace(String text) throws Exception {
		System.out.println("Class predicted: " + classifyText(text));
	}

	/**
	 * Classifies a particular text and returns the predicted class label.
	 *
	 * @param text the text to classify; must not be {@code null}
	 * @return the predicted value of the "language_class" attribute:
	 *         "EN", "FR", or "SP"
	 * @throws IllegalArgumentException if {@code text} is {@code null}
	 * @throws IllegalStateException    if no classifier is available, or the
	 *                                  classifier made no prediction
	 * @throws Exception                if Weka fails to classify the instance
	 */
	public String classifyText(String text) throws Exception {
		if (text == null) {
			throw new IllegalArgumentException("text must not be null");
		}
		if (filteredClassifier == null) {
			throw new IllegalStateException("No classifier available: it could neither be loaded from '"
					+ classifierPath + "' nor built from '" + arffFileName + "'");
		}

		/*
		 * We create a new instance of the data having the same attributes as the original training dataset.
		 * We use the class FastVector in order to create a vector of possible values for
		 * the "language_class" attribute: EN, FR, and SP.
		 */
		FastVector languageClasses = new FastVector(LANGUAGE_LABELS.length);
		for (int i = 0; i < LANGUAGE_LABELS.length; i++) {
			languageClasses.addElement(LANGUAGE_LABELS[i]);
		}

		/*
		 * Creating the attributes separately.
		 * Passing a null vector of values makes "text" a String attribute.
		 */
		Attribute languageAttr = new Attribute("language_class", languageClasses);
		Attribute textAttr = new Attribute("text", (FastVector) null);

		/*
		 * Now we create a Fast Vector of these attributes.
		 */
		FastVector attributes = new FastVector();
		attributes.addElement(languageAttr);
		attributes.addElement(textAttr);

		/*
		 * Creating a new data instance in order to be checked with the classifier.
		 * Arguments include: the name of the new dataset ("test data"), the attributes themselves,
		 * and the data length. Since we have only one sentence, only one text instance, we set the
		 * length to 1.
		 */
		Instances data = new Instances("test data", attributes, 1);

		/*
		 * Setting the index of the class attribute.
		 * In our case, the first attribute "language_class" is the class attribute.
		 * Important: The order of attributes in this dataset needs to be the same as the
		 * order of attributes in the training dataset!
		 */
		data.setClassIndex(0);

		/*
		 * Creating the particular instance in order to be put in the new dataset of Instances.
		 * We use the text provided as an argument to this method.
		 */
		Instance instance = new Instance(2);
		instance.setDataset(data);
		instance.setValue(textAttr, text);

		/*
		 * We add this one instance to our new dataset
		 */
		data.add(instance);

		/*
		 * The result of the classification process is the number of the predicted attribute value:
		 * 0 for EN, 1 for FR, 2 for SP.
		 */
		double pred = filteredClassifier.classifyInstance(instance);

		/*
		 * Weka reports "no prediction" as a missing value, which is represented as NaN.
		 * Casting NaN to int would silently yield 0 (i.e. "EN"), so we reject it explicitly.
		 */
		if (Double.isNaN(pred)) {
			throw new IllegalStateException("The classifier could not predict a class for the given text");
		}

		/*
		 * We convert the prediction to an int and get its class attribute value, "EN", "FR", or "SP".
		 */
		return data.classAttribute().value((int) pred);
	}

}
