/*
 * Authors: 
 * Uros Krcadinac - uros@krcadinac.com
 * Nikola Milikic - nikola.milikic@gmail.com
 * 
 * GOOD OLD AI Research Lab - http://goodoldai.org
 * Intelligent Systems (FON, University of Belgrade) - http://is.fon.rs/
 * 
 * November, 2013.
 * 
 * The method used in this code is partially based on these very useful resources for text mining in Weka:
 * http://jmgomezhidalgo.blogspot.com.es/2013/04/a-simple-text-classifier-in-java-with.html
 * http://jmgomezhidalgo.blogspot.com.es/2013/05/language-identification-as-text.html
 * http://jmgomezhidalgo.blogspot.com.es/2013/06/baseline-sentiment-analysis-with-weka.html
 * 
 * (Note: if you plan to use the code from these sites, please use Weka 3.7. 
 * Our code is using Weka 3.6. There are some differences in terms of API.) 
 * 
 */
package rs.fon.is.weka.textmining.li;

import java.util.ArrayList;
import java.util.List;

import weka.attributeSelection.InfoGainAttributeEval;
import weka.attributeSelection.Ranker;
import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;
import weka.classifiers.bayes.NaiveBayes;
import weka.classifiers.meta.FilteredClassifier;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.SelectedTag;
import weka.core.SerializationHelper;
import weka.core.converters.ConverterUtils.DataSource;
import weka.core.tokenizers.WordTokenizer;
import weka.filters.Filter;
import weka.filters.MultiFilter;
import weka.filters.supervised.attribute.AttributeSelection;
import weka.filters.unsupervised.attribute.ClassAssigner;
import weka.filters.unsupervised.attribute.StringToWordVector;

/**
 * Language identification as a text classification task, built on top of Weka.
 *
 * On construction the class tries to load a previously serialized
 * {@link FilteredClassifier} from a file; if that fails, it trains a new one
 * from an ARFF file and serializes it. The classifier can then be used to
 * predict the language ("EN", "FR" or "SP") of an arbitrary text.
 */
public class LanguageIndentification {

	/*
	 * Name of the class attribute. It has to match the class attribute
	 * of the training dataset.
	 */
	private static final String CLASS_ATTRIBUTE_NAME = "language_class";

	/*
	 * Name of the string attribute that holds the raw text.
	 */
	private static final String TEXT_ATTRIBUTE_NAME = "text";

	/*
	 * Possible values of the class attribute, in the order in which the
	 * classifier numbers them: 0 for EN, 1 for FR, 2 for SP.
	 */
	private static final String[] LANGUAGE_LABELS = { "EN", "FR", "SP" };

	/*
	 * Index of the class attribute; "language_class" is the first attribute.
	 */
	private static final int CLASS_ATTRIBUTE_INDEX = 0;

	/*
	 * The number of words to keep depends on the size of the
	 * corpus we work with.
	 */
	private static final int WORDS_TO_KEEP = 5000;

	/*
	 * Only attributes (tokens) with Information Gain above this
	 * threshold are kept by the attribute selection filter.
	 */
	private static final double INFO_GAIN_THRESHOLD = 0.0;

	/*
	 * Weka classifier for text classification.
	 * Stays null until a classifier has been loaded or trained successfully.
	 */
	private FilteredClassifier filteredClassifier;

	/*
	 * Weka classifier serialized as a separate file in the folder "classifiers" 
	 */
	private final String classifierPath;

	/*
	 * Path to the ARFF file with the initial data 
	 */
	private final String arffFileName;

	/**
	 * This constructor either:
	 * (1) loads a classifier from a file (if there is an appropriate file in the "classifiers" folder); or
	 * (2) builds a classifier from scratch (if there is no such file).
	 *
	 * @param classifierPath path of the serialized classifier file (read if present, written after training)
	 * @param arffFileName   path of the ARFF file with the training data
	 */
	public LanguageIndentification(String classifierPath, String arffFileName) {
		this.classifierPath = classifierPath;
		this.arffFileName = arffFileName;

		loadOrBuildClassifier();
	}

	/**
	 * Loads the serialized classifier from {@code classifierPath}. If it cannot be
	 * read (typically because the file does not exist yet), a new classifier is
	 * built from the training data and serialized in a file.
	 *
	 * Neither failure is propagated to the caller: the reason the load failed is
	 * reported on standard error, and a failed build prints its stack trace.
	 *
	 * @return the classifier currently held by this object; {@code null} if no
	 *         classifier could be loaded or trained so far
	 */
	public Classifier loadOrBuildClassifier() {
		try {
			filteredClassifier = (FilteredClassifier) SerializationHelper.read(classifierPath);
		} catch (Exception loadFailure) {
			/*
			 * Not being able to load is expected on the first run, so this is not fatal.
			 * The cause is still reported, because a corrupted file or a file holding
			 * a different kind of object ends up here as well.
			 */
			System.err.println("Could not load a classifier from '" + classifierPath
					+ "' (" + loadFailure + "). Building a new one from '" + arffFileName + "'.");
			try {
				buildClassifier();
			} catch (Exception buildFailure) {
				buildFailure.printStackTrace();
			}
		}
		return filteredClassifier;
	}

	/**
	 * The most important method for building a classifier.
	 *
	 * It loads the training data, chains the two filters (StringToWordVector and
	 * AttributeSelection), trains a Naive Bayes classifier on the filtered data,
	 * serializes the result to {@code classifierPath}, and prints an evaluation
	 * on the training set.
	 *
	 * The classifier held by this object is replaced only after training has
	 * succeeded, so a failed training never leaves an untrained classifier behind.
	 *
	 * @throws Exception if the training data cannot be loaded, or if training,
	 *                   serialization or evaluation fails
	 */
	public void buildClassifier() throws Exception {

		/*
		 * Loading training data 
		 */
		DataSource loader = new DataSource(arffFileName);
		Instances trainingData = loader.getDataSet();

		/*
		 *  Setting the index of the class attribute. 
		 *  In our case, the first attribute "language_class" is the class attribute. 
		 */
		trainingData.setClassIndex(CLASS_ATTRIBUTE_INDEX);

		/*
		 * Creating MultiFilter, because we use two filters:
		 * 1. StringToWordVector filter, and
		 * 2. AttributeSelection filter.
		 *
		 * Note that we do not call setInputFormat() on the filters ourselves:
		 * the FilteredClassifier initializes the filter with the training data
		 * when it is built, after all the filter options have been set.
		 */
		MultiFilter multiFilter = new MultiFilter();
		multiFilter.setFilters(new Filter[] { createWordVectorFilter(), createAttributeSelectionFilter() });

		/*
		 * Now we are creating the classifier based on these filtered data.
		 * Here we use the Naive Bayes classifier. If you want to try another one,
		 * e.g. the J48 decision tree (weka.classifiers.trees.J48), create it here instead.
		 *
		 * Our attributes (tokens) have numeric values. NaiveBayes can discretize them
		 * via setUseSupervisedDiscretization(true); this option is left at its default here.
		 */
		NaiveBayes classifier = new NaiveBayes();

		FilteredClassifier trainedClassifier = new FilteredClassifier();
		trainedClassifier.setClassifier(classifier);

		/*
		 * Adding the filter.
		 */
		trainedClassifier.setFilter(multiFilter);

		/*
		 * Adding data.
		 */
		trainedClassifier.buildClassifier(trainingData);

		/*
		 * Training succeeded, so from now on this is the classifier we use.
		 */
		filteredClassifier = trainedClassifier;

		/*
		 * Serializing the built filtered classifier into a file in the "classifiers" folder
		 */
		SerializationHelper.write(classifierPath, trainedClassifier);

		/*
		 * Evaluating the classifier on the training set.
		 * You may try creating a distinct test set and evaluating the classifier on it.
		 */
		Evaluation eval = new Evaluation(trainingData);
		eval.evaluateModel(trainedClassifier, trainingData);

		/*
		 * Printing the evaluation summary with the confusion matrix 
		 */
		System.out.println(trainedClassifier);
		System.out.println(eval.toSummaryString());
		System.out.println(eval.toMatrixString());
	}

	/*
	 *  Creating StringToWordVector filter, the most important text mining function in WEKA.
	 *  Converting the String attribute (original text data) into a set of attributes 
	 *  representing word occurrence (depending on the tokenizer) information 
	 *  from the text contained in the strings. The set of words (attributes) 
	 *  is determined by processing the training data.
	 */
	private StringToWordVector createWordVectorFilter() {
		StringToWordVector textToWordFilter = new StringToWordVector();

		/*
		 * set the index(es) of the attribute(s) the filter is to be applied to
		 * in our case, it is just one attribute, the second (last) one
		 */
		textToWordFilter.setAttributeIndices("last");
		// alternative: textToWordFilter.setAttributeIndicesArray(new int[] {1});

		/*
		 *  WEKA provides several tokenizers, intended to break the original texts into tokens 
		 *  according to a number of rules. The simplest tokenizer is the weka.core.tokenizers.WordTokenizer, 
		 *  which splits the string into tokens by using a list of separators 
		 *  that can be specified through the setDelimiters(String) method. 
		 *  It is a good idea to take a close look at the texts we have before setting up 
		 *  the list of separating characters. In our case, we will use the default list
		 *  of separating characters: \r\n\t\.,;:'"()?!
		 */
		textToWordFilter.setTokenizer(new WordTokenizer());

		/*
		 * The number of words to keep depends on the size of the 
		 * corpus we work with.
		 */
		textToWordFilter.setWordsToKeep(WORDS_TO_KEEP);

		/*
		 *  We set this option to True because we are interested in the words 
		 *  independently of using upper or lower case. 
		 *  In other problems, like e.g. when processing spam messages, 
		 *  keeping the capitalization may be critical for getting a good accuracy.
		 */
		textToWordFilter.setLowerCaseTokens(true);

		/*
		 * assign a prefix to the newly created attributes so that they can be distinguished
		 * from the original attributes
		 */
		textToWordFilter.setAttributeNamePrefix("W_");

		/*
		 * normalize word frequencies at the document level; this is particularly relevant when 
		 * documents in the corpus vary in terms of their length (number of words)  
		 */
		textToWordFilter.setNormalizeDocLength(new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL,
				StringToWordVector.TAGS_FILTER));

		/*
		 * output word counts instead of just 0/1 indicators of
		 * the presence of a word in a document
		 */
		textToWordFilter.setOutputWordCounts(true);

		return textToWordFilter;
	}

	/*
	 * Most of the tokens (words) we received as an output of the StringToWordVector filter
	 * will be useless for Language Identification. Thus we make a more precise analysis 
	 * of the tokens by using Attribute Selection in conjunction with 
	 * some kind of quality metric, like Information Gain. 
	 * This is the second filter!
	 */
	private AttributeSelection createAttributeSelectionFilter() {
		AttributeSelection asFilter = new AttributeSelection();

		/*
		 * We use the Information Gain algorithm for evaluating tokens.
		 * More info: http://en.wikipedia.org/wiki/Information_gain_in_decision_trees
		 */
		asFilter.setEvaluator(new InfoGainAttributeEval());

		/*
		 * We also need to specify how we will use the computed Information Gain for attribute selection.
		 * To that end, we use a Ranker and set the threshold for the Information Gain that an attribute 
		 * has to satisfy in order to be used in further processing.
		 * In this case we will keep attributes with Information Gain score over 0, 
		 * and they will be sorted according to their score as well.
		 */
		Ranker ranker = new Ranker();
		ranker.setThreshold(INFO_GAIN_THRESHOLD);
		asFilter.setSearch(ranker);

		return asFilter;
	}

	/**
	 * Predicts the language of the given text.
	 *
	 * @param text the text to classify; must not be null
	 * @return the predicted class attribute value: "EN", "FR", or "SP"
	 * @throws IllegalArgumentException if {@code text} is null
	 * @throws IllegalStateException    if no classifier has been loaded or built,
	 *                                  or if the classifier makes no prediction
	 * @throws Exception                if Weka fails to classify the instance
	 */
	public String classifyText(String text) throws Exception {
		if (text == null) {
			throw new IllegalArgumentException("text must not be null");
		}
		if (filteredClassifier == null) {
			throw new IllegalStateException("No classifier available: it could neither be loaded from '"
					+ classifierPath + "' nor built from '" + arffFileName + "'");
		}

		/*
		 * We create a new instance of the data having the same attributes as the original training dataset.
		 * First, we create the "language_class" attribute with possible values: EN, FR, and SP.
		 */
		ArrayList<String> languageClasses = new ArrayList<String>();
		for (String label : LANGUAGE_LABELS) {
			languageClasses.add(label);
		}
		Attribute languageAttr = new Attribute(CLASS_ATTRIBUTE_NAME, languageClasses);

		/*
		 * Next, we create the text attribute
		 * (a null list of values makes it a string attribute).
		 */
		Attribute textAttr = new Attribute(TEXT_ATTRIBUTE_NAME, (List<String>) null);

		/*
		 * Now we create a list of these attributes.
		 */
		ArrayList<Attribute> attributes = new ArrayList<Attribute>();
		attributes.add(languageAttr);
		attributes.add(textAttr);

		/*
		 * Creating a new dataset in order to test the classifier.
		 * Arguments include: the name of the new dataset ("test data"), the attributes,
		 * and the data length. Since we have only one sentence, only one text instance, we set the
		 * length to 1.
		 */
		Instances data = new Instances("test data", attributes, 1);

		/*
		 * Setting the index of the class attribute. 
		 * In our case, the first attribute "language_class" is the class attribute.
		 * Important: The order of attributes in this dataset needs to be the same as the 
		 * order of attributes in the training dataset!
		 */
		data.setClassIndex(CLASS_ATTRIBUTE_INDEX);

		/*
		 * Creating the particular instance in order to be put in the new dataset of Instances.
		 * We use the text provided as an argument to this method. 
		 */
		Instance instance = new DenseInstance(attributes.size());
		instance.setValue(textAttr, text);
		instance.setDataset(data);

		/*
		 * We add this one instance to our new dataset
		 */
		data.add(instance);

		/*
		 * The result of the classification process is the number of the predicted attribute value:
		 * 0 for EN, 1 for FR, 2 for SP. 
		 * Then we need to convert it to an integer and get its class attribute value, "EN", "FR", or "SP".
		 */
		double pred = filteredClassifier.classifyInstance(data.firstInstance());

		/*
		 * A missing prediction is represented as NaN. Casting NaN to int gives 0,
		 * which would be silently reported as "EN", so we reject it explicitly.
		 */
		if (Double.isNaN(pred)) {
			throw new IllegalStateException("The classifier made no prediction for the given text");
		}
		return data.classAttribute().value((int) pred);
	}

	/**
	 * A method for classifying a particular text.
	 * Prints the predicted class ("EN", "FR", or "SP") to standard output.
	 *
	 * @param text the text to classify; must not be null
	 * @throws Exception if the text cannot be classified (see {@link #classifyText(String)})
	 */
	public void checkTextInstace(String text) throws Exception {
		System.out.println("Class predicted: " + classifyText(text));
	}

}
