/*
 * Authors: 
 * Uros Krcadinac - uros@krcadinac.com
 * Nikola Milikic - nikola.milikic@gmail.com
 * 
 * GOOD OLD AI Research Lab - http://goodoldai.org
 * Intelligent Systems (FON, University of Belgrade) - http://is.fon.rs/
 * 
 * November, 2013.
 * 
 * This code is partially based on these very useful resources for text mining in Weka:
 * http://jmgomezhidalgo.blogspot.com.es/2013/04/a-simple-text-classifier-in-java-with.html
 * http://jmgomezhidalgo.blogspot.com.es/2013/05/language-identification-as-text.html
 * http://jmgomezhidalgo.blogspot.com.es/2013/06/baseline-sentiment-analysis-with-weka.html
 * 
 * (Note: if you plan to use the code from these sites, please use Weka 3.7.
 * Our code is using Weka 3.6. There are some differences in terms of API.) 
 * 
 */

package rs.fon.is.weka.textmining.sa;

import java.util.ArrayList;
import java.util.List;

import weka.attributeSelection.InfoGainAttributeEval;
import weka.attributeSelection.Ranker;
import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;
import weka.classifiers.bayes.NaiveBayes;
import weka.classifiers.meta.FilteredClassifier;
import weka.classifiers.trees.J48;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.SelectedTag;
import weka.core.SerializationHelper;
import weka.core.converters.ConverterUtils.DataSource;
import weka.core.tokenizers.NGramTokenizer;
import weka.filters.Filter;
import weka.filters.MultiFilter;
import weka.filters.supervised.attribute.AttributeSelection;
import weka.filters.unsupervised.attribute.StringToWordVector;

public class SentimentAnalysis {

	/*
	 * Weka classifier for text classification.
	 * Stays null until a classifier has been loaded from file or trained successfully.
	 */
	private FilteredClassifier filteredClassifier;
	
	/*
	 * Weka classifier serialized as a separate file in the folder "classifiers" 
	 */
	private final String classifierPath;
	
	/*
	 * Path to the ARFF file with the initial data 
	 */
	private final String arffFileName;
	
	/*
	 * This constructor either:
	 * (1) loads a classifier from a file (if there is an appropriate file in the "classifiers" folder); or
	 * (2) builds a classifier from scratch (if there is no such file).
	 * 
	 * classifierPath - path of the serialized classifier file (read first, written after training)
	 * arffFileName   - path to the ARFF file with the training data
	 */
	public SentimentAnalysis(String classifierPath, String arffFileName) {
		this.classifierPath = classifierPath;
		this.arffFileName = arffFileName;
		
		loadOrBuildClassifier();
	}
	
	/*
	 * Tries to deserialize the classifier from classifierPath. If that fails for any reason
	 * (e.g. there is no classifier file), the exception is caught in the first catch block,
	 * and a new classifier is built and serialized in a file.
	 * 
	 * Returns the loaded or newly built classifier. If building fails as well, the stack trace
	 * is printed and the classifier held by this object is left unchanged (null when nothing
	 * has been loaded or trained before).
	 */
	public Classifier loadOrBuildClassifier() {
		try {
			filteredClassifier = (FilteredClassifier) SerializationHelper.read(classifierPath);
		} catch (Exception loadFailure) {
			// Loading is best-effort: any failure simply triggers training from the ARFF data.
			try {
				buildClassifier();
			} catch (Exception buildFailure) {
				buildFailure.printStackTrace();
			}
		}
		return filteredClassifier;
	}

	/*
	 * The most important method for building a classifier.
	 * 
	 * Loads the training data from the ARFF file, trains a FilteredClassifier 
	 * (StringToWordVector + AttributeSelection filters in front of a J48 tree), 
	 * serializes it to classifierPath and prints an evaluation on the training set.
	 * 
	 * Throws Exception if the data cannot be loaded, or if training, serialization 
	 * or evaluation fails.
	 */
	public void buildClassifier() throws Exception {
		
		/*
		 * Loading training data 
		 */
		DataSource loader = new DataSource(arffFileName); 
		Instances trainingData = loader.getDataSet();
		
		/*
		 *  Setting the index of the class attribute. 
		 *  In our case, the second attribute "sentiment_class" is the class attribute.
		 */
		trainingData.setClassIndex(1);

		/*
		 * Creating MultiFilter, because we use two filters:
		 * 1. StringToWordVector filter, and
		 * 2. AttributeSelection filter.
		 */
		Filter[] filters = new Filter[2];
		filters[0] = createTextToWordFilter(trainingData);
		filters[1] = createAttributeSelectionFilter();
		
		MultiFilter multiFilter = new MultiFilter();
		multiFilter.setFilters(filters);
		
		/*
		 * Now we are creating the classifier based on these filtered data.
		 * This code uses the J48 classifier. To use Naive Bayes instead, replace the 
		 * following line with:
		 *     NaiveBayes classifier = new NaiveBayes();
		 *     classifier.setUseSupervisedDiscretization(true);
		 * (supervised discretization, because our attributes (tokens) have numeric values,
		 * which need to be discretized).
		 */
		J48 classifier = new J48();

		/*
		 * Training into a local variable first, so that a failed training run never leaves
		 * an untrained classifier in the field.
		 */
		FilteredClassifier trainedClassifier = new FilteredClassifier();
		trainedClassifier.setClassifier(classifier);
		trainedClassifier.setFilter(multiFilter);
		trainedClassifier.buildClassifier(trainingData);
		filteredClassifier = trainedClassifier;
		
		/*
		 * Serializing filtered classifier into a file in the "classifiers" folder
		 */
		SerializationHelper.write(classifierPath, filteredClassifier);
		
		/*
		 * Evaluating the classifier on the training set.
		 * You may try creating a distinct test set and evaluating the classifier on it.
		 */
		Evaluation eval = new Evaluation(trainingData); 
		eval.evaluateModel(filteredClassifier, trainingData); 
		
		/*
		 * Printing the evaluation summary with the confusion matrix 
		 */
		System.out.println(filteredClassifier);
		System.out.println(eval.toSummaryString()); 
		System.out.println(eval.toMatrixString());   
	}

	/*
	 * Creates the StringToWordVector filter, the most important text mining function in WEKA.
	 * It converts String attributes (original text data) into a set of attributes 
	 * representing word occurrence (depending on the tokenizer) information 
	 * from the text contained in the strings.
	 */
	private StringToWordVector createTextToWordFilter(Instances trainingData) throws Exception {
		StringToWordVector textToWordfilter = new StringToWordVector();
		
		/*
		 * Multi-word expressions (e.g. "very bad" vs. "bad", or "a must" vs. "I must") 
		 * can lead to better predictors of user sentiment or opinion about an item. 
		 * Because of this, the tokenizer produces both unigrams and bigrams.
		 */
		NGramTokenizer tokenizer = new NGramTokenizer(); 
		tokenizer.setNGramMinSize(1); 
		tokenizer.setNGramMaxSize(2); 
		textToWordfilter.setTokenizer(tokenizer);
		
		/*
		 * set to which attribute the filter should be applied to 
		 * (the first attribute holds the text) 
		 */
		textToWordfilter.setAttributeIndices("first");
		
		/*
		 * set the prefix to be assigned to the newly created attributes 
		 * so that they can be distinguished from the original attributes
		 */
		textToWordfilter.setAttributeNamePrefix("W_");
		
		/*
		 * Set the number of words to keep per each class
		 */
		textToWordfilter.setWordsToKeep(3000);
		
		/*
		 * set all words to small letter
		 */
		textToWordfilter.setLowerCaseTokens(true);
		
		/*
		 * normalize word frequencies at the document level; this is particularly relevant when 
		 * documents in the corpus vary in terms of their length (number of words)  
		 */
		textToWordfilter.setNormalizeDocLength(new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL, 
																StringToWordVector.TAGS_FILTER));
			
		/*
		 * Adding the data
		 */
		textToWordfilter.setInputFormat(trainingData);
		
		return textToWordfilter;
	}

	/*
	 * Creates the attribute selection filter: attributes are evaluated by information gain 
	 * and ranked with a Ranker whose threshold is set to 0.0.
	 */
	private AttributeSelection createAttributeSelectionFilter() {
		AttributeSelection asFilter = new AttributeSelection();
		asFilter.setEvaluator(new InfoGainAttributeEval());
		
		Ranker ranker = new Ranker();
		ranker.setThreshold(0.0);
		asFilter.setSearch(ranker);
		
		return asFilter;
	}

	/*
	 * A method for classifying a particular text.
	 * Prints "Positive sentiment" if the predicted class is "yes", "Negative sentiment" otherwise.
	 * 
	 * text - the text to be classified
	 * 
	 * Throws IllegalStateException if no classifier is available (neither loading nor 
	 * building succeeded), and Exception if the classification itself fails.
	 */
	public void checkTextInstace(String text) throws Exception {
		
		/*
		 * Without a classifier there is nothing to classify with; failing here with a clear
		 * message is more helpful than a NullPointerException further down.
		 */
		if (filteredClassifier == null) {
			throw new IllegalStateException("No classifier available: it could neither be loaded from '"
					+ classifierPath + "' nor built from '" + arffFileName + "'");
		}
		
		/*
		 * We create a new instance of the data having the same attributes as the original training dataset.
		 * First, the list of possible values for the "sentiment_class" attribute: "no" and "yes".
		 */
		List<String> sentimentClasses = new ArrayList<>(2);
		sentimentClasses.add("no");
		sentimentClasses.add("yes");
		
		/*
		 * Creating the attributes separately.
		 * Passing a null list of values makes "text" a string attribute.
		 */
		Attribute sentimentAttr = new Attribute("sentiment_class", sentimentClasses);
		Attribute textAttr = new Attribute("text", (List<String>) null);
		
		/*
		 * Now we create a list of these attributes.
		 */
		ArrayList<Attribute> attributes = new ArrayList<>(2);
		attributes.add(textAttr);
		attributes.add(sentimentAttr);
		
		/*
		 * Creating a new dataset for the instance to be checked with the classifier.
		 * Arguments include: the name of the new dataset ("test data"), the attributes themselves,
		 * and the data length. Since we have only one sentence, only one text instance, we set the
		 * length to 1.
		 */
		Instances data = new Instances("test data", attributes, 1);
		
		/*
		 * Setting the index of the class attribute. 
		 * In our case, the second attribute "sentiment_class" is the class attribute.
		 * Important: The order of attributes in this dataset need to be the same as the 
		 * order of attributes in the training dataset!
		 */
		data.setClassIndex(1);

		/*
		 * Creating the particular instance in order to be put in the new dataset of Instances.
		 * We use the text provided as an argument to this method. 
		 */
		Instance instance = new DenseInstance(2);
		instance.setDataset(data);
		instance.setValue(textAttr, text);
		
		/*
		 * We add this one instance to our new dataset
		 */
		data.add(instance);		

		/*
		 * The result of the classification process is the index of the predicted class value:
		 * 0 for "no", 1 for "yes". 
		 * We convert it to int and look up the corresponding class label, "no" or "yes".
		 * Labels are compared with equals(), because == only compares object identity.
		 */
		double pred = filteredClassifier.classifyInstance(instance);
		String predictedLabel = data.classAttribute().value((int) pred);
		if ("yes".equals(predictedLabel)) {
			System.out.println("Positive sentiment");
		} else {
			System.out.println("Negative sentiment");
		}
	}

}
