/*
 * 
 *  Copyright (C) 2011 Mateusz Kopec
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 *
 */

package resources;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import utils.CorpusManager;
import utils.Counter;
import corpusapi.ContinueMode;
import corpusapi.tei.TEICorpus;
import corpusapi.tei.TEICorpusText;
import corpusapi.tei.TEIMorphoSegmentGroup;
import corpusapi.tei.TEISegment;

/**
 * Container for word frequency statistics.
 * <p>
 * For every lexeme (base form) it keeps the total number of occurrences and
 * the number of documents the lexeme occurred in. From these two counters it
 * derives an inverse document frequency (see {@link #getIDF(String)}) and an
 * inverse lexeme frequency (see {@link #getILF(String)}).
 * <p>
 * Statistics are either computed from a corpus or loaded from a file
 * previously written by this class. The file format is: the number of
 * documents on the first line, followed by one
 * <code>lexeme:count:documentCount</code> record per line.
 * 
 * @author Mateusz Kopec
 * 
 */
public class FrequencyCounter {

	/** Inverse document frequency per lexeme, filled by calculateMeasures(). */
	private final Map<String, Double> idfs = new HashMap<String, Double>();
	/** Inverse lexeme frequency per lexeme, filled by calculateMeasures(). */
	private final Map<String, Double> ilfs = new HashMap<String, Double>();

	/** Number of documents the statistics were gathered from. */
	private int documentCount = 0;
	/** Total number of occurrences of each lexeme. */
	private final Counter counts = new Counter();
	/** Number of documents each lexeme occurred in. */
	private final Counter documentCounts = new Counter();

	/**
	 * Load stats from file
	 * 
	 * @param frequencyCounterPath
	 *            path to file
	 */
	public FrequencyCounter(String frequencyCounterPath) {
		loadFromFile(frequencyCounterPath);
	}

	/**
	 * Load stats from corpus
	 * 
	 * @param corpus
	 *            corpus to gather the statistics from
	 */
	public FrequencyCounter(TEICorpus corpus) {
		loadFromCorpus(corpus);
	}

	/**
	 * Reads raw counts from a file and derives idf/ilf values from them.
	 * 
	 * @param path
	 *            path to the statistics file
	 */
	private void loadFromFile(String path) {
		getCountsFromFile(path);
		calculateMeasures();
	}

	/**
	 * Gathers raw counts from a corpus and derives idf/ilf values from them.
	 * 
	 * @param corpus
	 *            corpus to process
	 */
	private void loadFromCorpus(TEICorpus corpus) {
		getCountsFromCorpus(corpus);
		calculateMeasures();
	}

	/**
	 * Saves statistics in a given file.
	 * <p>
	 * The first line holds the document count; each following line holds
	 * <code>lexeme:count:documentCount</code>. Records are ordered by
	 * descending count, ties broken by lexeme so that the output is
	 * deterministic. On an I/O error the stack trace is printed and the JVM
	 * exits with status 1.
	 * 
	 * @param path
	 *            path of the file to write
	 */
	private void saveIntoFile(String path) {
		List<Entry<String, Integer>> sorted = new ArrayList<Entry<String, Integer>>(counts.entrySet());
		Collections.sort(sorted, new Comparator<Entry<String, Integer>>() {

			@Override
			public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) {
				int byCount = o2.getValue().compareTo(o1.getValue());
				if (byCount != 0)
					return byCount;
				// equal counts: fall back to the lexeme for a stable file layout
				return o1.getKey().compareTo(o2.getKey());
			}
		});

		BufferedWriter bw = null;
		try {
			bw = new BufferedWriter(new FileWriter(new File(path)));
			bw.write(documentCount + "\n");
			for (Entry<String, Integer> w : sorted) {
				bw.write(w.getKey() + ":" + w.getValue() + ":" + documentCounts.get(w.getKey()) + "\n");
			}
		} catch (IOException e) {
			e.printStackTrace();
			System.exit(1);
		} finally {
			// bw stays null when the writer could not be created
			if (bw != null) {
				try {
					// close() flushes the buffer first
					bw.close();
				} catch (IOException e) {
					e.printStackTrace();
					System.exit(1);
				}
			}
		}
	}

	/**
	 * Derives the idf and ilf tables from the raw counters.
	 * <p>
	 * idf = ln(documentCount / (1 + documentsWithLexeme)), ilf = 1 / ln(1 +
	 * occurrences). Note that a stored occurrence count of 0 yields an
	 * infinite ilf.
	 */
	private void calculateMeasures() {
		for (Entry<String, Integer> w : documentCounts.entrySet()) {
			idfs.put(w.getKey(), Math.log(1.0 * documentCount / (1 + w.getValue())));
		}

		for (Entry<String, Integer> w : counts.entrySet()) {
			ilfs.put(w.getKey(), 1.0 / Math.log(1 + w.getValue()));
		}
	}

	/**
	 * Walks through all texts of the corpus and counts base forms of the
	 * chosen interpretation of every segment.
	 * <p>
	 * Base forms that are empty or contain ':' are ignored (':' is the field
	 * separator of the statistics file). A text that cannot be processed is
	 * reported and skipped as a whole: its counts are committed only after all
	 * of its segments were read successfully, so the occurrence and document
	 * counters stay consistent with each other.
	 * 
	 * @param corpus
	 *            corpus to process
	 */
	private void getCountsFromCorpus(TEICorpus corpus) {
		System.out.println("Calculation freqs...");

		int processedSegments = 0;
		int processedTexts = 0;

		int numberOfTexts = corpus.getCorpusTextIds().size();
		// report progress roughly every 10%; at least 1 to avoid "% 0" for
		// corpora with fewer than 10 texts
		int progressStep = Math.max(1, numberOfTexts / 10);
		int c = 0;

		for (String corpusTextId : corpus.getCorpusTextIds()) {

			if (++c % progressStep == 0)
				System.out.println("Processing text nr: " + c);

			TEICorpusText corpusText = corpus.getCorpusText(corpusTextId);

			// base forms of this text only, one entry per counted segment;
			// merged into the global counters once the text is read completely
			List<String> basesInText = new ArrayList<String>();
			try {
				// get first segment in text
				TEISegment segment = corpusText.getFirstSegment();
				if (segment == null) {
					throw new Exception("NULL first segment!");
				}

				// iterate through all segments in text and collect base forms
				int segmentsInText = 0;
				do {
					segmentsInText++;

					TEIMorphoSegmentGroup morph = (TEIMorphoSegmentGroup) segment
							.getSegmentGroup(TEIMorphoSegmentGroup.class);
					String baseForm = morph.getChosenInterpretation().getBase();

					if (!baseForm.contains(":") && !baseForm.equals("")) {
						basesInText.add(baseForm);
					}

				} while ((segment = segment.getNext(ContinueMode.ALWAYS_CONTINUE)) != null);

				// whole text was read: commit its counts
				for (String base : basesInText)
					counts.increase(base);

				Collection<String> distinctBases = new HashSet<String>(basesInText);
				for (String base : distinctBases)
					documentCounts.increase(base);

				processedSegments += segmentsInText;
				processedTexts++;

			} catch (Exception e) {
				e.printStackTrace();
				// getMessage() may be null (e.g. for a NullPointerException)
				String reason = e.getMessage() != null ? e.getMessage() : e.toString();
				System.out.println(corpusText.getPath() + " Skipping text because of: "
						+ reason.replaceAll("\\n", " "));
			} finally {
				corpusText.closeCorpusText();
			}
		}

		documentCount = processedTexts;

		System.out.println("Processed " + processedTexts + " texts out of " + numberOfTexts);
		System.out.println("Processed segments: " + processedSegments);
		System.out.println("Done.");
	}

	/**
	 * Reads the document count and the per-lexeme counters from a file written
	 * by {@link #saveIntoFile(String)}.
	 * <p>
	 * Blank lines are skipped. On an I/O error the stack trace is printed and
	 * the JVM exits with status 1.
	 * 
	 * @param path
	 *            path of the statistics file
	 * @throws IllegalArgumentException
	 *             if the file is empty, a record has fewer than three fields
	 *             or a number cannot be parsed (NumberFormatException)
	 */
	private void getCountsFromFile(String path) {
		System.out.println("Loading freqs from file...");

		BufferedReader br = null;
		try {
			br = new BufferedReader(new FileReader(new File(path)));
			String line = br.readLine();
			if (line == null) {
				throw new IllegalArgumentException("Frequency file is empty: " + path);
			}
			documentCount = Integer.parseInt(line.trim());

			while ((line = br.readLine()) != null) {
				if (line.trim().length() == 0) {
					// tolerate blank lines, e.g. at the end of the file
					continue;
				}
				String[] spl = line.split(":");
				if (spl.length < 3) {
					throw new IllegalArgumentException("Malformed line in frequency file " + path + ": " + line);
				}
				String word = spl[0];
				int count = Integer.parseInt(spl[1]);
				int docCount = Integer.parseInt(spl[2]);
				counts.put(word, count);
				documentCounts.put(word, docCount);
			}
		} catch (IOException e) {
			// also covers FileNotFoundException
			e.printStackTrace();
			System.exit(1);
		} finally {
			// br stays null when the reader could not be created
			if (br != null) {
				try {
					br.close();
				} catch (IOException e) {
					e.printStackTrace();
					System.exit(1);
				}
			}
		}

		System.out.println("Done");
	}

	/**
	 * Gets inverse document frequency for a lexeme
	 * 
	 * @param lexeme
	 *            lexeme (base form) to look up
	 * @return idf of the lexeme, or ln(documentCount) if the lexeme is
	 *         unknown
	 */
	public double getIDF(String lexeme) {
		Double d = idfs.get(lexeme);
		if (d == null) {
			return Math.log(1.0 * documentCount);
		}
		return d;
	}

	/**
	 * Gets inverse lexeme frequency for a lexeme
	 * 
	 * @param lexeme
	 *            lexeme (base form) to look up
	 * @return ilf of the lexeme, or 1 if the lexeme is unknown
	 */
	public double getILF(String lexeme) {
		Double d = ilfs.get(lexeme);
		if (d == null) {
			return 1;
		}
		return d;
	}

	/**
	 * Entry point to frequency counter program. Needs two arguments: path to
	 * the corpus config file, and output file path. It calculates frequency
	 * statistics for the corpus and saves them in a given file.
	 * 
	 * @param args
	 *            corpus config file path followed by the output file path
	 */
	public static void main(String[] args) {
		if (args.length != 2) {
			System.out.println("Wrong number of arguments specified.");
			System.out.println("Should be: FrequencyCounter.jar corpusConfigFilePath outputPath");
			System.exit(1);
		}

		String corpusPath = args[0];
		String outputFilePath = args[1];

		File corp = new File(corpusPath);
		if (!corp.exists() || !corp.isFile()) {
			System.out.println("Corpus config file doesn't exist.");
			System.exit(1);
		}

		try {
			FrequencyCounter fc = new FrequencyCounter(CorpusManager.getCorpusFromConfigFile(corpusPath));
			fc.saveIntoFile(outputFilePath);
		} catch (Exception e) {
			System.out.println("Error creating corpus statistics. Details follow.");
			e.printStackTrace();
			System.exit(1);
		}
	}
}
