/*
 * 
 *  Copyright (C) 2011 Mateusz Kopec
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 *
 */
package utils;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

import corpusapi.ContinueMode;
import corpusapi.Corpus;
import corpusapi.CorpusFactory;
import corpusapi.SenseSegmentGroup;
import corpusapi.tei.TEICorpus;
import corpusapi.tei.TEICorpusText;
import corpusapi.tei.TEISegment;
import corpusapi.tei.TEISenseInventory;
import corpusapi.tei.TEISenseSegmentGroup;
import evaluation.AnnotationStats;

/**
 * Static helpers for corpora in TEI format: loading a corpus from its config
 * file, copying a random sample or a two-way split of its texts to disk, and
 * collecting simple statistics.
 * 
 * @author Mateusz Kopec
 * 
 */
public class CorpusManager {

	/**
	 * Loads and opens a corpus, given the corpus config file path. Progress is
	 * reported on standard output.
	 * 
	 * @param configFilePath
	 *            path to the corpus config file
	 * @return the opened corpus
	 * @throws Exception
	 *             if the factory cannot create the corpus or the corpus cannot
	 *             be opened
	 */
	public static TEICorpus getCorpusFromConfigFile(String configFilePath) throws Exception {
		System.out.println("Loading corpus from: " + configFilePath);
		final CorpusFactory factory = CorpusFactory.getInstance();
		final TEICorpus corpus = (TEICorpus) factory.getCorpus(configFilePath, true);
		corpus.open();
		System.out.println("Corpus with " + corpus.getCorpusTextIds().size() + " texts loaded.");
		return corpus;
	}

	/**
	 * Gets final evaluation part of wypluwka, loaded from
	 * {@code data/corpora/wypluwkaPart2.xml}.
	 * 
	 * @return corpus
	 * @throws Exception
	 *             if the corpus cannot be loaded
	 */
	public static TEICorpus getWypluwkaForFinalEvaluation() throws Exception {
		final String configFile = "data/corpora/wypluwkaPart2.xml";
		return getCorpusFromConfigFile(configFile);
	}

	/**
	 * Gets development part of wypluwka, loaded from
	 * {@code data/corpora/wypluwkaPart1.xml}.
	 * 
	 * @return corpus
	 * @throws Exception
	 *             if the corpus cannot be loaded
	 */
	public static TEICorpus getWypluwkaForDevelopment() throws Exception {
		final String configFile = "data/corpora/wypluwkaPart1.xml";
		return getCorpusFromConfigFile(configFile);
	}

	/**
	 * Samples corpus for a number of texts and saves them in a given directory.
	 * The random generator uses a fixed seed, so the same corpus always yields
	 * the same sample. If the corpus has no more than {@code textCount} texts,
	 * all of them are copied.
	 * 
	 * @param c
	 *            corpus
	 * @param textCount
	 *            number of texts to choose, must not be negative
	 * @param targetPath
	 *            path to save texts
	 * @throws IllegalArgumentException
	 *             if {@code textCount} is negative
	 */
	public static void getSampleFromCorpus(TEICorpus c, int textCount, String targetPath) {
		if (textCount < 0) {
			throw new IllegalArgumentException("textCount must not be negative: " + textCount);
		}
		Random r = new Random(1);
		// Work on a copy so the list handed out by the corpus is never modified.
		List<String> cids = new ArrayList<String>(c.getCorpusTextIds());
		while (cids.size() > textCount) {
			cids.remove(r.nextInt(cids.size()));
		}
		// Report the real sample size; it is smaller than textCount for small corpora.
		System.out.println("Randomly selected a sample of " + cids.size() + " texts.");

		System.out.println("Copying selected texts into " + targetPath);

		copyTexts(c, cids, targetPath);
		System.out.println("Done");
	}

	/**
	 * Splits corpus into two disjoint parts and copies each part to its own
	 * directory. The random generator uses a fixed seed, so the same corpus and
	 * proportion always yield the same split.
	 * 
	 * @param c
	 *            corpus
	 * @param proportion
	 *            share of texts going to the first part; should be between 0
	 *            and 1, the resulting text count is clamped to that range
	 * @param targetPath1
	 *            path to save first part
	 * @param targetPath2
	 *            path to save second part
	 */
	public static void splitCorpus(TEICorpus c, float proportion, String targetPath1, String targetPath2) {
		Random r = new Random(1);
		List<String> originalTextIds = c.getCorpusTextIds();
		List<String> cids1 = new ArrayList<String>(originalTextIds);
		List<String> cids2 = new ArrayList<String>(originalTextIds);

		int total = originalTextIds.size();
		// Clamp so an out-of-range proportion can neither request a negative
		// number of texts nor report a negative size for the second part.
		int size1 = Math.max(0, Math.min(total, (int) (total * proportion)));
		int size2 = total - size1;

		// Drop random texts from the first part until it has the wanted size;
		// whatever was dropped forms the second part.
		while (cids1.size() > size1) {
			cids1.remove(r.nextInt(cids1.size()));
		}
		cids2.removeAll(cids1);

		System.out.println("All texts  : " + total + "\t in original corpus");
		System.out.println("First part : " + size1 + " texts \t being copied into: " + targetPath1);
		System.out.println("Second part: " + size2 + " texts \t being copied into: " + targetPath2);

		copyTexts(c, cids1, targetPath1);
		copyTexts(c, cids2, targetPath2);
		System.out.println("Done");
	}

	/**
	 * Copies the directory of every listed text below the target path. The
	 * destination is the target path followed by the text's own path with all
	 * "../" fragments removed. On an I/O error the source and destination are
	 * printed and the JVM exits with status 1.
	 * 
	 * @param c
	 *            corpus the texts belong to
	 * @param textIds
	 *            ids of the texts to copy
	 * @param targetPath
	 *            path prefix of the destination
	 */
	private static void copyTexts(TEICorpus c, List<String> textIds, String targetPath) {
		for (String tid : textIds) {
			TEICorpusText t = c.getCorpusText(tid);
			String from = t.getPath();
			String to = targetPath + from.replaceAll("\\.\\./", "");

			try {
				FileManager.copyDirectory(new File(from), new File(to));
			} catch (IOException e) {
				System.out.println("FROM " + from);
				System.out.println("TO " + to);
				e.printStackTrace();
				System.exit(1);
			}
		}
	}

	/**
	 * Calculates gold standard annotation in corpus.
	 * <p>
	 * For every sense segment group of every text, the sense id is taken as the
	 * part after '#', and the lexeme as the part of that id before the first
	 * '.'. Annotations whose lexeme is "null" (ignoring case) are not counted.
	 * A text that raises an exception while being read is reported and skipped;
	 * counts gathered from it before the failure are kept. Every text is closed
	 * after processing. Finally the orth form and POS of every sense in the
	 * dictionary are stored in the result.
	 * 
	 * @param corpus
	 *            corpus to read annotations from
	 * @param dict
	 *            dictionary of senses
	 * @return annotation stats
	 */
	public static AnnotationStats getSenseStatisticsForCorpus(Corpus corpus, TEISenseInventory dict) {
		AnnotationStats result = new AnnotationStats(dict);

		// for each text in corpus
		for (String corpusTextId : corpus.getCorpusTextIds()) {
			TEICorpusText corpusText = (TEICorpusText) corpus.getCorpusText(corpusTextId);

			try {
				// read all gold standard annotations for given text
				TEISenseSegmentGroup currentSsg = (TEISenseSegmentGroup) corpusText
						.getFirstSegmentGroup(SenseSegmentGroup.class);
				while (currentSsg != null) {

					String senseId = currentSsg.getSenseId().split("#")[1];
					String lexeme = senseId.split("\\.")[0];

					if (!lexeme.equalsIgnoreCase("null")) {
						Counter c = result.get(lexeme);
						if (c == null) {
							c = new Counter();
							result.put(lexeme, c);
						}
						c.increase(senseId);
					}
					currentSsg = currentSsg.getNext(ContinueMode.ALWAYS_CONTINUE);
				}

			} catch (Exception e) {
				e.printStackTrace();
				// getMessage() may be null; fall back to toString() so that
				// reporting the problem cannot itself fail.
				String message = e.getMessage();
				String reason = (message == null) ? e.toString() : message.replaceAll("\\n", " ");
				System.out.println(corpusText.getPath() + " Skipping text because of: " + reason);
			} finally {
				corpusText.closeCorpusText();
			}
		}

		for (String senseId : dict.getSenseEntries().keySet()) {

			String orth = dict.getSenseEntries().get(senseId).getOrthForm();
			String pos = dict.getSenseEntries().get(senseId).getPOS();

			result.senseIdToOrth.put(senseId, orth);
			result.senseIdToPos.put(senseId, pos);
		}

		return result;
	}

	/**
	 * Prints the number of texts and the total number of segments in the corpus
	 * to standard output. Any exception while reading a text prints its stack
	 * trace and exits the JVM with status 1.
	 * 
	 * @param corpus
	 *            corpus to describe
	 */
	public static void printCorpusStats(TEICorpus corpus) {
		int textCount = 0;
		int segmentCount = 0;

		// for each text in corpus
		for (String corpusTextId : corpus.getCorpusTextIds()) {
			TEICorpusText corpusText = null;
			try {

				corpusText = (TEICorpusText) corpus.getCorpusText(corpusTextId);
				textCount++;

				TEISegment seg = corpusText.getFirstSegment();
				while (seg != null) {
					segmentCount++;

					seg = seg.getNext();
				}
			} catch (Exception e) {
				e.printStackTrace();
				System.exit(1);
			} finally {
				// corpusText is still null when getCorpusText itself failed.
				if (corpusText != null) {
					corpusText.closeCorpusText();
				}
			}

		}
		System.out.println("Texts: " + textCount);
		System.out.println("Segments: " + segmentCount);
	}
}
