/*
 * Copyright (C) 2009 by Instytut Podstaw Informatyki Polskiej
 * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
 * Academy of Sciences; cf. www.ipipan.waw.pl).  All rights reserved.
 *
 * This file is part of WSDDE.
 *
 * WSDDE is free software: it may be distributed and/or modified under
 * the terms of the GNU General Public License version 3 as published
 * by the Free Software Foundation and appearing in the file doc/gpl.txt
 * included in the packaging of this file.
 *
 * A commercial license is available from IPI PAN (contact
 * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more
 * information).  Licensees holding a valid commercial license from IPI
 * PAN may use this file in accordance with that license.
 *
 * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
 * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE.
 */

package wsdde.corpus;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;

import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Random;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.Vector;



import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import wsdde.Counter;
import wsdde.general.Utils;
import wsdde.generator.FeatureGenerator;



/**
 * A word-sense-disambiguation corpus: an ordered collection of {@link KWIC}
 * contexts together with the sense inventory and the feature bookkeeping
 * needed to export the corpus as a Weka data set.
 *
 * <p>The class keeps its state in public fields and in a static, unsynchronized
 * cache ({@link #loadedCorpora}); nothing here adds synchronization beyond what
 * {@link Vector} itself provides.
 */
public class WSDCorpus extends Vector<KWIC> {//implements Serializable {

	private static final long serialVersionUID = 4188059128143522859L;

	/** Corpora already read by {@link #getCorpus(String)}, keyed by file name. */
	public static HashMap<String,WSDCorpus> loadedCorpora = new HashMap<String, WSDCorpus>();

	/** Name given at construction time; {@link #loadTXT(String)} passes the file name. */
	public String corpusName;
	/** Sense labels; used as the nominal values of the class attribute in {@link #toWeka()}. */
	public FastVector senses;

	/** Corpus-level feature counter; its key set defines the feature columns. */
	public Counter counter;
	/** Maps a feature name to the generator consulted for {@code isBinary()}. */
	public HashMap<String,FeatureGenerator> whichGenerator;
	/** Maps a feature name to its column index in the rows built by {@link #toWeka()}. */
	public HashMap<String,Integer> featurePosition;

	/**
	 * Creates an empty corpus with fresh, empty feature bookkeeping.
	 * {@link #senses} is left unset.
	 *
	 * @param corpusName name to store in {@link #corpusName}; may be null
	 */
	public WSDCorpus(String corpusName) {
		this.corpusName = corpusName;
		counter = new Counter();
		whichGenerator = new LinkedHashMap<String, FeatureGenerator>();
		featurePosition = new HashMap<String, Integer>();
	}

	/**
	 * Returns the corpus stored in the given file, reading it with
	 * {@link #loadTXT(String)} on first request and serving the cached
	 * instance afterwards.
	 *
	 * @param filename path of the corpus file; also the cache key
	 * @return the cached corpus for that file
	 */
	public static WSDCorpus getCorpus(String filename) {
		WSDCorpus cached = loadedCorpora.get(filename);
		if (cached == null && !loadedCorpora.containsKey(filename)) {
			cached = loadTXT(filename);
			loadedCorpora.put(filename, cached);
		}
		return cached;
	}

	/**
	 * Resets the feature information of the corpus and of every context.
	 *
	 * <p>NOTE: this overrides {@link Vector#clear()} but does NOT remove the
	 * contexts. The corpus-level maps and counter are not emptied in place;
	 * they are replaced by new objects, so the old ones are merely detached
	 * from the corpus. Each context's own counter is cleared in place and its
	 * predicted sense is reset to the empty string.
	 */
	@Override
	public void clear() {
		whichGenerator = new LinkedHashMap<String, FeatureGenerator>();
		featurePosition = new HashMap<String, Integer>();
		counter = new Counter();
		for (KWIC kwic : this) {
			kwic.counter.clear();
			kwic.predictedSense = "";
		//	kwic.infos = null;
		}
	}

	/**
	 * Runs every given generator over this corpus, in order.
	 *
	 * @param fgs generators to apply; each receives this corpus
	 */
	public void generateFeatures(Vector<FeatureGenerator> fgs) {
		// Feature extraction; per the original note, each generator is expected
		// to record per-context occurrence counts for its features.
		for (FeatureGenerator generator : fgs) {
			generator.generate(this);
		}
	}

	/**
	 * Adds the counter of every context to the corpus-level counter and then
	 * assigns a column index to every key of the corpus-level counter.
	 * Existing entries of {@link #featurePosition} are overwritten, not removed.
	 */
	public void computeMainCounter() {
		for (KWIC kwic : this) {
			counter.addCounter(kwic.counter, true);
		}
		indexFeatures();
	}

	/**
	 * Replaces the corpus-level counter and rebuilds {@link #featurePosition}
	 * from scratch from its key set.
	 *
	 * @param c the counter to use from now on
	 */
	public void setMainCounter(Counter c) {
		this.counter = c;
		featurePosition.clear();
		indexFeatures();
	}

	/** Numbers the keys of the corpus-level counter 0, 1, 2, ... in iteration order. */
	private void indexFeatures() {
		int next = 0;
		for (String key : counter.keySet()) {
			featurePosition.put(key, next++);
		}
	}

	/**
	 * Rebuilds {@link #senses} from the senses of the contexts, keeping the
	 * order of first occurrence. Contexts without a sense (null) do not
	 * contribute an entry; {@link #toWeka()} exports them with a missing class
	 * value instead.
	 */
	public void sensesFromKWICS() {
		LinkedHashSet<String> distinct = new LinkedHashSet<String>();
		for (KWIC kwic : this) {
			if (kwic.sense != null) {
				distinct.add(kwic.sense);
			}
		}
		FastVector collected = new FastVector();
		for (String sense : distinct) {
			collected.addElement(sense);
		}
		senses = collected;
	}

	/**
	 * Sets the sense inventory.
	 *
	 * @param senses the new value of {@link #senses}; stored as is, not copied
	 */
	public void setSenses(FastVector senses) {
		this.senses = senses;
	}

	/**
	 * Sets the feature-to-generator map.
	 *
	 * @param ffg the new value of {@link #whichGenerator}; stored as is, not copied
	 */
	public void setWhichGenerator(LinkedHashMap<String, FeatureGenerator> ffg) {
		whichGenerator = ffg;
	}

	/**
	 * Renders the corpus in the text form read back by {@link #loadTXT(String)}:
	 * a first line {@code SENSES s1 s2 ...}, an empty line, and then the
	 * {@code toString()} of every context. An unset sense inventory is written
	 * as an empty list.
	 */
	@Override
	public String toString() {
		StringBuilder out = new StringBuilder();
		out.append("SENSES ");
		if (senses != null) {
			for (int i = 0; i < senses.size(); i++) {
				out.append(senses.elementAt(i).toString());
				out.append(' ');
			}
		}
		out.append("\n\n");
		for (KWIC kwic : this) {
			out.append(kwic.toString());
		}
		return out.toString();
	}

	/**
	 * Converts the corpus into a Weka data set.
	 *
	 * <p>There is one attribute per key of the corpus-level counter (nominal
	 * {@code {"0","1"}} when the feature's generator is binary, numeric
	 * otherwise) followed by the nominal class attribute built from
	 * {@link #senses}. Each context becomes one instance of weight 1.0:
	 * <ul>
	 * <li>a binary feature present in the context is set to 1;</li>
	 * <li>a non-binary feature is set to
	 *     {@code countInContext * ln(corpusSize / (corpusCount + 1))}, where
	 *     {@code corpusCount} is the value stored in the corpus-level counter;</li>
	 * <li>features of the context without an entry in {@link #featurePosition}
	 *     are ignored; all other columns stay 0;</li>
	 * <li>the class value is the index of the context's sense in
	 *     {@link #senses}, or missing when the sense is null or not listed.</li>
	 * </ul>
	 *
	 * @return the data set, one instance per context in corpus order
	 * @throws IllegalStateException if a feature has no generator registered
	 *         in {@link #whichGenerator}
	 */
	public Instances toWeka() {
		FastVector headers = new FastVector();
		for (String attribute : this.counter.keySet()) {
			if (generatorFor(attribute).isBinary()) {
				FastVector binaryValues = new FastVector();
				binaryValues.addElement("0");
				binaryValues.addElement("1");
				// Original author's note: the profiler crashed at this
				// constructor call under Windows XP, for unknown reasons.
				headers.addElement(new Attribute(attribute, binaryValues));
			} else {
				headers.addElement(new Attribute(attribute));
			}
		}
		headers.addElement(new Attribute("ZNACZENIE", this.senses));
		Instances data = new Instances("nazwa", headers, 0);

		// Build the feature matrix, one row per context.
		int setSize = this.size();
		for (KWIC kwic : this) {
			double[] row = new double[this.counter.size() + 1];
			for (String key : kwic.counter.keySet()) {
				Integer position = this.featurePosition.get(key);
				if (position == null) {
					continue; // feature is not part of the corpus-level feature set
				}
				if (generatorFor(key).isBinary()) {
					row[position] = 1;
				} else { // continuous feature
					int inKWIC = kwic.counter.get(key);
					int hitRows = this.counter.get(key);
					row[position] = inKWIC * Math.log(setSize / (hitRows + 1.0));
				}
			}
			// Guard the lookup: an unlabelled context must yield a missing
			// class value rather than a failure inside indexOf.
			int senseNumber = (kwic.sense == null) ? -1 : this.senses.indexOf(kwic.sense);
			row[row.length - 1] = (senseNumber < 0 ? Instance.missingValue() : senseNumber);
			data.add(new weka.core.Instance(1.0, row));
		}
		return data;
	}

	/** Looks up the generator of a feature, failing with the feature name if there is none. */
	private FeatureGenerator generatorFor(String feature) {
		FeatureGenerator generator = this.whichGenerator.get(feature);
		if (generator == null) {
			throw new IllegalStateException("No feature generator registered for feature: " + feature);
		}
		return generator;
	}

	/**
	 * Intended to merge several corpora into one.
	 *
	 * <p>NOTE: not implemented; always returns {@code null}.
	 *
	 * @param corpora corpora to merge (currently ignored)
	 * @return always {@code null}
	 */
	public static WSDCorpus merge(Collection<WSDCorpus> corpora) {
		return null;
	}


//	LOAD & SAVE SERIALIZED
//	public static WSDCorpus load(String filename) {
//		   WSDCorpus kwics= null;
//		   FileInputStream fis = null;
//		   ObjectInputStream in = null;
//		   try
//		   {
//		     fis = new FileInputStream(filename);
//		     in = new ObjectInputStream(fis);
//		     kwics = (WSDCorpus)in.readObject();
//		     in.close();
//		   }
//		   catch(IOException ex)
//		   {
//		     ex.printStackTrace();
//		   }
//		   catch(ClassNotFoundException ex)
//		   {
//		     ex.printStackTrace();
//		   }
//		   return kwics;
//	}
//	
//	public static void save(WSDCorpus kwics, String filename) {
//		FileOutputStream fos = null;
//		ObjectOutputStream out = null;
//		try	{
//		       fos = new FileOutputStream(filename);
//		       out = new ObjectOutputStream(fos);
//		       out.writeObject(kwics);
//		       out.close();
//		}
//		catch(IOException ex) {
//		       ex.printStackTrace();
//		}
//	}
//	
//	Disabled random percentage split, kept for reference.
//	It was meant for dividing a large set of contexts into training and test
//	sets: for every element it drew, with the given weights, the subset the
//	element goes to, and returned an array as long as howToSplit holding the
//	resulting corpora.
//	howToSplit - percentages summing to one hundred that say how to divide,
//	e.g. {80,15,5} gives three corpora of roughly 80, 15 and 5 percent.
//	public WSDCorpus[] aboutPercentagSplit(int [] howToSplit) {//random split
//		WSDCorpus[] results = new WSDCorpus[howToSplit.length];
//		for (int i=0; i<howToSplit.length; i++) {
//			results[i] = new WSDCorpus();//KWICS(this.name+"-"+i);
//		}
//		for (int i=1; i<howToSplit.length; i++) {
//			howToSplit[i]+=howToSplit[i-1];
//		}
//		double rand;
//		for (int i=0; i<this.size(); i++) {//for every example
//			rand = Math.round(Math.random()*100);
//			for (int j=0; j<howToSplit.length; j++) {
//				if (rand<howToSplit[j]) {results[j].add(this.get(i)); break;}
//			}
//		}
//		return results;
//	}

	/**
	 * Randomly splits the corpus into {@code nparts} parts whose sizes differ
	 * by at most one; the first {@code size() % nparts} parts get the extra
	 * element. Delegates to {@link #exactlySplit(int[])}, including its side
	 * effect on {@link #senses}.
	 *
	 * @param nparts number of parts, must be positive
	 * @return the parts, in order
	 * @throws IllegalArgumentException if {@code nparts} is not positive
	 */
	public Vector<WSDCorpus> splitInto(int nparts) {
		if (nparts <= 0) {
			throw new IllegalArgumentException("nparts must be positive: " + nparts);
		}
		int total = this.size();
		int base = total / nparts;
		int extra = total % nparts;
		int[] howToSplit = new int[nparts];
		for (int i = 0; i < nparts; i++) {
			howToSplit[i] = (i < extra) ? base + 1 : base;
		}
		return exactlySplit(howToSplit);
	}

	/**
	 * Randomly distributes contexts of this corpus over new corpora of the
	 * requested sizes, drawing without replacement. This corpus keeps all its
	 * contexts; the parts share the same {@link KWIC} objects. Contexts left
	 * over when the sizes sum to less than {@code size()} appear in no part.
	 *
	 * <p>Side effect: {@link #senses} of this corpus is recomputed with
	 * {@link #sensesFromKWICS()}; every part receives a copy of it. Parts are
	 * created with a null corpus name.
	 *
	 * @param howToSplit size of each part; non-positive entries yield empty parts
	 * @return one corpus per entry of {@code howToSplit}, in the same order
	 * @throws IllegalArgumentException if more contexts are requested than the
	 *         corpus contains
	 */
	public Vector<WSDCorpus> exactlySplit(int [] howToSplit) {
		// Validate before touching any state, so a bad request leaves the
		// corpus unchanged and gets a meaningful message.
		long requested = 0;
		for (int examples : howToSplit) {
			if (examples > 0) {
				requested += examples;
			}
		}
		if (requested > this.size()) {
			throw new IllegalArgumentException("Requested " + requested
					+ " contexts but the corpus contains only " + this.size());
		}

		sensesFromKWICS();
		Random r = new Random();
		Vector<KWIC> pool = new Vector<KWIC>(this);
		Vector<WSDCorpus> parts = new Vector<WSDCorpus>();
		for (int examples : howToSplit) {
			WSDCorpus part = new WSDCorpus(null);
			part.senses = (FastVector) senses.copy();
			for (int i = 0; i < examples; i++) {
				// Uniform draw without replacement. The chosen slot is refilled
				// with the last element so that removal does not shift the
				// rest of the pool.
				int pick = r.nextInt(pool.size());
				int last = pool.size() - 1;
				KWIC chosen = pool.get(pick);
				pool.set(pick, pool.get(last));
				pool.remove(last);
				part.add(chosen);
			}
			parts.add(part);
		}
		return parts;
	}

	/**
	 * Appends all contexts of another corpus to this one. The sense inventory
	 * and the feature bookkeeping are not updated.
	 *
	 * @param wsdcorpus corpus whose contexts are appended
	 */
	public void addCorpus(WSDCorpus wsdcorpus) {
		addAll(wsdcorpus);
		// TODO: the senses could be updated here as well.
	}

	/**
	 * Writes the text form of a corpus ({@link #toString()}) to a file via
	 * {@code Utils.saveInFile}.
	 *
	 * @param corpus corpus to write
	 * @param filename destination path
	 */
	public static void saveTXT(WSDCorpus corpus, String filename) {
	//proposal XML <wsd_corp><context id><raw><lc></lc><keyword><rc></rc></keyword><raw><enriched name="orth_form"></enriched><enriched name="base_form"></enriched><context>
		Utils.saveInFile(corpus.toString(), filename);
	}

	/**
	 * Reads a corpus from a UTF-8 text file.
	 *
	 * <p>The first line holds a keyword followed by the space-separated sense
	 * labels. Every following non-blank line is dispatched on its first token,
	 * compared case-insensitively:
	 * <ul>
	 * <li>{@code context ID} starts a new context with that integer id;</li>
	 * <li>{@code sense S} / {@code predicted_sense S} set the respective field
	 *     of the current context unless {@code S} is {@code null};</li>
	 * <li>{@code tok N a b c} passes three pieces of token information
	 *     (labelled "words", "lemmas", "posinfos") to the current context,
	 *     with a flag telling whether {@code N} is 0;</li>
	 * <li>{@code end_of_context} appends the current context to the corpus;</li>
	 * <li>lines starting with anything else are ignored.</li>
	 * </ul>
	 *
	 * <p>NOTE: any failure (missing file, empty file, malformed line, record
	 * outside of a context) prints a stack trace and terminates the JVM with
	 * exit status 1; the method never returns in that case.
	 *
	 * @param filename path of the file; also used as the corpus name
	 * @return the corpus that was read
	 */
	public static WSDCorpus loadTXT(String filename) {
		WSDCorpus w = new WSDCorpus(filename);
		try {
			BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "UTF-8"));

			String strLine = br.readLine();
			if (strLine == null) {
				throw new IOException("Corpus file is empty: " + filename);
			}
			int lineNo = 1;
			StringTokenizer sensesString = new StringTokenizer(strLine, " ");
			sensesString.nextToken(); // skip the leading keyword of the header line
			w.senses = new FastVector();
			while (sensesString.hasMoreTokens()) {
				w.senses.addElement(sensesString.nextToken().trim());
			}

			KWIC k = null;
			Vector<String> typeOfInfo = new Vector<String>(Arrays.asList(new String []{"words","lemmas","posinfos"}));
			while ((strLine = br.readLine()) != null) {
				lineNo++;
				StringTokenizer st = new StringTokenizer(strLine.trim(), " ");
				if (!st.hasMoreTokens()) {
					continue;
				}
				String keyword = st.nextToken();
				if (keyword.equalsIgnoreCase("context")) {
					// read the context id
					k = new KWIC(Integer.parseInt(st.nextToken()));
				} else if (keyword.equalsIgnoreCase("sense")) {
					String sense = st.nextToken().trim();
					if (!sense.equalsIgnoreCase("null")) {
						openContext(k, keyword, lineNo).sense = sense;
					}
				} else if (keyword.equalsIgnoreCase("predicted_sense")) {
					String sense = st.nextToken().trim();
					if (!sense.equalsIgnoreCase("null")) {
						openContext(k, keyword, lineNo).predictedSense = sense;
					}
				} else if (keyword.equalsIgnoreCase("end_of_context")) {
					// Refuse to store a null element; it would break every
					// later pass over the corpus.
					w.add(openContext(k, keyword, lineNo));
				} else if (keyword.equalsIgnoreCase("tok")) {
					int num = Integer.parseInt(st.nextToken());
					ContextInfo word = new ContextInfo(st.nextToken().trim());
					ContextInfo lemma = new ContextInfo(st.nextToken().trim());
					ContextInfo pos = new POSInfo(st.nextToken().trim());
					Vector<ContextInfo> info = new Vector<ContextInfo>(Arrays.asList(new ContextInfo []{word, lemma, pos}));
					openContext(k, keyword, lineNo).wstaw(typeOfInfo, info, num==0);
				}
			}
			// Closed on the success path only: the failure path below
			// terminates the JVM, which releases the file anyway.
			br.close();
		} catch (Exception e) {
			e.printStackTrace();
			System.exit(1);
		}
		return w;
	}

	/** Returns the context being read, or fails naming the offending record and line. */
	private static KWIC openContext(KWIC current, String keyword, int lineNo) {
		if (current == null) {
			throw new IllegalStateException("'" + keyword + "' record outside of a context at line " + lineNo);
		}
		return current;
	}

	/**
	 * Ad-hoc driver: reads "powod.wsdc", splits it randomly into two halves
	 * and writes them to "tmin0.wsdcorp" and "tmax1.wsdcorp".
	 *
	 * @param args ignored
	 */
	public static void main(String [] args) {
//		loadTXT("");
	//	System.exit(0);
	/*	saveTXT(load("tmin0.wsdcorp").splitInto(50).get(0),"corp.txt");
		System.out.println(loadTXT("corp.txt"));
		//System.out.println(load("tmin0.wsdcorp").splitInto(50).get(0));
		System.exit(0);
		
		WSDCorpus w = new WSDCorpus();
		for (int i=0; i<100; i++) {
			KWIC k = new KWIC(i);
			k.sense = ""+i;
			w.add(k);
		}
		
		w.splitInto(51);
		
		Vector<WSDCorpus> x = w.exactlySplit(new int [] {10,90});
*/

		//WSDCorpus w = TAKIPIManager.processAndRead("jezykb.xml");
		//save(w, "jezykb.wsdcorp");
		Vector<WSDCorpus> halves = loadTXT("powod.wsdc").splitInto(2);//.aboutPercentagSplit(new int [] {50,50});//70,30});
		saveTXT(halves.get(0), "tmin0.wsdcorp");
		saveTXT(halves.get(1), "tmax1.wsdcorp");
	}


}