package corpusapi.util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Vector;

import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;

import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
import uk.ac.shef.wit.simmetrics.similaritymetrics.Levenshtein;

/**
 * One-off tool that marks, inside the example sentences ({@code <quote>}
 * lines) of {@code data/NKJP_WSI.xml}, the occurrences (zero or more) of the
 * ambiguous word the examples belong to. The result is written to
 * {@code data/NKJP_WSI_UPGRADED.xml}.
 *
 * <p>Prerequisite: TaKIPI (version 1.8 was used) has to be run on
 * {@code NKJP_WSI.xml} in its default mode first; its output is read from
 * {@code data/NKJP_WSI.xml.takipized}.
 *
 * <p>For every quote the lemmas reported by TaKIPI are compared with the
 * current headword. When TaKIPI yields no usable match, a Levenshtein-based
 * heuristic picks the most similar word of the quote instead.
 *
 * <p>NOTE(review): all files are read and written with the platform default
 * charset ({@link FileReader}/{@link FileWriter}); confirm that this matches
 * the actual encoding of the corpus files.
 *
 * @author Project Manager
 */
public class NKJP_WSI_XML_UPGRADER {

	/** TaKIPI output for the input file. */
	private static final String TAKIPI_FILE = "data/NKJP_WSI.xml.takipized";
	/** File whose quotes are to be marked. */
	private static final String INPUT_FILE = "data/NKJP_WSI.xml";
	/** Destination of the marked-up copy. */
	private static final String OUTPUT_FILE = "data/NKJP_WSI_UPGRADED.xml";

	private static final String QUOTE_TAG = "<quote>";
	private static final String ORTH_OPEN = "<orth>";
	private static final String ORTH_CLOSE = "</orth>";

	/** Markup wrapped around a confidently identified occurrence. */
	private static final String HI_OPEN = "<hi rend=\"bold\">";
	private static final String HI_CLOSE = "</hi>";

	/**
	 * Characters treated as part of a word by the similarity heuristic
	 * (Latin letters including v/V, Polish diacritics, digits).
	 */
	private static final String WORD_CHARS =
			"abcdefghijklmnopqrstuvwxyzęóąśłżźćń"
			+ "ABCDEFGHIJKLMNOPQRSTUVWXYZĘÓĄŚŁŻŹĆŃ"
			+ "1234567890";

	/** Per quote (in document order): the lemma of each token, as reported by TaKIPI. */
	static Vector<Vector<String>> takipiBases = new Vector<Vector<String>>();
	/** Per quote (in document order): the surface form of each token; parallel to {@link #takipiBases}. */
	static Vector<Vector<String>> takipiOrths = new Vector<Vector<String>>();
	/** Number of the quote processed next (index into takipiBases / takipiOrths). */
	static int index = 0;

	/** Auxiliary counter (not used by the code in this class). */
	static int counter = 0;

	/** String metric used by the fallback heuristic. */
	static AbstractStringMetric abs = new Levenshtein();

	/**
	 * Reads the TaKIPI analysis, then copies the input file to the output
	 * file with the headword occurrences marked in every quote line.
	 * Errors are reported on standard error; nothing is written when the
	 * TaKIPI output cannot be read.
	 *
	 * @param args ignored
	 */
	public static void main(String [] args) {
		try {
			loadTakipiAnalysis(TAKIPI_FILE);
		} catch (IOException e) {
			// Without the analysis the run is pointless; stop before the output file is touched.
			System.err.println("Error: cannot read " + TAKIPI_FILE + ": " + e.getMessage());
			e.printStackTrace();
			return;
		} catch (XMLStreamException e) {
			System.err.println("Error: cannot parse " + TAKIPI_FILE + ": " + e.getMessage());
			e.printStackTrace();
			return;
		}

		try {
			upgradeFile(INPUT_FILE, OUTPUT_FILE);
		} catch (Exception e) {
			System.err.println("Error: " + e.getMessage());
			e.printStackTrace();
		}
	}

	/**
	 * Parses the TaKIPI output and fills {@link #takipiBases} and
	 * {@link #takipiOrths} with one entry per {@code <quote>} element.
	 * For each {@code <tok>} inside a quote, the text of its {@code <orth>}
	 * and the {@code <base>} of its first {@code <lex>} are recorded.
	 */
	private static void loadTakipiAnalysis(String path) throws IOException, XMLStreamException {
		takipiBases.clear();
		takipiOrths.clear();
		index = 0;

		XMLInputFactory factory = XMLInputFactory.newInstance();
		// Deliver the text of an element as a single event; otherwise a value
		// split into several Characters events would be truncated to its last part.
		factory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);

		boolean inQuote = false;
		boolean inOrth = false;
		boolean inBase = false;
		boolean freshBaseInTok = true; // true until the first <lex> of the current token has ended

		Vector<String> currentBase = null;
		Vector<String> currentOrth = null;
		String orth = null;
		String base = null;

		FileReader reader = new FileReader(path);
		try {
			XMLEventReader eventReader = factory.createXMLEventReader(reader);
			while (eventReader.hasNext()) {
				XMLEvent event = eventReader.nextEvent();

				if (event.isStartElement()) {
					StartElement element = event.asStartElement();
					String name = element.getName().getLocalPart();
					if ("quote".equalsIgnoreCase(name)) {
						inQuote = true;
						currentBase = new Vector<String>();
						currentOrth = new Vector<String>();
					} else if ("tok".equalsIgnoreCase(name)) {
						freshBaseInTok = true;
						// Do not let values of the previous token leak into this one.
						orth = null;
						base = null;
					} else if ("orth".equalsIgnoreCase(name)) {
						inOrth = true;
					} else if ("base".equalsIgnoreCase(name)) {
						inBase = true;
					}
				} else if (event.isCharacters()) {
					Characters text = event.asCharacters();
					if (inQuote) {
						if (inOrth) {
							orth = text.getData();
						}
						if (freshBaseInTok && inBase) {
							base = text.getData();
						}
					}
				} else if (event.isEndElement()) {
					EndElement element = event.asEndElement();
					String name = element.getName().getLocalPart();
					if ("quote".equalsIgnoreCase(name)) {
						inQuote = false;
						takipiBases.add(currentBase);
						takipiOrths.add(currentOrth);
					} else if ("orth".equalsIgnoreCase(name)) {
						inOrth = false;
					} else if ("tok".equalsIgnoreCase(name)) {
						// Keep both vectors parallel: store a token only when it is complete.
						if (inQuote && orth != null && base != null) {
							currentBase.add(base);
							currentOrth.add(orth);
						}
					} else if ("lex".equalsIgnoreCase(name)) {
						freshBaseInTok = false;
					} else if ("base".equalsIgnoreCase(name)) {
						inBase = false;
					}
				}
			}
		} finally {
			reader.close();
		}
	}

	/**
	 * Copies {@code inputPath} to {@code outputPath} line by line. A line
	 * starting with {@code <orth>} sets the current headword; a line starting
	 * with {@code <quote>} is passed through {@link #upgradeQuote}. All other
	 * lines are copied unchanged.
	 */
	private static void upgradeFile(String inputPath, String outputPath) throws IOException {
		BufferedReader in = new BufferedReader(new FileReader(inputPath));
		try {
			BufferedWriter out = new BufferedWriter(new FileWriter(outputPath));
			try {
				String baseForm = null;
				String line;
				while ((line = in.readLine()) != null) {
					String trimmed = line.trim();
					if (trimmed.startsWith(ORTH_OPEN)) {
						int close = trimmed.indexOf(ORTH_CLOSE, ORTH_OPEN.length());
						// A line without the closing tag keeps the previous headword
						// instead of failing on a negative substring range.
						if (close >= 0) {
							baseForm = trimmed.substring(ORTH_OPEN.length(), close);
						}
					}
					if (trimmed.startsWith(QUOTE_TAG)) {
						out.write(upgradeQuote(line, baseForm));
					} else {
						out.write(line);
					}
					out.newLine();
				}
			} finally {
				out.close();
			}
		} finally {
			in.close();
		}
	}

	/** Tells whether {@code c} counts as part of a word for the heuristic. */
	private static boolean isWordChar(char c) {
		return WORD_CHARS.indexOf(c) >= 0;
	}

	/** Returns the index just past the run of word characters starting at {@code from}. */
	private static int wordEnd(String line, int from) {
		int pos = from;
		while (pos < line.length() && isWordChar(line.charAt(pos))) {
			pos++;
		}
		return pos;
	}

	/**
	 * Heuristic fallback: marks the word of the quote that is most similar
	 * to the headword (ties are resolved in favour of the later word).
	 * A similarity above 0.5 is marked with {@code <hi rend="bold">…</hi>};
	 * a weaker match is marked with {@code #…$} so that somebody can review
	 * it by hand. Only whole words equal to the chosen one are wrapped, so
	 * other words and the {@code quote} tags themselves stay intact.
	 *
	 * @param line the quote line
	 * @param base the headword
	 * @return the line with the chosen word marked, or the unchanged line
	 *         when it contains no candidate word or an argument is null
	 */
	private static String upgradeQuoteSimple(String line, String base) {
		if (line == null || base == null) {
			return line;
		}

		// Pass 1: find the word most similar to the headword.
		float max = -1;
		String best = null;
		int pos = 0;
		while (pos < line.length()) {
			if (!isWordChar(line.charAt(pos))) {
				pos++;
				continue;
			}
			int end = wordEnd(line, pos);
			String word = line.substring(pos, end);
			pos = end;
			if (word.equals("quote")) { // tag name, not part of the sentence
				continue;
			}
			float score = abs.getSimilarity(base, word);
			if (score >= max) {
				max = score;
				best = word;
			}
		}
		if (best == null) {
			return line;
		}

		boolean confident = max > 0.5; // almost certainly correct
		String open = confident ? HI_OPEN : "#";
		String close = confident ? HI_CLOSE : "$";

		// Pass 2: rebuild the line, wrapping every whole-word occurrence.
		StringBuilder out = new StringBuilder(line.length() + 32);
		pos = 0;
		while (pos < line.length()) {
			char c = line.charAt(pos);
			if (!isWordChar(c)) {
				out.append(c);
				pos++;
				continue;
			}
			int end = wordEnd(line, pos);
			String word = line.substring(pos, end);
			pos = end;
			if (word.equals(best)) {
				out.append(open).append(word).append(close);
			} else {
				out.append(word);
			}
		}
		return out.toString();
	}

	/**
	 * Marks the occurrences of the headword in one quote line using the
	 * TaKIPI analysis of that quote; falls back to
	 * {@link #upgradeQuoteSimple} when TaKIPI gives no usable match.
	 * Every call consumes one entry of the TaKIPI data (advances
	 * {@link #index}), so it must be called exactly once per quote, in
	 * document order.
	 *
	 * @param line the quote line
	 * @param base the headword the quote belongs to; may be null when no
	 *             headword has been seen yet
	 * @return the line with the occurrences wrapped in
	 *         {@code <hi rend="bold">…</hi>}
	 */
	private static String upgradeQuote(String line,String base) {
		int quoteIndex = index++;
		if (line == null || base == null) {
			return line;
		}
		if (quoteIndex >= takipiBases.size() || quoteIndex >= takipiOrths.size()) {
			// No TaKIPI data for this quote.
			return upgradeQuoteSimple(line, base);
		}
		Vector<String> tbs = takipiBases.get(quoteIndex);
		Vector<String> tos = takipiOrths.get(quoteIndex);
		if (tbs == null || tos == null) {
			return upgradeQuoteSimple(line, base);
		}

		// Start behind the opening tag so that a short form cannot match inside it.
		int tagPos = line.indexOf(QUOTE_TAG);
		int searchFrom = tagPos < 0 ? 0 : tagPos + QUOTE_TAG.length();
		boolean marked = false;

		int tokens = Math.min(tbs.size(), tos.size());
		for (int i = 0; i < tokens; i++) {
			if (!base.equals(tbs.get(i))) {
				continue;
			}
			String orth = tos.get(i);
			if (orth == null || orth.length() == 0) {
				continue;
			}
			// Tokens come in text order, so continue behind the previous hit;
			// this marks repeated forms one by one instead of re-wrapping the first.
			int hit = line.indexOf(orth, searchFrom);
			if (hit < 0) {
				continue;
			}
			// Widen the hit to the whole surrounding word.
			int start = hit;
			while (start > 0 && Character.isLetterOrDigit(line.charAt(start - 1))) {
				start--;
			}
			int end = hit + orth.length();
			while (end < line.length() && Character.isLetterOrDigit(line.charAt(end))) {
				end++;
			}
			line = line.substring(0, start) + HI_OPEN + line.substring(start, end) + HI_CLOSE
					+ line.substring(end);
			searchFrom = end + HI_OPEN.length() + HI_CLOSE.length();
			marked = true;
		}

		return marked ? line : upgradeQuoteSimple(line, base);
	}

}