/*
 * Copyright (C) 2009 by Instytut Podstaw Informatyki Polskiej
 * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
 * Academy of Sciences; cf. www.ipipan.waw.pl).  All rights reserved.
 *
 * This file is part of WSDDE.
 *
 * WSDDE is free software: it may be distributed and/or modified under
 * the terms of the GNU General Public License version 3 as published
 * by the Free Software Foundation and appearing in the file doc/gpl.txt
 * included in the packaging of this file.
 *
 * A commercial license is available from IPI PAN (contact
 * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more
 * information).  Licensees holding a valid commercial license from IPI
 * PAN may use this file in accordance with that license.
 *
 * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
 * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE.
 */

package wsdde.corpus.knowledge;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.Vector;

import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.helpers.XMLReaderFactory;
import org.xml.sax.helpers.DefaultHandler;

import wsdde.corpus.ContextInfo;
import wsdde.corpus.KWIC;
import wsdde.corpus.POSInfo;
import wsdde.corpus.WSDCorpus;



/**
 * SAX content handler that reads a tagged corpus XML file and fills a
 * {@link WSDCorpus} with one {@link KWIC} entry per <code>context</code>
 * element.
 *
 * <p>Elements the handler reacts to (names are matched case-insensitively):
 * <code>wsdcorpus</code>, <code>context</code>, <code>sense</code>,
 * <code>tok</code>, <code>orth</code>, <code>lex</code>, <code>base</code>
 * and <code>ctag</code>. For every <code>tok</code> the orthographic form,
 * the base form and the tag are stored in the current KWIC under the
 * information types "words", "lemmas" and "posinfos".
 *
 * <p>NOTE(review): the handler keeps per-parse state in instance fields and
 * numbers KWIC entries from a static counter, so a single instance should
 * not be used for concurrent parses.
 */
public class TakipiXMLParser extends DefaultHandler
{

	/** KWIC entry currently being built; created on every <code>context</code> start. */
	KWIC k = null;
	/** Corpus being filled by the current/last call to {@link #parse(String)}. */
	WSDCorpus wsdc = null;
	/** Names of the information layers passed along with every token. */
	Vector<String> typeOfInfo = null;

	/** Running KWIC identifier, shared by all parser instances. */
	static int id=0;

	/** Accumulates character data of the element that is currently of interest. */
	StringBuffer sb = new StringBuffer();

	/** Set when a <code>sense</code> element was seen; consumed by the next token end. */
	boolean isKeyword = false;

	String orthForm = "";
	String baseForm = "";
	String gramInfo = "";
	/** Not used by this class; kept because the field is package-visible. */
	String sense = "";

	/** Written by the handler but never read in this class. */
	boolean listenForDisamb = true;
	/** True while the base/ctag children of the selected <code>lex</code> are awaited. */
	boolean listenForNextBaseAndCtag = false;

	/**
	 * Small manual check: parses "jezyk.xml.takipized" from the working
	 * directory and prints the first corpus entry.
	 */
	public static void main (String args[])
	{
		TakipiXMLParser txp = new TakipiXMLParser();
		WSDCorpus w = txp.parse("jezyk.xml.takipized");
		System.out.println(w.get(0));
	}

	public TakipiXMLParser ()
	{
		super();
	}

	/**
	 * Parses the given file and returns the corpus built from it.
	 *
	 * <p>Parsing problems (SAX or I/O errors) are printed to standard error
	 * and do not propagate; the corpus returned in that case contains
	 * whatever had been added before the error occurred.
	 *
	 * @param filename path of the XML file; it is always decoded as UTF-8
	 * @return the corpus created for <code>filename</code>, never null
	 */
	public WSDCorpus parse(String filename) {
		wsdc = new wsdde.corpus.WSDCorpus(filename);
		typeOfInfo = new Vector<String>(Arrays.asList(new String []{"words","lemmas","posinfos"}));

		// Start from a clean slate so that a previous (possibly aborted)
		// parse on this instance cannot leak state into this one.
		k = null;
		listenForDisamb = true;
		listenForNextBaseAndCtag = false;
		resetTokenState();

		FileInputStream in = null;
		try {
			XMLReader xr = XMLReaderFactory.createXMLReader();
			xr.setContentHandler(this);
			xr.setErrorHandler(this);
			in = new FileInputStream(filename);
			// Explicit charset: a plain FileReader would use the platform default.
			InputStreamReader r = new InputStreamReader(in, "UTF-8");
			xr.parse(new InputSource(r));
		} catch (SAXException e) {
			e.printStackTrace();
		} catch (IOException e) {
			// Also covers FileNotFoundException.
			e.printStackTrace();
		} finally {
			// The parser is not required to close the stream, in particular
			// not when it fails, so release the file handle here.
			if (in != null) {
				try {
					in.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}

		return wsdc;
	}

	/** Clears everything collected for the token that has just been finished. */
	private void resetTokenState() {
		isKeyword = false;
		baseForm = "";
		orthForm = "";
		gramInfo = "";
		sb = new StringBuffer();
	}

	////////////////////////////////////////////////////////////////////
	// Event handlers.
	////////////////////////////////////////////////////////////////////

	/**
	 * Reacts to element starts: opens a new KWIC, records the sense label,
	 * and clears the text buffer before elements whose text is collected.
	 */
	@Override
	public void startElement (String uri, String name,
			      String qName, Attributes atts)
	{
		if (qName.equalsIgnoreCase("context")) {
			k = new KWIC(++id);
		} else if (qName.equalsIgnoreCase("sense")) {
			isKeyword = true;
			// A sense element before any context has no KWIC to label.
			if (k != null) {
				k.sense = atts.getValue("label");
			}
		} else if (qName.equalsIgnoreCase("tok")) {
			listenForDisamb = true;
		} else if (qName.equalsIgnoreCase("lex")) {
			// NOTE(review): a lex element with exactly one attribute is taken
			// as the selected reading (presumably disamb="1") - confirm
			// against the input format.
			if (atts.getLength()==1) {
				listenForDisamb = false;
				listenForNextBaseAndCtag = true;
			}
		} else if (qName.equalsIgnoreCase("orth")) {
			sb = new StringBuffer();
		} else if (qName.equalsIgnoreCase("base") || qName.equalsIgnoreCase("ctag")) {
			if (listenForNextBaseAndCtag) {
				sb = new StringBuffer();
			}
		}
	}

	/**
	 * Reacts to element ends: stores collected text, hands finished tokens
	 * to the current KWIC, adds finished KWICs to the corpus and, at the end
	 * of the <code>wsdcorpus</code> element, calls
	 * <code>sensesFromKWICS()</code> on the corpus.
	 */
	@Override
	public void endElement (String uri, String name, String qName)
	{
		if (qName.equalsIgnoreCase("context")) {
			wsdc.add(k);
		} else if (qName.equalsIgnoreCase("orth")) {
			orthForm = sb.toString();
		} else if (qName.equalsIgnoreCase("base")) {
			if (listenForNextBaseAndCtag) {
				baseForm = sb.toString();
			}
		} else if (qName.equalsIgnoreCase("ctag")) {
			if (listenForNextBaseAndCtag) {
				gramInfo = sb.toString();
				// Only the first base/ctag pair after the selected lex counts.
				listenForNextBaseAndCtag = false;
			}
		} else if (qName.equalsIgnoreCase("tok")) {
			// A token outside of any context has no KWIC to go into; skip it.
			if (k != null) {
				// Same order as typeOfInfo: word, lemma, part-of-speech info.
				Vector<ContextInfo> info = new Vector<ContextInfo>(Arrays.asList(new ContextInfo []{new ContextInfo(orthForm),new ContextInfo(baseForm),new POSInfo(gramInfo)}));
				k.wstaw(typeOfInfo, info, isKeyword);
			}
			resetTokenState();
		} else if (qName.equalsIgnoreCase("wsdcorpus")) {
			wsdc.sensesFromKWICS();
		}
	}

	/**
	 * Appends the reported characters to the text buffer. The parser may
	 * deliver the text of one element in several chunks, hence appending
	 * rather than assigning.
	 */
	@Override
	public void characters (char ch[], int start, int length)
	{
		sb.append(ch, start, length);
	}

}