/*
 * Copyright (C) 2009 by Instytut Podstaw Informatyki Polskiej
 * Akademii Nauk (IPI PAN; Institute of Computer Science, Polish
 * Academy of Sciences; cf. www.ipipan.waw.pl).  All rights reserved.
 *
 * This file is part of WSDDE.
 *
 * WSDDE is free software: it may be distributed and/or modified under
 * the terms of the GNU General Public License version 3 as published
 * by the Free Software Foundation and appearing in the file doc/gpl.txt
 * included in the packaging of this file.
 *
 * A commercial license is available from IPI PAN (contact
 * Michal.Ciesiolka@ipipan.waw.pl or ipi@ipipan.waw.pl for more
 * information).  Licensees holding a valid commercial license from IPI
 * PAN may use this file in accordance with that license.
 *
 * This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING
 * THE WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE.
 */

package wsdde.corpus;

import java.io.BufferedReader;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.StringTokenizer;
import java.util.Vector;

import wsdde.corpus.PoliqarpContext;
import wsdde.general.Utils;

import com.sun.corba.se.impl.encoding.OSFCodeSetRegistry.Entry;

/**
 * Helper class.
 *
 * Transforms the WSD corpus from Wroclaw into valid XML: every file in
 * {@link #directory} whose name has the form <code>&lt;task&gt;&lt;number&gt;.xml</code>
 * is escaped and appended to the document of its task, and one merged
 * <code>&lt;task&gt;.xml</code> document per task is handed to
 * <code>Utils.saveInFile</code>.
 *
 * @author Project Manager
 */

public class KorpusPiaseckiegoLoader {

	/** Directory that is listed and read when looking for corpus files. */
	public static String directory = "resources/WSDCorpus";

	/**
	 * Task name mapped to its merged document: the XML header, the opening
	 * root tag and all escaped sample texts. The closing root tag is only
	 * appended when the document is saved. Rebuilt by every call to
	 * {@link #wczytajZadania()}.
	 */
	public static LinkedHashMap<String,String> listaZadan;

	/** Not used inside this class; kept because it is a public field. */
	public int indeks=1;

	/** Extension that an input file must have to be processed. */
	private static final String XML_SUFFIX = ".xml";

	/** Prolog and opening root element written at the start of every merged document. */
	private static final String HEADER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><wsdcorpus>";

	/** Closing root element appended when a merged document is saved. */
	private static final String FOOTER = "</wsdcorpus>";


	/**
	 * Escapes raw corpus text so that it can be embedded in an XML document.
	 *
	 * Every <code>&amp;</code> and <code>&lt;</code> is replaced by its entity;
	 * afterwards the opening and closing <code>context</code> and
	 * <code>sense</code> tags are restored so that they remain real markup.
	 * A literal <code>&gt;</code> is left untouched.
	 *
	 * @param s raw text of one corpus file
	 * @return the text with ampersands and stray opening angle brackets escaped
	 */
	private static String clearForXML(String s) {
		// The ampersand has to be handled first, otherwise the entities
		// produced by the following replacements would be escaped again.
		// NOTE(review): the trailing space after each entity is kept from the
		// previous implementation - confirm that downstream consumers want it.
		s = s.replace("&", "&amp; ");
		// Escape every '<' with the matching entity (&lt;) ...
		s = s.replace("<", "&lt; ");
		// ... and then turn the tags that are genuine markup back into tags.
		s = s.replace("&lt; context", "<context");
		s = s.replace("&lt; /context", "</context");
		s = s.replace("&lt; sense", "<sense");
		s = s.replace("&lt; /sense", "</sense");
		return s;
	}


	/**
	 * Returns the position of the first ASCII digit in <code>s</code>.
	 *
	 * @param s string to scan
	 * @return index of the first character in the range '0'..'9', or -1 if there is none
	 */
	private static int firstDigitIndex(String s) {
		for (int i = 0; i < s.length(); i++) {
			char c = s.charAt(i);
			if (c >= '0' && c <= '9') {
				return i;
			}
		}
		return -1;
	}


	/**
	 * Reads all corpus files from {@link #directory}, groups them by task
	 * name and saves one merged document per task.
	 *
	 * A file is processed only if its name ends with <code>.xml</code> and
	 * the part before the extension consists of a non-empty task name
	 * followed by a decimal number; any other directory entry is reported on
	 * standard error and skipped. Files are appended in the order in which
	 * they are returned by <code>File.list()</code>.
	 *
	 * If the directory cannot be listed, an error is printed and
	 * {@link #listaZadan} is left empty.
	 */
	public static void wczytajZadania() {
		listaZadan = new LinkedHashMap<String, String>();

		// File.list() returns null when the path is not a readable directory.
		String[] children = new File(directory).list();
		if (children == null) {
			System.err.println("Cannot list corpus directory: " + directory);
			return;
		}

		// Texts are collected in builders so that appending many samples to
		// one task does not copy the whole document on every step.
		LinkedHashMap<String, StringBuilder> merged = new LinkedHashMap<String, StringBuilder>();

		for (int i = 0; i < children.length; i++) {
			String fileName = children[i];
			if (!fileName.endsWith(XML_SUFFIX)) {
				System.err.println("Skipping (not an .xml file): " + fileName);
				continue;
			}

			String base = fileName.substring(0, fileName.length() - XML_SUFFIX.length());
			int digitsStart = firstDigitIndex(base);
			if (digitsStart <= 0) {
				// No number at all, or nothing in front of it to use as task name.
				System.err.println("Skipping (expected <task><number>.xml): " + fileName);
				continue;
			}

			String zadanie = base.substring(0, digitsStart);
			int numer;
			try {
				numer = Integer.parseInt(base.substring(digitsStart));
			} catch (NumberFormatException e) {
				System.err.println("Skipping (expected <task><number>.xml): " + fileName);
				continue;
			}

			// Read from the same directory that was listed above.
			String text = clearForXML(Utils.readFromFile(directory + "/" + fileName));

			System.out.println(zadanie+"@"+numer);

			StringBuilder document = merged.get(zadanie);
			if (document == null) {
				document = new StringBuilder(HEADER);
				merged.put(zadanie, document);
			}
			document.append(text);
		}

		for (String nazwa_zadania : merged.keySet()) {
			String document = merged.get(nazwa_zadania).toString();
			// The public map keeps the document without the closing root tag.
			listaZadan.put(nazwa_zadania, document);
			Utils.saveInFile(document + FOOTER, nazwa_zadania + XML_SUFFIX);
		}
	}


	/**
	 * Command-line entry point: runs {@link #wczytajZadania()}.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		wczytajZadania();
	}

}
