package termopl;

import java.io.*;
import java.util.*;
import java.util.regex.*;

import javax.xml.stream.*;
import javax.xml.stream.events.*;

public class CorpusReader 
{
	/** Format not set, or not recognised by {@link #checkFormat()}. */
	public static final int UNKNOWN_FORMAT  = -1;
	/** Line-oriented text: {@code form#lemma#tag#} or tab-separated columns. */
	public static final int TEXT_FORMAT     =  0;
	/** XML selected for {@code .ccl} files or content containing {@code <cesAna}. */
	public static final int XCES_FORMAT     =  1;
	/** XML selected for content containing {@code <teiCorpus}. */
	public static final int TEI_FORMAT      =  2;
	/** Line-oriented format selected for {@code .conllu} files. */
	public static final int CONLLU_FORMAT   =  3;
	/** Line-oriented format selected for {@code .tgt} files. */
	public static final int INTERNAL_FORMAT =  4;
	
	/** Matches a whole {@code form#lemma#tag#} line; the three groups are greedy. */
	private static final Pattern patt = Pattern.compile("(.*)#(.*)#(.*)#");

	/** One of the {@code *_FORMAT} constants; selects the reading routine. */
	private int format;
	/** Incremented by openFile() and on each in-file document marker. */
	private int docNumber;
	/** {@code end_char} value of the last CoNLL-U token that carried one; -1 after openFile(). */
	private int end_char;
	/** Index of the current {@code tok} within the current XCES {@code sentence} element. */
	private int tokenIndex;
	/** Position reported by the position reader when the current sentence started. */
	private long sentenceStart;
	/** Position reported by the position reader when the current sentence ended. */
	private long sentenceEnd;
	/** Set when a line-oriented reader has just finished a sentence (or the input). */
	private boolean eos;
	/** File opened by openFile() and probed by the checkFormat* methods. */
	private File currentFile;
	/** Name of the file passed to setCurrentFile(File). */
	private String processedFileName;
	/** Position-tracking reader underneath {@link #textReader}. */
	private TXTPosReader txtPosReader;
	/** Position-tracking reader underneath {@link #xmlReader}. */
	private XMLPosReader xmlPosReader;
	/** Line reader used for the text, internal and CoNLL-U formats. */
	private BufferedReader textReader;
	/** Event reader used for the XCES and TEI formats. */
	private XMLEventReader xmlReader;
	/** Multi-word tokens collected while reading CoNLL-U; null until the first one is seen. */
	private LinkedList<MultiWordToken> mwTokens;
	/** Multi-word token whose index range is currently being filled, or null. */
	private MultiWordToken mwt;
	/** Value returned by the latest getNextToken() call. */
	private Token current;
	/** Token returned before {@link #current}; null at sentence/document starts. */
	private Token previous;

	/**
	 * Creates a reader that has no format, no file and no open stream yet.
	 * A file must be supplied with one of the setCurrentFile methods and
	 * opened with openFile() before tokens can be read.
	 */
	public CorpusReader()
	{
		current = null;
		previous = null;
		mwt = null;
		mwTokens = null;
		xmlReader = null;
		textReader = null;
		currentFile = null;
		tokenIndex = 0;
		docNumber = 0;
		format = UNKNOWN_FORMAT;
	}
	
	/**
	 * Creates a reader over an in-memory string instead of a file.
	 * 
	 * The internal, text and CoNLL-U formats get a line reader; every other
	 * format value gets a non-namespace-aware XML event reader. If the XML
	 * reader cannot be created the stack trace is printed and
	 * {@code xmlReader} stays null.
	 * 
	 * @param format one of the {@code *_FORMAT} constants
	 * @param input  text to read tokens from
	 */
	public CorpusReader(int format, String input)
	{
		this();
		this.format = format;
		
		StringReader source = new StringReader(input);
		boolean lineOriented = (format == INTERNAL_FORMAT || format == TEXT_FORMAT || format == CONLLU_FORMAT);
		
		if (!lineOriented) {
			XMLInputFactory factory = XMLInputFactory.newInstance();
			
			factory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.FALSE);
			xmlPosReader = new XMLPosReader(source);
			try {
				xmlReader = factory.createXMLEventReader(xmlPosReader);
			}
			catch (XMLStreamException e) {
				e.printStackTrace();
			}
			return;
		}
		txtPosReader = new TXTPosReader(source);
		textReader = new BufferedReader(txtPosReader);
	}
	
	/**
	 * Selects the file to work on and records its name as the processed
	 * file name. The format is left unchanged.
	 * 
	 * @param file file to read; must not be null
	 */
	public void setCurrentFile(File file)
	{
		currentFile = file;
		processedFileName = file.getName();
	}
	
	/**
	 * Selects the file to work on together with its format.
	 * 
	 * NOTE(review): unlike setCurrentFile(File), this overload does not
	 * update processedFileName - confirm that this is intended.
	 * 
	 * @param file   file to read
	 * @param format one of the {@code *_FORMAT} constants
	 */
	public void setCurrentFile(File file, int format)
	{
		currentFile = file;
		this.format = format;
	}
	
	/**
	 * @return the name (without directory) of the current file, or null when
	 *         no file has been set
	 */
	public String getCurrentFileName()
	{
		return (currentFile == null) ? null : currentFile.getName();
	}
	
	/**
	 * @return the file name recorded by setCurrentFile(File), or null if that
	 *         method has not been called
	 */
	public String getProcessedFileName()
	{
		return processedFileName;
	}
	
	/** @return the current format, one of the {@code *_FORMAT} constants */
	public int getFormat()
	{
		return format;
	}
	
	/**
	 * @return the token read before the current one; null when the preceding
	 *         result was the sentence-end marker or when a reading routine
	 *         reset it at a sentence/document start
	 */
	public Token getPreviousToken()
	{
		return previous;
	}
	
	/**
	 * @return the document counter; it is incremented by openFile() and each
	 *         time a {@code %%} line (text format) or a {@code newdoc} line
	 *         (CoNLL-U) is read
	 */
	public int getDocNumber()
	{
		return docNumber;
	}
	
	/** Discards the multi-word tokens collected so far (the list becomes null). */
	public void initMultiWordTokens()
	{
		mwTokens = null;
	}
	
	/**
	 * @return the multi-word tokens collected while reading CoNLL-U input, or
	 *         null when none has been seen since the list was last reset
	 */
	public LinkedList<MultiWordToken> getMultiWordTokens()
	{
		return mwTokens;
	}
	
	/** @return the position recorded at the start of the current sentence */
	public long getSentenceStart()
	{
		return sentenceStart;
	}
	
	/**
	 * @return the distance between the recorded sentence end and sentence
	 *         start positions, narrowed to int
	 */
	public int getSentenceLength()
	{
		return (int)(sentenceEnd - sentenceStart);
	}
	
	/**
	 * Opens the current file with the reader matching the current format and
	 * resets the per-file state (document counter is advanced, sentence
	 * offsets are zeroed, end_char becomes -1, the end-of-sentence flag is
	 * cleared). For a format with no reader nothing is opened but the state
	 * is still reset. Any exception is printed and otherwise ignored, in
	 * which case the state is not reset.
	 */
	public void openFile()
	{
		try {
			if (format == INTERNAL_FORMAT || format == TEXT_FORMAT || format == CONLLU_FORMAT) {
				openTextFile();
			}
			else if (format == XCES_FORMAT || format == TEI_FORMAT) {
				openXMLFile();
			}
			docNumber++;
			sentenceStart = 0;
			sentenceEnd = 0;
			end_char = -1;
			eos = false;
		}
		catch (Exception e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Closes the reader that belongs to the current format.
	 * 
	 * For the XML formats both the event reader and the underlying
	 * position-tracking reader are closed: XMLEventReader.close() releases
	 * parser resources only and leaves the input source open, so closing the
	 * event reader alone would leak the file stream. Readers that were never
	 * opened are skipped. Any exception is printed and otherwise ignored.
	 */
	public void closeFile()
	{
		try {
			switch (format) {
				case INTERNAL_FORMAT :
				case TEXT_FORMAT :
				case CONLLU_FORMAT :
					if (textReader != null) textReader.close();
					break;
				case XCES_FORMAT : 
				case TEI_FORMAT  :
					try {
						if (xmlReader != null) xmlReader.close();
					}
					finally {
						// Must run even if the parser fails to close cleanly.
						if (xmlPosReader != null) xmlPosReader.close();
					}
					break;
			}
		}
		catch (Exception e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Opens the current file as UTF-8 text behind a position-tracking reader
	 * and a buffered line reader.
	 * 
	 * @throws UnsupportedEncodingException if the "UTF8" charset name is rejected
	 * @throws FileNotFoundException if the current file cannot be opened
	 */
	public void openTextFile() throws UnsupportedEncodingException, FileNotFoundException
	{
		FileInputStream bytes = new FileInputStream(currentFile);
		InputStreamReader decoded = new InputStreamReader(bytes, "UTF8");
		
		txtPosReader = new TXTPosReader(decoded);
		textReader = new BufferedReader(txtPosReader);
	}
	
	/**
	 * Opens the current file as UTF-8 XML behind a position-tracking reader
	 * and creates the XML event reader on top of it.
	 * 
	 * @throws FileNotFoundException if the current file cannot be opened
	 * @throws XMLStreamException if the event reader cannot be created
	 * @throws UnsupportedEncodingException if the "UTF8" charset name is rejected
	 */
	public void openXMLFile() throws FileNotFoundException, XMLStreamException, UnsupportedEncodingException
	{
		XMLInputFactory factory = XMLInputFactory.newInstance();
		FileInputStream bytes = new FileInputStream(currentFile);
		
		xmlPosReader = new XMLPosReader(new InputStreamReader(bytes, "UTF8"));
		xmlReader = factory.createXMLEventReader(xmlPosReader);
	}
	
	/**
	 * Determines the format of the current file and stores it in
	 * {@code format}.
	 * 
	 * The file extension decides first ({@code .tgt}, {@code .ccl},
	 * {@code .conllu}, case-insensitively); otherwise the content is probed as
	 * text, then XCES, then TEI. If nothing matches, or an I/O error occurs,
	 * the format becomes UNKNOWN_FORMAT.
	 */
	public void checkFormat()
	{
		try {
			String lowerName = currentFile.getName().toLowerCase();
			int detected;
			
			if (lowerName.endsWith(".tgt")) detected = INTERNAL_FORMAT;
			else if (lowerName.endsWith(".ccl")) detected = XCES_FORMAT;
			else if (lowerName.endsWith(".conllu")) detected = CONLLU_FORMAT;
			else if (checkFormatTXT()) detected = TEXT_FORMAT;
			else if (checkFormatXCES()) detected = XCES_FORMAT;
			else if (checkFormatTEI()) detected = TEI_FORMAT;
			else detected = UNKNOWN_FORMAT;
			format = detected;
		}
		catch (IOException e) {
			e.printStackTrace();
			format = UNKNOWN_FORMAT;
		}
	}
	
	/**
	 * Probes the current file for the line-oriented text format.
	 * 
	 * Lines starting with {@code %%} are skipped. Empty lines and sentence
	 * separator lines count as accepted. Any other line is accepted when it
	 * contains exactly-terminated {@code a#b#c#} (the third '#' is the last
	 * character) or, failing the '#' search, at least two tab characters.
	 * Reading stops after ten accepted lines or at end of file, and the
	 * verdict of the last examined line is returned.
	 * 
	 * NOTE(review): a rejected line does not stop the scan and its verdict
	 * carries over into the next line's '#' test; the decision logic is kept
	 * exactly as it was - confirm whether an early exit was intended.
	 * 
	 * The reader is now closed even when reading fails.
	 * 
	 * @return true if the last examined line was accepted (true for a file
	 *         with no examined lines)
	 * @throws IOException if the file cannot be opened or read
	 */
	public boolean checkFormatTXT() throws IOException
	{
		boolean ok = true;
		int count = 0;
		
		try (BufferedReader reader = new BufferedReader(
				new InputStreamReader(new FileInputStream(currentFile), "UTF8"))) {
			String line;
			
			while (count < 10 && (line = reader.readLine()) != null) {
				line = line.trim();
				if (line.startsWith("%%")) continue;
				if (line.isEmpty() || line.matches("&\\s*#&\\s*#interp#") || line.matches("&\\t&\\tinterp.*")) {
					count++;
					continue;
				}
				
				// Look for three '#' separators; index ends just past the third.
				int index = 0;
				
				for (int i = 0; i < 3; i++) {
					index = line.indexOf('#', index);
					if (index < 0) {
						ok = false;
						break;
					}
					index++;
				}
				if (ok) ok = (index == line.length());
				else {
					// Fall back to the tab-separated layout: two tabs required.
					ok = true;
					index = 0;
					for (int i = 0; i < 2; i++) {
						index = line.indexOf('\t', index);
						if (index < 0) {
							ok = false;
							break;
						}
						index++;
					}
				}
				if (ok) count++;
			}
		}
		return ok;
	}
	
	/**
	 * @return true if the beginning of the current file looks like XML
	 *         containing {@code <cesAna}
	 * @throws IOException declared by checkXML
	 */
	public boolean checkFormatXCES() throws IOException
	{
		return checkXML("<cesAna");
	}
	
	/**
	 * @return true if the beginning of the current file looks like XML
	 *         containing {@code <teiCorpus}
	 * @throws IOException declared by checkXML
	 */
	public boolean checkFormatTEI() throws IOException
	{
		return checkXML("<teiCorpus");
	}
	
	/**
	 * Checks whether the first characters of the current file (one read of up
	 * to 512 chars, decoded as UTF-8) contain an XML declaration followed
	 * somewhere later by the given text.
	 * 
	 * The reader is now closed even when reading fails. I/O errors are
	 * printed and reported as "no match"; the throws clause is kept for
	 * callers that already handle it.
	 * 
	 * @param str text appended to the regular expression {@code <\?xml.+};
	 *            it is used as a regex fragment, not quoted
	 * @return true if the pattern is found in the inspected fragment
	 * @throws IOException never thrown by the current implementation
	 */
	public boolean checkXML(String str) throws IOException
	{
		char[] buff = new char[512];
		int count;
		
		try (BufferedReader reader = new BufferedReader(
				new InputStreamReader(new FileInputStream(currentFile), "UTF8"))) {
			count = reader.read(buff);
		}
		catch (IOException e) {
			e.printStackTrace();
			return false;
		}
		if (count < 0) return false;
		
		String fragment = new String(buff, 0, count);
		Pattern pattern = Pattern.compile("<\\?xml.+" + str, Pattern.DOTALL);
		
		return pattern.matcher(fragment).find();
	}
	
	/**
	 * Reads the next token with the routine matching the current format.
	 * 
	 * Before reading, the last result becomes the "previous" token, except
	 * that the sentence-end marker is replaced by null.
	 * 
	 * @return the next token, {@code Token.nullToken} at a sentence end, or
	 *         null when the format-specific routine returns null or the format has no reader
	 */
	public Token getNextToken()
	{
		previous = (current == Token.nullToken) ? null : current;
		
		if (format == INTERNAL_FORMAT) current = getTokenFromTGTFile();
		else if (format == TEXT_FORMAT) current = getTokenFromTextFile();
		else if (format == XCES_FORMAT) current = getTokenFromXCESFile();
		else if (format == TEI_FORMAT) current = getTokenFromTEIFile();
		else if (format == CONLLU_FORMAT) current = getTokenFromCONLLUFile();
		else current = null;
		
		return current;
	}
	
	/**
	 * Reads the next token from input in the internal (tab-separated) format.
	 * 
	 * Blank lines and lines with fewer than three columns are skipped. A line
	 * consisting of {@code eos} ends the sentence. When a fourth column equals
	 * {@code nps}, the previous token is marked as not followed by a space.
	 * 
	 * @return a token built from the first three columns,
	 *         {@code Token.nullToken} at a sentence end, or null at end of
	 *         input or after an exception (which is printed)
	 */
	public Token getTokenFromTGTFile()
	{
		try {
			if (eos) {
				// A sentence ended on the previous call: the next one starts here.
				sentenceStart = txtPosReader.getPos();
				eos = false;
			}
			for (String raw = textReader.readLine(); raw != null; raw = textReader.readLine()) {
				String line = raw.trim();
				
				if (line.isEmpty()) continue;
				if (line.equals("eos")) {
					eos = true;
					sentenceEnd = txtPosReader.getPos();
					return Token.nullToken;
				}
				
				String[] cols = line.split("\\t");
				
				if (cols.length < 3) continue;
				
				boolean noSpaceBefore = cols.length >= 4 && cols[3].equals("nps");
				
				if (noSpaceBefore && previous != null) previous.spaceAfter = false;
				return new Token(cols[0], cols[1], cols[2]);
			}
			// End of input also closes the current sentence.
			eos = true;
			sentenceEnd = txtPosReader.getPos();
			return null;
		}
		catch (Exception e) {
			e.printStackTrace();
		}
		return null;
	}
	
	/**
	 * Reads the next token from input in the text format.
	 * 
	 * A line starting with {@code %%} starts a new document: the previous
	 * token is forgotten, the document counter advances and the sentence
	 * offsets are zeroed. An empty line or a separator line ends the
	 * sentence. Other lines are parsed as {@code form#lemma#tag#} or, failing
	 * that, as at least three tab-separated columns; lines matching neither
	 * are skipped.
	 * 
	 * @return the parsed token, {@code Token.nullToken} at a sentence end, or
	 *         null at end of input or after an exception (which is printed)
	 */
	public Token getTokenFromTextFile()
	{
		try {
			if (eos) {
				sentenceStart = txtPosReader.getPos();
				eos = false;
			}
			for (String raw = textReader.readLine(); raw != null; raw = textReader.readLine()) {
				String line = raw.trim();
				
				if (line.startsWith("%%")) {
					previous = null;
					docNumber++;
					sentenceStart = sentenceEnd = 0;
					continue;
				}
				if (line.isEmpty() || line.matches("&\\s*#&\\s*#interp#") || line.matches("&\\t&\\tinterp.*")) {
					eos = true;
					sentenceEnd = txtPosReader.getPos();
					return Token.nullToken;
				}
				
				Matcher hashed = patt.matcher(line);
				
				if (hashed.matches()) {
					String form = hashed.group(1).trim();
					String lemma = hashed.group(2).trim();
					String tag = hashed.group(3).trim();
					
					return new Token(form, lemma, tag);
				}
				
				String[] cols = line.split("\\t");
				
				if (cols.length >= 3) return new Token(cols[0], cols[1], cols[2]);
			}
			eos = true;
			sentenceEnd = txtPosReader.getPos();
			return null;
		}
		catch (Exception e) {
			e.printStackTrace();
		}
		return null;
	}
	
	/** Finds {@code start_char=N|end_char=M} in the last CoNLL-U column. */
	private static final Pattern CHAR_SPAN = Pattern.compile("start_char=(\\d+)\\|end_char=(\\d+)");

	/**
	 * Reads the next token from CoNLL-U input.
	 * 
	 * An empty line ends the sentence. A non-numbered line containing
	 * {@code newdoc} starts a new document. Numbered lines are split on tabs:
	 * a range id ({@code n-m}) creates a multi-word token that is appended to
	 * the multi-word token list and reading continues; a plain integer id
	 * produces a UDToken, which is also added to the open multi-word token
	 * when its index falls inside that token's range.
	 * 
	 * Spacing: {@code SpaceAfter=No} in column 10 clears the space flag used
	 * for tokens built later in this call; otherwise, when the line carries
	 * character offsets and its start equals the stored end offset, the
	 * previous token is marked as not followed by a space.
	 * 
	 * Numbered lines with fewer than ten columns are now skipped, as the
	 * other line-oriented readers do with short lines. Previously such a line
	 * raised an exception that was caught and turned into a null result,
	 * which callers treat as end of input.
	 * 
	 * @return the next UDToken, {@code Token.nullToken} at a sentence end, or
	 *         null at end of input or after an exception (which is printed)
	 */
	public Token getTokenFromCONLLUFile()
	{
		String line;
		boolean spaceAfter = true;
		
		try {
			if (eos) {
				sentenceStart = txtPosReader.getPos();
				eos = false;
			}
			line = textReader.readLine();
			while (line != null) {
				line = line.trim();
				if (line.isEmpty()) {
					eos = true;
					sentenceEnd = txtPosReader.getPos();
					return Token.nullToken;
				}
				else if (line.matches("\\d+.*")) {
					String[] cols = line.split("\\t");
					
					// cols[9] is read unconditionally below, so short lines are ignored.
					if (cols.length >= 10) {
						if (cols[9].contains("SpaceAfter=No")) spaceAfter = false;
						else {
							Matcher m = CHAR_SPAN.matcher(cols[9]);
							
							if (m.find()) {
								int s = Integer.parseInt(m.group(1));
								int e = Integer.parseInt(m.group(2));
								
								// This token starts where the last one ended: no space between them.
								if (s == end_char) {
									if (previous != null) previous.spaceAfter = false;
								}
								end_char = e;
							}
						}
						if (cols[0].matches("\\d+\\-\\d+")) {
							String[] r = cols[0].split("\\-");
							
							mwt = new MultiWordToken(cols[1], spaceAfter, Integer.parseInt(r[0]), Integer.parseInt(r[1]));
							if (mwTokens == null) mwTokens = new LinkedList<MultiWordToken>();
							mwTokens.add(mwt);
						}
						else if (cols[0].matches("\\d+")) {
							UDToken t = new UDToken(cols[1], cols[2], cols[4], cols[3], cols[5].toLowerCase(), cols[7], 
								Integer.parseInt(cols[0]), Integer.parseInt(cols[6]), spaceAfter);
							
							if (mwt != null) {
								if (t.index >= mwt.startToken() && t.index <= mwt.endToken()) mwt.addToken(t);
								else mwt = null;
							}
							if (cols[9].contains("ner=S")) t.ner = 'S';
							else if (cols[9].contains("ner=B")) t.ner = 'B';
							else if (cols[9].contains("ner=I")) t.ner = 'I';
							else if (cols[9].contains("ner=E")) t.ner = 'E';
							return t;
						}
					}
				}
				else if (line.contains("newdoc")) {
					previous = null;
					mwt = null;
					docNumber++;
					sentenceStart = sentenceEnd = 0;
				}
				line = textReader.readLine();
			}
			eos = true;
			sentenceEnd = txtPosReader.getPos();
			return null;
		}
		catch (Exception e) {
			e.printStackTrace();
		}
		return null;
	}
	
	/**
	 * Reads XML events until the next token or sentence boundary in XCES
	 * input.
	 * 
	 * Values are gathered from the children of a {@code tok} element and a
	 * token is built when that element closes: a UDToken when form, lemma,
	 * ctag, upos, deprel and head position are all present and the head
	 * distance is positive; a plain Token when at least form, lemma and ctag
	 * are present; otherwise the element yields nothing and reading goes on.
	 * 
	 * @return the next token; {@code Token.nullToken} when a {@code chunk} or
	 *         {@code sentence} element closes; null when the stream has no
	 *         more events or an XMLStreamException occurs (it is printed)
	 */
	public Token getTokenFromXCESFile()
	{
		String form, lemma, ctag, UDPos, UDTag, UDRel, position, ner;
		int distance = 0;
		boolean disamb = false, morph = false;
		
		form = lemma = ctag = UDPos = UDTag = UDRel = position = ner = null;
		while (xmlReader.hasNext()) {
			try {
				XMLEvent event = xmlReader.nextEvent();
				StartElement startElement;
				EndElement endElement;
				String name;
				
				if (event.isStartElement()) {
			        startElement = event.asStartElement();
			        name = startElement.getName().getLocalPart();
			        if (name.equals("chunk")) {
						@SuppressWarnings("unchecked")
						Iterator<Attribute> it = startElement.getAttributes();
						
						// Only a chunk with type="s" starts a sentence.
						while (it.hasNext()) {
							Attribute attr = it.next();
							
							if (attr.getName().getLocalPart().equals("type")) {
								if (attr.getValue().equals("s")) {
									sentenceStart = xmlPosReader.getPos();
									previous = null;
									break;
								}
							}
						}
			        }
			        else if (name.equals("sentence")) {
			        	sentenceStart = xmlPosReader.getPos();
			        	tokenIndex = 0;
			        	previous = null;
			        }
			        else if (name.equals("tok")) {
						// New token: forget everything gathered for the previous one.
						form = lemma = ctag = UDPos = UDTag = UDRel = position = ner = null;
						distance = 0;
						tokenIndex++;
			        }
					else if (name.equals("orth")) {
						form = xmlReader.getElementText();
					}
					else if (name.equals("lex")) {
						@SuppressWarnings("unchecked")
						Iterator<Attribute> it = startElement.getAttributes();
						
						// base/ctag are taken only from a lex marked disamb="1".
						disamb = false;
						while (it.hasNext()) {
							Attribute attr = it.next();
							
							if (attr.getName().getLocalPart().equals("disamb")) {
								if (attr.getValue().equals("1")) {
									disamb = true;
									break;
								}
							}
						}
					}
					else if (name.equals("base")) {
						if (disamb) {
							lemma = xmlReader.getElementText();
							
							// Keep only the part before the first ':' unless the lemma is ":" itself.
							if (lemma.contains(":") && !lemma.equals(":")) lemma = lemma.split(":")[0];
						}
					}
					else if (name.equals("ctag")) {
						if (disamb) ctag = xmlReader.getElementText();
					}
					else if (name.equals("upos")) {
						UDPos = xmlReader.getElementText();
					}
					else if (name.equals("deprel")) {
						UDRel = xmlReader.getElementText();
					}
					else if (name.equals("xpos")) {
						ctag = xmlReader.getElementText();
					}
					else if (name.equals("lemma")) {
						lemma = xmlReader.getElementText();
						
						if (lemma.contains(":") && !lemma.equals(":")) lemma = lemma.split(":")[0];
					}
					else if (name.equals("head.distance")) {
						distance = Integer.valueOf(xmlReader.getElementText());
					}
					else if (name.equals("head.position")) {
						position = xmlReader.getElementText();
					}
					else if (name.equals("morph")) {
						morph = true;
					}
					else if (name.equals("ner")) {
						@SuppressWarnings("unchecked")
						Iterator<Attribute> it = startElement.getAttributes();
						
						while (it.hasNext()) {
							Attribute attr = it.next();
							
							if (attr.getName().getLocalPart().equals("mark")) {
								ner = attr.getValue();
							}
						}
					}
					else {
						// Any other element inside <morph> becomes a name=value
						// feature, joined with '|'.
						if (morph) {
							String val = name + "=" + xmlReader.getElementText();
							
							if (UDTag != null) UDTag += "|" + val;
							else UDTag = val;
						}
					}
				}
				else if (event.isEndElement()) {
					endElement = event.asEndElement();
					name = endElement.getName().getLocalPart();
					if (name.equals("chunk") || name.equals("sentence")) {
						sentenceEnd = xmlPosReader.getMark();
						return Token.nullToken;
					}
					else if (name.equals("tok")) {
						boolean createToken = false, createUDToken = false;
						
						disamb = false;
						if (form != null && lemma != null && ctag != null) {
							createToken = true;
							if (UDPos != null && UDRel != null && position != null && distance > 0) {
								createUDToken = true;
							}
						}
						if (createUDToken) {
							// Head index = own index plus/minus the distance,
							// depending on the head position value.
							int UDLink = tokenIndex;
							
							if (position.equals("right")) UDLink += distance;
							else UDLink -= distance;
							
							UDToken t = new UDToken(form, lemma, ctag, UDPos, UDTag, UDRel, tokenIndex, UDLink);
							
							if (ner != null) t.ner = ner.charAt(0);
							return t;
						}
						if (createToken) {
							Token t = new Token(form, lemma, ctag);
							
							if (ner != null) t.ner = ner.charAt(0);
							return t;
						}
					}
					else if (name.equals("lex")) {
						disamb = false;
					}
					else if (name.equals("morph")) {
						morph = false;
					}
					else if (name.equals("ns")) {
						// "ns" element: the previous token is not followed by a space.
						if (previous != null) previous.spaceAfter = false;
					}
				}
			} 
			catch (XMLStreamException e) {
				e.printStackTrace();
				break;
			}
		}
		return null;
	}
	
	/**
	 * Reads XML events until the next token or sentence end in TEI input.
	 * 
	 * An opening {@code s} element records the sentence start and clears the
	 * previous token. An {@code fs} element whose {@code type} attribute is
	 * {@code morph} is turned into a token by getMorph. A closing {@code s}
	 * element records the sentence end.
	 * 
	 * @return the next token, {@code Token.nullToken} at a sentence end, or
	 *         null when the stream has no more events or an
	 *         XMLStreamException occurs (it is printed)
	 */
	@SuppressWarnings("unchecked")
	public Token getTokenFromTEIFile()
	{
		while (xmlReader.hasNext()) {
			XMLEvent event;
			
			try {
				event = xmlReader.nextEvent();
			}
			catch (XMLStreamException e) {
				e.printStackTrace();
				return null;
			}
			
			if (event.isEndElement()) {
				if (event.asEndElement().getName().getLocalPart().equals("s")) {
					sentenceEnd = xmlPosReader.getMark();
					return Token.nullToken;
				}
				continue;
			}
			if (!event.isStartElement()) continue;
			
			StartElement start = event.asStartElement();
			String tag = start.getName().getLocalPart();
			
			if (tag.equals("s")) {
				sentenceStart = xmlPosReader.getPos();
				previous = null;
			}
			else if (tag.equals("fs")) {
				for (Iterator<Attribute> attrs = start.getAttributes(); attrs.hasNext(); ) {
					Attribute attr = attrs.next();
					boolean isMorph = attr.getName().getLocalPart().equals("type")
							&& attr.getValue().equals("morph");
					
					if (isMorph) {
						Token token = new Token();
						
						getMorph(token);
						return token;
					}
				}
			}
		}
		return null;
	}
	
	/**
	 * Fills a token from the content of a TEI {@code fs type="morph"}
	 * element whose start tag has already been consumed.
	 * 
	 * Each {@code f} child is dispatched on its {@code name} attribute:
	 * {@code orth}, {@code interps} and {@code disamb} are delegated to the
	 * matching helper; {@code nps} arms a flag so that a following
	 * {@code binary} element with {@code value="true"} marks the previous
	 * token as not followed by a space. Reading stops after {@code disamb}
	 * has been handled or when a closing {@code fs} is met. An
	 * XMLStreamException is printed and ends the method.
	 * 
	 * @param token token to fill
	 */
	@SuppressWarnings("unchecked")
	public void getMorph(Token token)
	{
		boolean npsPending = false;
		
		try {
			for (;;) {
				XMLEvent event = xmlReader.nextEvent();
				
				if (event.isEndElement()) {
					if (event.asEndElement().getName().getLocalPart().equals("fs")) return;
					continue;
				}
				if (!event.isStartElement()) continue;
				
				StartElement start = event.asStartElement();
				String tag = start.getName().getLocalPart();
				
				if (tag.equals("f")) {
					for (Iterator<Attribute> attrs = start.getAttributes(); attrs.hasNext(); ) {
						Attribute attr = attrs.next();
						
						if (!attr.getName().getLocalPart().equals("name")) continue;
						
						String value = attr.getValue();
						
						if (value.equals("orth")) getOrth(token);
						else if (value.equals("interps")) getInterps(token);
						else if (value.equals("nps")) npsPending = true;
						else if (value.equals("disamb")) {
							getDisamb(token);
							return;
						}
					}
				}
				else if (tag.equals("binary")) {
					for (Iterator<Attribute> attrs = start.getAttributes(); attrs.hasNext(); ) {
						Attribute attr = attrs.next();
						
						if (attr.getName().getLocalPart().equals("value")
								&& attr.getValue().equals("true") && npsPending) {
							npsPending = false;
							if (previous != null) previous.spaceAfter = false;
						}
					}
				}
			}
		}
		catch (XMLStreamException e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Reads the text of the first {@code string} element and stores it as the
	 * token's form. Stops without a change if the enclosing {@code f} closes
	 * first. An XMLStreamException is printed and ends the method.
	 * 
	 * @param token token whose form is set
	 */
	public void getOrth(Token token) 
	{
		try {
			for (;;) {
				XMLEvent event = xmlReader.nextEvent();
				
				if (event.isStartElement()) {
					if (event.asStartElement().getName().getLocalPart().equals("string")) {
						token.form = xmlReader.getElementText();
						return;
					}
				}
				else if (event.isEndElement()) {
					if (event.asEndElement().getName().getLocalPart().equals("f")) return;
				}
			}
		}
		catch (XMLStreamException e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Walks the content of an {@code f name="interps"} element and passes
	 * every nested {@code fs} whose {@code type} attribute is {@code lex} to
	 * getLex. Stops at the first closing {@code f} it sees itself. An
	 * XMLStreamException is printed and ends the method.
	 * 
	 * @param token token to fill
	 */
	@SuppressWarnings("unchecked")
	public void getInterps(Token token) 
	{
		try {
			for (;;) {
				XMLEvent event = xmlReader.nextEvent();
				
				if (event.isEndElement()) {
					if (event.asEndElement().getName().getLocalPart().equals("f")) return;
					continue;
				}
				if (!event.isStartElement()) continue;
				
				StartElement start = event.asStartElement();
				
				if (!start.getName().getLocalPart().equals("fs")) continue;
				for (Iterator<Attribute> attrs = start.getAttributes(); attrs.hasNext(); ) {
					Attribute attr = attrs.next();
					boolean isLex = attr.getName().getLocalPart().equals("type")
							&& attr.getValue().equals("lex");
					
					if (isLex) {
						getLex(token);
						break;
					}
				}
			}
		}
		catch (XMLStreamException e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Walks the content of an {@code fs type="lex"} element and dispatches
	 * each {@code f} child on its {@code name} attribute: {@code base} to
	 * getBase, {@code ctag} to getCTag, {@code msd} to getMSD. Stops at the
	 * first closing {@code fs} it sees itself. An XMLStreamException is
	 * printed and ends the method.
	 * 
	 * @param token token to fill
	 */
	@SuppressWarnings("unchecked")
	public void getLex(Token token)
	{
		try {
			for (;;) {
				XMLEvent event = xmlReader.nextEvent();
				
				if (event.isEndElement()) {
					if (event.asEndElement().getName().getLocalPart().equals("fs")) return;
					continue;
				}
				if (!event.isStartElement()) continue;
				
				StartElement start = event.asStartElement();
				
				if (!start.getName().getLocalPart().equals("f")) continue;
				for (Iterator<Attribute> attrs = start.getAttributes(); attrs.hasNext(); ) {
					Attribute attr = attrs.next();
					
					if (!attr.getName().getLocalPart().equals("name")) continue;
					
					String value = attr.getValue();
					
					if (value.equals("base")) {
						getBase(token);
						break;
					}
					else if (value.equals("ctag")) {
						// No break here: the remaining attributes are still inspected.
						getCTag(token);
					}
					else if (value.equals("msd")) {
						getMSD(token);
						break;
					}
				}
			}
		}
		catch (XMLStreamException e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Reads the text of the first {@code string} element and stores it as the
	 * token's lemma. Stops without a change if the enclosing {@code f} closes
	 * first. An XMLStreamException is printed and ends the method.
	 * 
	 * @param token token whose lemma is set
	 */
	public void getBase(Token token) 
	{
		try {
			for (;;) {
				XMLEvent event = xmlReader.nextEvent();
				
				if (event.isStartElement()) {
					if (event.asStartElement().getName().getLocalPart().equals("string")) {
						token.lemma = xmlReader.getElementText();
						return;
					}
				}
				else if (event.isEndElement()) {
					if (event.asEndElement().getName().getLocalPart().equals("f")) return;
				}
			}
		}
		catch (XMLStreamException e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Stores the {@code value} attribute of the first {@code symbol} element
	 * that has one as the token's ctag. Stops without a change if the
	 * enclosing {@code f} closes first. An XMLStreamException is printed and
	 * ends the method.
	 * 
	 * @param token token whose ctag is set
	 */
	@SuppressWarnings("unchecked")
	public void getCTag(Token token) 
	{
		try {
			for (;;) {
				XMLEvent event = xmlReader.nextEvent();
				
				if (event.isEndElement()) {
					if (event.asEndElement().getName().getLocalPart().equals("f")) return;
					continue;
				}
				if (!event.isStartElement()) continue;
				
				StartElement start = event.asStartElement();
				
				if (!start.getName().getLocalPart().equals("symbol")) continue;
				for (Iterator<Attribute> attrs = start.getAttributes(); attrs.hasNext(); ) {
					Attribute attr = attrs.next();
					
					if (attr.getName().getLocalPart().equals("value")) {
						token.ctag = attr.getValue();
						return;
					}
				}
			}
		}
		catch (XMLStreamException e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Appends {@code ":"} plus the {@code value} attribute of the first
	 * {@code symbol} element that has one to the token's ctag. Stops without
	 * a change if the enclosing {@code f} closes first. An
	 * XMLStreamException is printed and ends the method.
	 * 
	 * @param token token whose ctag is extended
	 */
	@SuppressWarnings("unchecked")
	public void getMSD(Token token)
	{
		try {
			for (;;) {
				XMLEvent event = xmlReader.nextEvent();
				
				if (event.isEndElement()) {
					if (event.asEndElement().getName().getLocalPart().equals("f")) return;
					continue;
				}
				if (!event.isStartElement()) continue;
				
				StartElement start = event.asStartElement();
				
				if (!start.getName().getLocalPart().equals("symbol")) continue;
				for (Iterator<Attribute> attrs = start.getAttributes(); attrs.hasNext(); ) {
					Attribute attr = attrs.next();
					
					if (attr.getName().getLocalPart().equals("value")) {
						token.ctag += ":" + attr.getValue();
						return;
					}
				}
			}
		}
		catch (XMLStreamException e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Walks the content of an {@code f name="disamb"} element. Inside the
	 * first nested {@code fs}, the first {@code f} whose {@code name}
	 * attribute is {@code interpretation} is passed to getInterpretation.
	 * 
	 * The method returns after that call, when the nested {@code fs} closes,
	 * or when a closing {@code f} is met before any {@code fs} was opened.
	 * An XMLStreamException is printed and ends the method.
	 * 
	 * @param token token to fill
	 */
	@SuppressWarnings("unchecked")
	public void getDisamb(Token token) 
	{
		try {
			for (;;) {
				XMLEvent event = xmlReader.nextEvent();
				
				if (event.isEndElement()) {
					if (event.asEndElement().getName().getLocalPart().equals("f")) return;
					continue;
				}
				if (!event.isStartElement()) continue;
				if (!event.asStartElement().getName().getLocalPart().equals("fs")) continue;
				
				// Inside the nested fs: every way out of this loop ends the method.
				for (;;) {
					event = xmlReader.nextEvent();
					if (event.isEndElement()) {
						if (event.asEndElement().getName().getLocalPart().equals("fs")) return;
						continue;
					}
					if (!event.isStartElement()) continue;
					
					StartElement feature = event.asStartElement();
					
					if (!feature.getName().getLocalPart().equals("f")) continue;
					for (Iterator<Attribute> attrs = feature.getAttributes(); attrs.hasNext(); ) {
						Attribute attr = attrs.next();
						boolean isInterpretation = attr.getName().getLocalPart().equals("name")
								&& attr.getValue().equals("interpretation");
						
						if (isInterpretation) {
							getInterpretation(token);
							return;
						}
					}
				}
			}
		}
		catch (XMLStreamException e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Reads the text of the first {@code string} element and splits it into
	 * the token's lemma and ctag.
	 * 
	 * A text starting with ':' is taken to have ':' as its lemma, with the
	 * ctag starting at the third character. Any other text is split at its
	 * first ':'. A text that cannot be split (a lone ":" or no ':' at all)
	 * now leaves the token unchanged, exactly as a missing {@code string}
	 * element does; previously it raised an uncaught
	 * StringIndexOutOfBoundsException.
	 * 
	 * Stops without a change if the enclosing {@code f} closes first. An
	 * XMLStreamException is printed and ends the method.
	 * 
	 * @param token token whose lemma and ctag are set
	 */
	public void getInterpretation(Token token)
	{
		XMLEvent event;
		boolean done = false;
		
		try {
			do {
				event = xmlReader.nextEvent();
				if (event.isStartElement()) {
					StartElement startElement = event.asStartElement();
					if (startElement.getName().getLocalPart().equals("string"))	{
						String str = xmlReader.getElementText();
						
						if (str.startsWith(":")) {
							// Needs the lemma ':' and the separator ':' at least.
							if (str.length() >= 2) {
								token.lemma = ":";
								token.ctag = str.substring(2);
							}
						}
						else {
							int index = str.indexOf(":");
							
							if (index >= 0) {
								token.lemma = str.substring(0, index);
								token.ctag = str.substring(index + 1);
							}
						}
						done = true;
					}
				}
				else if (event.isEndElement()) {
					EndElement endElement = event.asEndElement();
					
					if (endElement.getName().getLocalPart().equals("f")) done = true;
				}
			} while (!done);
		}
		catch (XMLStreamException e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Re-reads one sentence from a file region and parses it into tokens.
	 * 
	 * The region is decoded as UTF-8 and trimmed. For the XML formats it is
	 * wrapped in a {@code sentence} (XCES) or {@code s} (TEI) element so it
	 * can be parsed on its own.
	 * 
	 * The region is now read in a loop, because a single read call may return
	 * fewer bytes than requested. If seeking or reading fails, the exception
	 * is printed and the region is treated as empty, instead of passing a
	 * null string on to the parser.
	 * 
	 * @param file       file to read from
	 * @param fileFormat one of the {@code *_FORMAT} constants
	 * @param start      byte offset of the sentence
	 * @param len        length of the sentence in bytes
	 * @return the token list (null when no token was read) paired with the
	 *         multi-word tokens collected while reading (possibly null)
	 */
	public static Pair<LinkedList<Token>, LinkedList<MultiWordToken>> getSentence(RandomAccessFile file, int fileFormat, long start, int len)
	{
		String str = "";
		CorpusReader reader;
		
		try {
			byte[] b = new byte[len];
			int total = 0;
			
			file.seek(start);
			while (total < b.length) {
				int n = file.read(b, total, b.length - total);
				
				// End of file before len bytes: keep what was read.
				if (n < 0) break;
				total += n;
			}
			str = new String(b, 0, total, "UTF8").trim();
		}
		catch (Exception e) {
			e.printStackTrace();
		}
		if (fileFormat == XCES_FORMAT) str = "<sentence>" + str + "</sentence>";
		else if (fileFormat == TEI_FORMAT) str = "<s>" + str + "</s>";
		reader = new CorpusReader(fileFormat, str);
		return new Pair<LinkedList<Token>, LinkedList<MultiWordToken>>(loadTokens(reader), reader.getMultiWordTokens());
	}
	
	/**
	 * Collects tokens from the reader until it returns null or a token whose
	 * {@code stop()} is true; that last token is not included.
	 * 
	 * @param reader reader to pull tokens from
	 * @return the collected tokens, or null when none was collected
	 */
	public static LinkedList<Token> loadTokens(CorpusReader reader)
	{
		LinkedList<Token> tokens = new LinkedList<Token>();
		
		for (Token t = reader.getNextToken(); t != null && !t.stop(); t = reader.getNextToken()) {
			tokens.add(t);
		}
		return tokens.isEmpty() ? null : tokens;
	}
	
	/**
	 * Returns a copy of the phrase in which the component tokens of each
	 * multi-word token are replaced by the multi-word token itself.
	 * 
	 * For every multi-word token the copy is scanned from the start up to the
	 * last component (compared by identity): tokens from the first component
	 * onwards are removed and the last component is replaced. The input list
	 * is not modified.
	 * 
	 * A null list of multi-word tokens (which getMultiWordTokens() returns
	 * when none was read) now yields a plain copy, and a multi-word token
	 * with no components is skipped; both previously raised an exception.
	 * 
	 * @param phrase  tokens to copy
	 * @param mwtList multi-word tokens to substitute; may be null
	 * @return the new list
	 */
	public static LinkedList<Token> replaceMWT(LinkedList<? extends Token> phrase, LinkedList<MultiWordToken> mwtList)
	{
		LinkedList<Token> repl = new LinkedList<Token>();
		
		repl.addAll(phrase);
		if (mwtList == null) return repl;
		for (MultiWordToken mwt : mwtList) {
			if (mwt.getTokens().isEmpty()) continue;
			
			Token s = mwt.getTokens().getFirst();
			Token e = mwt.getTokens().getLast();
			ListIterator<Token> it = repl.listIterator();
			Token t = null;
			boolean remove = false;
			
			while (it.hasNext() && t != e) {
				t = it.next();
				if (t == s) remove = true;
				if (t == e) it.set(mwt);
				else if (remove) it.remove();
			}
		}
		return repl;
	}
	
	/**
	 * Reader wrapper that keeps a running position, counted in the number of
	 * bytes each delivered character occupies in UTF-8.
	 * 
	 * Every read call delivers at most one character, whatever length was
	 * requested - presumably so that a buffering reader placed on top cannot
	 * run ahead and the position stays in step with what has been consumed.
	 */
	private class TXTPosReader extends Reader
	{
		
	    private Reader internalReader;
	    /** Sum of the UTF-8 byte lengths of the characters delivered so far. */
	    private long pos = 0;
	    /** True after the first half of a surrogate pair has been counted. */
	    private boolean surrogate = false;
	    
	    /**
	     * @param internalReader reader that supplies the characters
	     */
	    public TXTPosReader(Reader internalReader) 
	    {
	        this.internalReader = internalReader;
	    }

	    /** @return the position reached so far */
	    public long getPos() 
	    {
	        return pos;
	    }
	    
		/**
		 * Reads at most one character into {@code cbuf[off]} and advances the
		 * position by its UTF-8 length: 1 for chars up to 0x7F, 2 up to
		 * 0x7FF, 3 otherwise. A surrogate pair counts 4 on its first char
		 * and 0 on its second.
		 * 
		 * A request for zero characters now returns 0 without touching the
		 * buffer or the source, as the Reader contract requires.
		 * 
		 * @return the number of characters read (0 or 1), or -1 at end of input
		 */
		@Override
		public int read(char[] cbuf, int off, int len) throws IOException 
		{
			if (len == 0) return 0;
			
	        int chars_read = internalReader.read(cbuf, off, 1);
	        int nbytes = 0;
	        
	        if (chars_read > 0) {
	        	if (surrogate) surrogate = false;
	        	else {
		        	char ch = cbuf[off];
		        	
		        	if (ch <= 0x7F) nbytes = 1;
		        	else if (ch <= 0x7FF) nbytes = 2;
		        	else if (Character.isSurrogate(ch)) {
		        		surrogate = true;
		        		nbytes = 4;
		        	}
		        	else nbytes = 3;
	        	}
	        }
	        pos += nbytes;
	        return chars_read;
		}

		/** Closes the wrapped reader. */
		@Override
		public void close() throws IOException 
		{
	        internalReader.close();
		}
		
	}
	
	/**
	 * Position-tracking reader that also remembers where the most recently
	 * delivered {@code '<'} character was located.
	 */
	private class XMLPosReader extends TXTPosReader 
	{
		
	    /** Position of the last {@code '<'} delivered (position after it, minus one). */
	    private long mark = 0; 
	    
	    /**
	     * @param internalReader reader that supplies the characters
	     */
	    public XMLPosReader(Reader internalReader) 
	    {
	        super(internalReader);
	    }
	    
	    /** @return the position recorded for the last {@code '<'} delivered */
	    public long getMark()
	    {
	    	return mark;
	    }
	    
		/**
		 * Delegates to the parent and, when the delivered character is
		 * {@code '<'}, records its position.
		 */
		@Override
		public int read(char[] cbuf, int off, int len) throws IOException 
		{
	        int delivered = super.read(cbuf, off, len);
	        boolean sawTagStart = delivered > 0 && cbuf[off] == '<';
	        
	        if (sawTagStart) mark = getPos() - 1;
	        return delivered;
		}
		
	}

}
