package termopl;

import java.io.*;
import java.util.*;

import javax.swing.JOptionPane;

public class ExtractorEngine extends Thread
{

	// NOTE(review): the string values below look like Universal Dependencies POS tags and
	// relation labels; the code that consumes these arrays is outside this excerpt -- confirm.
	// NOTE(review): public arrays are mutable by any caller; treat them as read-only.

	/** Part-of-speech tags listed as "head" categories. */
	public static final String[] HEAD_POS = {"NOUN", "PROPN"};
	/** Part-of-speech tags listed as "non-head" categories. */
	public static final String[] NON_HEAD_POS = {"ADJ", "ADP", "ADV", "DET", "NUM", "SCONJ"};
	/** Relation labels listed as obligatory. */
	public static final String[] OBLIGATORY_REL = {"case", "case:poss", "ccomp", "compound", "compound:prt", "det",
		"expl:pv", "fixed", "flat", "iobj", "obj", "amod:flat", "nmod:arg", "nmod:flat", 
		"obl:agent", "obl:arg", "xcomp"};
	/** Relation labels listed as facultative (optional). */
	public static final String[] FACULTATIVE_REL = {"acl", "advmod", "advmod:emph", "amod", "appos", "nmod", 
		"nmod:poss", "nummod", "nummod:gov", "obl"};
	/** Relation labels listed for head phrases. */
	public static final String[] HEAD_PHRASE_REL = {"amod", "amod:flat", "nummod"};
	/** Default value copied into {@code maxTermLength} at the start of {@code run()}. */
	public static final int MAX_TERM_LEN = 6;
	
	// Document this engine works for: source of preferences, templates, word lists and progress reporting.
	private TermoPLDocument doc;
	// Tagset taken from the document preferences; handed to every TermMatcher.
	private Tagset tagset;
	// Reader used to detect the file format and to pull tokens one at a time.
	private CorpusReader corpusReader;
	// Non-null only while runParser() is executing.
	private SentenceParser parser;
	// Non-null only when the NPMI method is enabled; used to sort bigrams in collectTerms().
	private BigramComp bigramComparator;
	// Input files; preprocessFiles() overwrites each entry with the file returned by preprocessFile().
	private File[] searchFiles;
	// Word lists and compound-preposition template obtained from the document (any may be null).
	private Set<String> stopWords;
	private Set<String> commonTerms;
	private Template compPreps;
	// Matchers built in run(); compprepMatcher stays null when there is no compound-preposition template.
	private TermMatcher matcher;
	private TermMatcher compprepMatcher;
	// Descriptors of the files handled by processFile().
	private LinkedList<FileDescr> analyzedFiles;
	// Fragments found by search(); consumed (and emptied) by collectTerms().
	private LinkedList<MatchedFragment> maxTerms;
	// Term key -> term; the final result set.
	private HashMap<String, Term> termMap;
	// Term key -> term for candidates held back by searchUD() (non-continuous phrases).
	private HashMap<String, Term> waitingMap;
	// Lemma -> occurrence count (NPMI method only).
	private HashMap<String, Integer> unigrams;
	// Bigram -> occurrence count; calcNPMI() replaces each count with the NPMI score (NPMI method only).
	private HashMap<Bigram, Float> bigrams;
	// Result array built from termMap at the end of run(); null if nothing was found or the run was aborted.
	private Term[] terms;
	// Running totals: tokens, bigrams and sentences seen by analyze().
	private int ntok;
	private int nbigrams;
	private int nsent;
	// Index of the file currently being processed; incremented after each processFile().
	private int fileID;
	// Determiner handling mode, initialised from doc.acceptDET() and possibly adjusted in preprocessFiles().
	private int acceptDET;
	// Counters filled by preprocessFile() when checkDetRatio is set.
	private int nouns;
	private int dets;
	private int maxTermLength; // UD only
	// True when the determiner/noun ratio has to be measured before deciding acceptDET.
	private boolean checkDetRatio;
	// Abort flags checked throughout the processing loops.
	private boolean cancelled;
	private boolean error;
	// When false, searchUD() parks non-continuous phrases in waitingMap instead of termMap.
	private boolean allowDiscontinuities;
	/***********************************/
	//private PrintWriter annotation;
	
	/**
	 * Creates an engine bound to a document and a set of input files.
	 * The tagset, stop words, compound prepositions and common terms are all
	 * read from the document; a bigram comparator is created only when the
	 * NPMI method is enabled in the preferences.
	 *
	 * @param doc         document supplying preferences and word lists
	 * @param searchFiles files to be searched (stored as given)
	 */
	public ExtractorEngine(TermoPLDocument doc, File[] searchFiles)
	{
		this.doc = doc;
		this.searchFiles = searchFiles;

		tagset = doc.getPreferences().getTagset();
		stopWords = doc.getStopWords();
		compPreps = doc.getCompoundPrepositions();
		commonTerms = doc.getCommonTerms();

		corpusReader = new CorpusReader();
		bigramComparator = doc.getPreferences().useNPMIMethod ? new BigramComp() : null;

		// Explicitly reset state that is only populated later.
		parser = null;
		termMap = null;
		allowDiscontinuities = false;
	}
	
	/**
	 * Thread entry point: resets all counters and working collections,
	 * preprocesses and processes the input files, and finally collects the
	 * term candidates into {@code terms}.
	 * <p>
	 * On cancellation or error every working collection (and {@code terms})
	 * is set to {@code null}. On error the document is interrupted and the
	 * problem is reported on {@code System.err} (batch mode) or in a dialog.
	 */
	public void run()
	{
		/*
		try {
			annotation = new PrintWriter("annotation.txt", "UTF8");
		} catch (FileNotFoundException | UnsupportedEncodingException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}*/
		acceptDET = doc.acceptDET();
		checkDetRatio = false;
		cancelled = false;
		error = false;
		nouns = 0;
		dets = 0;
		maxTermLength = MAX_TERM_LEN;
		ntok = nsent = nbigrams = 0;
		fileID = 0;
		// The result array is rebuilt from termMap at the end of the run.
		// (The former "terms = doc.getTerms()" here was a dead store: it was
		// overwritten with null before anything could read it.)
		terms = null;
		matcher = new TermMatcher(doc.getSearchTemplate(), tagset);
		if (compPreps != null) 
			compprepMatcher = new TermMatcher(doc.getCompoundPrepositions(), tagset);
		else compprepMatcher = null;
		maxTerms = new LinkedList<MatchedFragment>();
		// Unigram/bigram statistics are needed only for the NPMI method.
		if (doc.getPreferences().useNPMIMethod) {
			unigrams = new HashMap<String, Integer>(100000);
			bigrams = new HashMap<Bigram, Float>(400000);
		}
		else {
			unigrams = null;
			bigrams = null;
		}
		if (doc.getPreferences().useUD) doc.changeProgress(2);
		else doc.changeProgress(1);
		termMap = new HashMap<String, Term>();
		waitingMap = new HashMap<String, Term>();
		preprocessFiles();
		if (!cancelled && !error) processFiles();
		if (error) {
			doc.interrupt();
			if (TermoPL.batchMode) System.err.println("Incompatible data format with the selected search method.");
			else JOptionPane.showMessageDialog(TermoPL.dialogOwner, "Incompatible file format with the selected search method.", "Error", JOptionPane.ERROR_MESSAGE);
		}
		if (cancelled || error) {
			// Aborted: drop every partial result.
			termMap = null;
			waitingMap = null;
			maxTerms = null;
			unigrams = null;
			bigrams = null;
			terms = null;
		}
		else {
			if (doc.getPreferences().useNPMIMethod) calcNPMI();
			doc.changeProgress(4);
			// Seed the map with the terms the document already holds so that
			// new occurrences are merged into them.
			if (doc.getTerms() != null) {
				boolean bf = doc.getPreferences().calculateBaseForms;
				
				for (Term t : doc.getTerms()) {
					String key = (bf ? doc.calcSimplifiedForm(t) : t.str);
					
					termMap.put(key, t);
				}
			}
			collectTerms();
			maxTerms = null;
			unigrams = null;
			bigrams = null;
			if (termMap.size() > 0)
				terms = termMap.values().toArray(new Term[0]);
			waitingMap = null;
			System.gc();
		}
//		annotation.flush();
//		annotation.close();
	}
	
	/**
	 * Drains {@code maxTerms}: every matched fragment is registered as a term
	 * candidate and, when it has more than one token, is further broken down
	 * by the strategy selected in the preferences (one of the NPMI variants,
	 * left-to-right trimming, or the default two-sided trimming).
	 * Progress is reported every 1000 fragments; a garbage collection is
	 * requested every 25000 progress reports. Stops early when cancelled.
	 */
	public void collectTerms()
	{
		final int total = maxTerms.size();
		int processed = 0;
		int sinceReport = 0;
		int reportsSinceGc = 0;

		report(0, 0.0F);
		while (!maxTerms.isEmpty()) {
			MatchedFragment fragment = maxTerms.removeFirst();
			SentenceRef ref = fragment.getRef();
			int docID = fragment.getDocID();

			if (cancelled) break;

			LinkedList<MatchedToken> matched = fragment.getMatchedTokens();
			int size = matched.size();

			// The maximal phrase itself has no surrounding context.
			collectTermCandidate(matched, docID, ref, null, null);

			if (size > 1) {
				LinkedList<Token> source = fragment.getTokens();
				Range whole = new Range(0, size - 1);
				Preferences prefs = doc.getPreferences();

				if (prefs.useNPMIMethod) {
					// One bigram per adjacent token pair; the third argument is the
					// 1-based split position between the two tokens.
					Bigram[] pairs = new Bigram[size - 1];
					Token previous = null;
					int n = 0;

					for (MatchedToken mt : matched) {
						Token current = mt.token;

						if (previous != null) {
							pairs[n] = new Bigram(previous.lemma, current.lemma, n + 1);
							n++;
						}
						previous = current;
					}
					if (pairs.length > 1) Arrays.sort(pairs, bigramComparator);

					int method = prefs.NPMIMethod;

					if (method == 1) collect_NPMI1(matched, source, docID, ref, pairs, whole);
					else if (method == 2) collect_NPMI2(matched, source, docID, ref, pairs, whole);
					else if (method == 3) collect_NPMI3(matched, source, docID, ref, pairs, whole);
				}
				else if (prefs.trimFromLeftToRight) {
					collectLR(matched, docID, ref);
				}
				else {
					collect(matched, source, docID, ref, whole, new LinkedList<Range>());
				}
			}

			processed++;
			sinceReport++;
			if (sinceReport >= 1000) {
				sinceReport = 0;
				reportsSinceGc++;
				if (reportsSinceGc >= 25000) {
					reportsSinceGc = 0;
					System.gc();
				}
				report(termMap.size(), (float)processed / (float)total);
			}
		}
		report(termMap.size(), 1.0F);
	}
	
	/**
	 * Builds the lookup key for a sequence of matched tokens by concatenating
	 * their lemmas. A single space is inserted before a lemma when the
	 * preceding token has {@code spaceAfter} set; otherwise the lemma is glued
	 * to the previous one.
	 *
	 * @param tokenList matched tokens, in order
	 * @return the key together with the number of space-separated parts
	 */
	public KeyLen getKeyFromMatchedTokens(LinkedList<MatchedToken> tokenList)
	{
		StringBuilder key = new StringBuilder();
		int parts = 0;
		Token previous = null;

		for (MatchedToken mt : tokenList) {
			Token current = mt.token;
			boolean startsNewPart = (previous == null) || previous.spaceAfter;

			if (startsNewPart) {
				if (previous != null) key.append(" ");
				parts++;
			}
			key.append(current.lemma);
			previous = current;
		}
		return new KeyLen(key.toString(), parts);
	}
	
	/**
	 * Builds the lookup key for a sequence of tokens by concatenating their
	 * lemmas. A space is inserted before a lemma when the preceding token has
	 * {@code spaceAfter} set or when the token indices are not consecutive
	 * (the current index is more than one past the previous one).
	 * The index is {@code UDToken.index} for UD tokens; every other token is
	 * cast to {@code MultiWordToken} and its {@code endToken()} is used.
	 *
	 * @param ph tokens, in order
	 * @return the key together with the number of space-separated parts
	 */
	public KeyLen getKeyFromTokens(LinkedList<? extends Token> ph)
	{
		StringBuilder key = new StringBuilder();
		int parts = 0;
		int lastIndex = -1;
		Token last = null;

		for (Token current : ph) {
			int index;

			if (current instanceof UDToken) index = ((UDToken) current).index;
			else index = ((MultiWordToken) current).endToken();

			boolean startsNewPart = (last == null) || last.spaceAfter || index > lastIndex + 1;

			if (startsNewPart) {
				if (last != null) key.append(" ");
				parts++;
			}
			key.append(current.lemma);
			last = current;
			lastIndex = index;
		}
		return new KeyLen(key.toString(), parts);
	}
	
	/**
	 * Registers one occurrence of a term candidate.
	 * The parent link of every matched token is cleared first. The key is
	 * derived from the lemmas (lower-cased when {@code ignoreCase} is set);
	 * candidates whose key is listed in {@code commonTerms} are ignored.
	 * Otherwise the term is looked up in (or added to) {@code termMap} and the
	 * context, the form and, when indexing is enabled, the sentence reference
	 * are recorded on it.
	 *
	 * @param tokenList    matched tokens of the candidate
	 * @param docID        document identifier passed on to the term
	 * @param sr           sentence reference, stored only when {@code makeIndex} is set
	 * @param leftContext  lemma to the left of the candidate, or {@code null}
	 * @param rightContext lemma to the right of the candidate, or {@code null}
	 */
	public void collectTermCandidate(LinkedList<MatchedToken> tokenList, int docID, SentenceRef sr, String leftContext, String rightContext)
	{
		Iterator<MatchedToken> it = tokenList.iterator();

		while (it.hasNext()) it.next().parent = null;

		KeyLen keylen = getKeyFromMatchedTokens(tokenList);
		Preferences prefs = doc.getPreferences();

		if (prefs.ignoreCase) keylen.key = keylen.key.toLowerCase();

		String key = keylen.key;

		if (commonTerms != null && commonTerms.contains(key)) return;

		Term term = termMap.get(key);
		// A candidate with any context was found inside a longer phrase.
		boolean inner = !(leftContext == null && rightContext == null);

		if (term == null) {
			term = prefs.makeGroups ? new TermEx(key, keylen.len) : new Term(key, keylen.len);
			termMap.put(key, term);
		}
		term.addContext(leftContext, rightContext);
		term.addForm(Form.createFromMatchedTokens(tokenList), docID, inner, prefs.collectAllForms);
		if (prefs.makeIndex) term.addSentenceRef(sr);
	}
	
	/**
	 * Removes tokens from the right end of {@code tokenList} until a valid cut
	 * point is reached or only one token is left. Each removed token is
	 * prepended to {@code removedList} and {@code r_out.right} is decremented.
	 * A cut point is valid when the removed token's {@code ner} is neither
	 * 'E' nor 'I' and the new last token has {@code spaceAfter} set.
	 *
	 * @param r_in        not used by this method
	 * @param r_out       range whose right bound is moved with every removal
	 * @param tokenList   list to shorten (modified in place)
	 * @param removedList receives the removed tokens in their original order
	 * @return {@code true} if a valid cut point was found
	 */
	public boolean cutRight(Range r_in, Range r_out, LinkedList<MatchedToken> tokenList, LinkedList<MatchedToken> removedList)
	{
		while (tokenList.size() > 1) {
			r_out.right--;

			MatchedToken removed = tokenList.removeLast();

			removedList.addFirst(removed);

			boolean nerIsEOrI = (removed.token.ner == 'E' || removed.token.ner == 'I');

			if (!nerIsEOrI && tokenList.getLast().token.spaceAfter) return true;
		}
		return false;
	}
	
	/**
	 * Removes tokens from the left end of {@code tokenList} until a valid cut
	 * point is reached or only one token is left. Each removed token is
	 * appended to {@code removedList} and {@code r_out.left} is incremented.
	 * A cut point is valid when the removed token has {@code spaceAfter} set
	 * and its {@code ner} is neither 'B' nor 'I'.
	 *
	 * @param r_in        not used by this method
	 * @param r_out       range whose left bound is moved with every removal
	 * @param tokenList   list to shorten (modified in place)
	 * @param removedList receives the removed tokens in their original order
	 * @return {@code true} if a valid cut point was found
	 */
	public boolean cutLeft(Range r_in, Range r_out, LinkedList<MatchedToken> tokenList, LinkedList<MatchedToken> removedList)
	{
		while (tokenList.size() > 1) {
			r_out.left++;

			MatchedToken removed = tokenList.removeFirst();

			removedList.add(removed);

			boolean nerIsBOrI = (removed.token.ner == 'B' || removed.token.ner == 'I');

			if (removed.token.spaceAfter && !nerIsBOrI) return true;
		}
		return false;
	}
	
	/**
	 * Range-less variant of the left cut: removes tokens from the left end of
	 * {@code tokenList}, appending each to {@code removedList}, until the
	 * removed token has {@code spaceAfter} set and a {@code ner} that is
	 * neither 'B' nor 'I', or until only one token is left.
	 *
	 * @param tokenList   list to shorten (modified in place)
	 * @param removedList receives the removed tokens in their original order
	 * @return {@code true} if a valid cut point was found
	 */
	public boolean cutLeft(LinkedList<MatchedToken> tokenList, LinkedList<MatchedToken> removedList)
	{
		while (tokenList.size() > 1) {
			MatchedToken removed = tokenList.removeFirst();

			removedList.add(removed);

			boolean nerIsBOrI = (removed.token.ner == 'B' || removed.token.ner == 'I');

			if (removed.token.spaceAfter && !nerIsBOrI) return true;
		}
		return false;
	}
	
	/**
	 * Splits the part of {@code tokenList} covered by {@code r} in two.
	 * The bigrams are examined in array order; the first one whose split
	 * position lies inside the range ({@code r.left < div <= r.right}) and
	 * whose left-hand token has {@code spaceAfter} set and a {@code ner} that
	 * is neither 'B' nor 'I' determines the split. Tokens at positions
	 * {@code r.left .. div-1} go to {@code phr1}, those at
	 * {@code div .. r.right} go to {@code phr2}.
	 *
	 * @param r         range (token positions, inclusive) to split
	 * @param bgs       candidate bigrams in the order they should be tried
	 * @param tokenList tokens of the whole fragment
	 * @param phr1      receives the left part
	 * @param phr2      receives the right part
	 * @return the split position, or 0 when no admissible split exists
	 */
	public int cut(Range r, Bigram[] bgs, LinkedList<MatchedToken> tokenList, LinkedList<MatchedToken> phr1, LinkedList<MatchedToken> phr2)
	{
		int div = 0;
		boolean found = false;

		for (int k = 0; k < bgs.length && !found; k++) {
			div = bgs[k].div;
			if (div <= r.left || div > r.right) continue;

			Token leftOfSplit = tokenList.get(div - 1).token;

			found = leftOfSplit.ner != 'B' && leftOfSplit.ner != 'I' && leftOfSplit.spaceAfter;
		}
		if (!found) return 0;

		int index = 0;

		for (MatchedToken mt : tokenList) {
			if (index > r.right) break;
			if (index >= r.left) {
				LinkedList<MatchedToken> target = (index < div) ? phr1 : phr2;

				target.add(mt);
			}
			index++;
		}
		return div;
	}
	
	/**
	 * Recursively enumerates the sub-phrases of a matched fragment by trimming
	 * it from the right and from the left, registering every trimmed phrase
	 * that still matches the search template as a term candidate.
	 * <p>
	 * {@code ranges} records the ranges already visited so that a sub-range
	 * reachable by several trimming paths is handled once.
	 * NOTE(review): this relies on {@code Range} providing value-based
	 * {@code equals}; that class is not visible here -- confirm.
	 * <p>
	 * {@code tokenList} is modified while trimming but restored before the
	 * method returns.
	 *
	 * @param tokenList tokens of the current phrase (temporarily modified)
	 * @param source    tokens of the full fragment, used to look up context lemmas
	 * @param docID     document identifier passed on to the term
	 * @param sr        sentence reference passed on to the term
	 * @param r         range of the current phrase; it is stored in {@code ranges}
	 * @param ranges    ranges visited so far
	 */
	public void collect(LinkedList<MatchedToken> tokenList, LinkedList<Token> source, int docID, SentenceRef sr, Range r, LinkedList<Range> ranges)
	{
		ranges.add(r);
		if (tokenList.size() > 1) {
			TermMatcher tm = new TermMatcher(doc.getSearchTemplate(), tagset);
			LinkedList<MatchedToken> removedList = new LinkedList<MatchedToken>();
			String leftContext, rightContext;
			Range rg = new Range(r.left, r.right);
			int s = source.size();
			
			// --- trim from the right ---
			if (cutRight(r, rg, tokenList, removedList)) {
				if (!ranges.contains(rg)) {
					tm.setMatchedTokens(tokenList);
					if (tm.match()) {
						if (rg.left <= 0) leftContext = null;
						else leftContext = source.get(rg.left - 1).lemma;
						if (rg.right >= s - 1) rightContext = null;
						else rightContext = source.get(rg.right + 1).lemma;
						collectTermCandidate(tm.getMatchedFragment().getMatchedTokens(), docID, sr, leftContext, rightContext);
					}
					collect(tokenList, source, docID, sr, rg, ranges);
				}
			}
			// Put the trimmed tokens back before trimming the other side.
			tokenList.addAll(removedList);
			removedList.clear();
			// Use a fresh Range for the left cut. The previous object may have been
			// stored in 'ranges' by the recursive call above; mutating and reusing it
			// would both corrupt that visited entry and make the contains() check
			// below trivially true, so the left cut would never be explored.
			rg = new Range(r.left, r.right);
			// --- trim from the left ---
			if (cutLeft(r, rg, tokenList, removedList)) {
				if (!ranges.contains(rg)) {
					tm.setMatchedTokens(tokenList);
					if (tm.match()) {
						if (rg.left <= 0) leftContext = null;
						else leftContext = source.get(rg.left - 1).lemma;
						if (rg.right >= s - 1) rightContext = null;
						else rightContext = source.get(rg.right + 1).lemma;
						collectTermCandidate(tm.getMatchedFragment().getMatchedTokens(), docID, sr, leftContext, rightContext);
					}
					collect(tokenList, source, docID, sr, rg, ranges);
				}
			}
			tokenList.addAll(0, removedList);
		}
	}
	
	/**
	 * Enumerates sub-phrases by repeatedly trimming the phrase from the left.
	 * After each successful cut the remaining tokens are matched against the
	 * search template; a match is registered as a term candidate whose left
	 * context is the lemma of the last removed token. The removed tokens are
	 * put back before returning, so {@code tokenList} is left unchanged.
	 *
	 * @param tokenList tokens of the current phrase (temporarily modified)
	 * @param docID     document identifier passed on to the term
	 * @param sr        sentence reference passed on to the term
	 */
	public void collectLR(LinkedList<MatchedToken> tokenList, int docID, SentenceRef sr)
	{
		if (tokenList.size() <= 1) return;

		TermMatcher tm = new TermMatcher(doc.getSearchTemplate(), tagset);
		LinkedList<MatchedToken> trimmed = new LinkedList<MatchedToken>();

		if (cutLeft(tokenList, trimmed)) {
			tm.setMatchedTokens(tokenList);
			if (tm.match()) {
				String leftContext = trimmed.getLast().token.lemma;

				collectTermCandidate(tm.getMatchedFragment().getMatchedTokens(), docID, sr, leftContext, null);
			}
			collectLR(tokenList, docID, sr);
		}
		// Restore the original phrase.
		tokenList.addAll(0, trimmed);
	}
	
	/**
	 * NPMI variant 1: splits the range at the position chosen by
	 * {@link #cut}, registers each half that matches the search template as a
	 * term candidate (with the neighbouring lemmas from {@code source} as
	 * context) and recurses into both halves. Nothing happens when the range
	 * has fewer than two tokens or no admissible split exists.
	 *
	 * @param tokenList tokens of the whole fragment
	 * @param source    tokens used to look up context lemmas
	 * @param docID     document identifier passed on to the term
	 * @param sr        sentence reference passed on to the term
	 * @param bgs       bigrams in the order in which splits are tried
	 * @param r         range (inclusive token positions) to process
	 */
	public void collect_NPMI1(LinkedList<MatchedToken> tokenList, LinkedList<Token> source, int docID, SentenceRef sr, Bigram[] bgs, Range r)
	{
		if (r.left >= r.right) return;

		LinkedList<MatchedToken> leftPart = new LinkedList<MatchedToken>();
		LinkedList<MatchedToken> rightPart = new LinkedList<MatchedToken>();
		int div = cut(r, bgs, tokenList, leftPart, rightPart);

		if (div <= 0) return;

		TermMatcher tm = new TermMatcher(doc.getSearchTemplate(), tagset);
		int lastIndex = source.size() - 1;
		String before, after;

		// Left half: positions r.left .. div-1.
		Range half = new Range(r.left, div - 1);

		tm.setMatchedTokens(leftPart);
		if (tm.match()) {
			before = (half.left <= 0) ? null : source.get(half.left - 1).lemma;
			after = (half.right >= lastIndex) ? null : source.get(half.right + 1).lemma;
			leftPart = tm.getMatchedFragment().getMatchedTokens();
			collectTermCandidate(leftPart, docID, sr, before, after);
		}
		collect_NPMI1(tokenList, source, docID, sr, bgs, half);

		// Right half: positions div .. r.right.
		half = new Range(div, r.right);
		tm.setMatchedTokens(rightPart);
		if (tm.match()) {
			before = (half.left <= 0) ? null : source.get(half.left - 1).lemma;
			after = (half.right >= lastIndex) ? null : source.get(half.right + 1).lemma;
			rightPart = tm.getMatchedFragment().getMatchedTokens();
			collectTermCandidate(rightPart, docID, sr, before, after);
		}
		collect_NPMI1(tokenList, source, docID, sr, bgs, half);
	}
	
	/**
	 * NPMI variant 2: tries the split positions inside {@code r} in the order
	 * given by {@code bgs} until a split is found for which at least one half
	 * matches the search template. Matching halves are registered as term
	 * candidates; both halves of that split are then processed recursively.
	 * If no split yields a matching half, the method recurses into the two
	 * halves of the first split that was tried.
	 *
	 * @param tokenList tokens of the whole fragment
	 * @param source    tokens used to look up context lemmas
	 * @param docID     document identifier passed on to the term
	 * @param sr        sentence reference passed on to the term
	 * @param bgs       bigrams in the order in which splits are tried
	 * @param r         range (inclusive token positions) to process
	 */
	public void collect_NPMI2(LinkedList<MatchedToken> tokenList, LinkedList<Token> source, int docID, SentenceRef sr, Bigram[] bgs, Range r)
	{
		if (r.left < r.right) {
			TermMatcher tm = new TermMatcher(doc.getSearchTemplate(), tagset);
			LinkedList<MatchedToken> phr1 = null;
			LinkedList<MatchedToken> phr2 = null;
			String leftContext, rightContext;
			// rg1/rg2: halves of the split currently tried; frg1/frg2: halves of the first split tried.
			Range rg1 = null, rg2 = null, frg1 = null, frg2 = null;
			int i, pos = 0, div = 1, s = source.size();
			boolean firstTry = true;
			
			// Keep trying further splits while neither half matched.
			while (phr1 == null && phr2 == null) {
				boolean found = false;
				
				// Advance to the next bigram whose split position lies inside the range.
				for (; pos < bgs.length; pos++) {
					div = bgs[pos].div;
					if (div > r.left && div <= r.right) {
						found = true;
						pos++;
						break;
					}
				}
				if (!found) break;
				else {
					rg1 = new Range(r.left, div - 1);
					rg2 = new Range(div, r.right);
					if (firstTry) {
						frg1 = rg1;
						frg2 = rg2;
						firstTry = false;
					}
					// Distribute the tokens of the range over the two halves.
					phr1 = new LinkedList<MatchedToken>();
					phr2 = new LinkedList<MatchedToken>();
					i = 0;
					for (MatchedToken mt : tokenList) {
						if (i > r.right) break;
						else if (i >= r.left) {
							if (i < div) phr1.add(mt);
							else phr2.add(mt);
						}
						i++;
					}
					// A half that does not match the template is marked with null.
					tm.setMatchedTokens(phr1);
					if (tm.match()) phr1 = tm.getMatchedFragment().getMatchedTokens();
					else phr1 = null;
					tm.setMatchedTokens(phr2);
					if (tm.match()) phr2 = tm.getMatchedFragment().getMatchedTokens();
					else phr2 = null;
					if (phr1 != null || phr2 != null) {
						// Both halves are recursed into whether or not they matched;
						// only matching halves are registered as candidates.
						if (phr1 != null) {
							if (rg1.left <= 0) leftContext = null;
							else leftContext = source.get(rg1.left - 1).lemma;
							if (rg1.right >= s - 1) rightContext = null;
							else rightContext = source.get(rg1.right + 1).lemma;
							collectTermCandidate(phr1, docID, sr, leftContext, rightContext);
							collect_NPMI2(tokenList, source, docID, sr, bgs, rg1);
						}
						else collect_NPMI2(tokenList, source, docID, sr, bgs, rg1);
						if (phr2 != null) {
							if (rg2.left <= 0) leftContext = null;
							else leftContext = source.get(rg2.left - 1).lemma;
							if (rg2.right >= s - 1) rightContext = null;
							else rightContext = source.get(rg2.right + 1).lemma;
							collectTermCandidate(phr2, docID, sr, leftContext, rightContext);
							collect_NPMI2(tokenList, source, docID, sr, bgs, rg2);
						}
						else collect_NPMI2(tokenList, source, docID, sr, bgs, rg2);
					}
				}
			}
			// No split produced a matching half: fall back to the first split tried.
			// NOTE(review): frg1/frg2 stay null if no split position was found at all;
			// this presumes bgs always holds a position inside a range of two or more tokens -- confirm.
			if (phr1 == null && phr2 == null) {
				collect_NPMI2(tokenList, source, docID, sr, bgs, frg1);
				collect_NPMI2(tokenList, source, docID, sr, bgs, frg2);
			}
		}
	}
	
	/**
	 * NPMI variant 3: tries the split positions inside {@code r} in the order
	 * given by {@code bgs} until a split is found for which BOTH halves match
	 * the search template. The first half that matched on the way (the "good"
	 * phrase {@code phr0}) is remembered together with the score of its split.
	 * <ul>
	 * <li>If a split with two matching halves is found and a good phrase was
	 *     found in an earlier iteration, the two scores are compared (scaled by
	 *     the {@code NPMIfactor} preference) to decide which split to use.</li>
	 * <li>If no such split exists but a good phrase was found, the good
	 *     phrase's split is used.</li>
	 * <li>Otherwise the method recurses into the halves of the first split tried.</li>
	 * </ul>
	 *
	 * @param tokenList tokens of the whole fragment
	 * @param source    tokens used to look up context lemmas
	 * @param docID     document identifier passed on to the term
	 * @param sr        sentence reference passed on to the term
	 * @param bgs       bigrams in the order in which splits are tried
	 * @param r         range (inclusive token positions) to process
	 */
	public void collect_NPMI3(LinkedList<MatchedToken> tokenList, LinkedList<Token> source, int docID, SentenceRef sr, Bigram[] bgs, Range r)
	{
		if (r.left < r.right) {
			TermMatcher tm = new TermMatcher(doc.getSearchTemplate(), tagset);
			// phr0: first matching half found so far; phr1/phr2: halves of the current split.
			LinkedList<MatchedToken> phr0 = null;
			LinkedList<MatchedToken> phr1 = null;
			LinkedList<MatchedToken> phr2 = null;
			String leftContext, rightContext;
			// rg1/rg2: current split; frg1/frg2: first split tried;
			// brg0: range of phr0; brg1/brg2: halves of the split that produced phr0.
			Range rg1 = null, rg2 = null, frg1 = null, frg2 = null, brg0 = null, brg1 = null, brg2 = null;
			int i, pos = 0, div = 1, s = source.size();
			float goodNPMI = 0.0F, betterNPMI;
			boolean checkGoodPhrase = false, firstTry = true;
			
			// Keep trying further splits until both halves match.
			while (phr1 == null || phr2 == null) {
				// Set once a further iteration starts after phr0 has been found.
				if (!checkGoodPhrase && phr0 != null) checkGoodPhrase = true;
				
				boolean found = false;
				
				// Advance to the next bigram whose split position lies inside the range.
				for (; pos < bgs.length; pos++) {
					div = bgs[pos].div;
					if (div > r.left && div <= r.right) {
						found = true;
						pos++;
						break;
					}
				}
				if (!found) break;
				else {
					rg1 = new Range(r.left, div - 1);
					rg2 = new Range(div, r.right);
					if (firstTry) {
						frg1 = rg1;
						frg2 = rg2;
						firstTry = false;
					}
					// Distribute the tokens of the range over the two halves.
					phr1 = new LinkedList<MatchedToken>();
					phr2 = new LinkedList<MatchedToken>();
					i = 0;
					for (MatchedToken mt : tokenList) {
						if (i > r.right) break;
						else if (i >= r.left) {
							if (i < div) phr1.add(mt);
							else phr2.add(mt);
						}
						i++;
					}
					tm.setMatchedTokens(phr1);
					if (tm.match()) {
						phr1 = tm.getMatchedFragment().getMatchedTokens();
						if (phr0 == null) {
							// First matching half seen: remember it and the score of this split
							// (pos was already advanced, hence pos - 1).
							goodNPMI = bigrams.get(bgs[pos - 1]);
							phr0 = phr1;
							brg0 = brg1 = rg1;
							brg2 = rg2;
						}
					}
					else {
						if (phr0 == null) {
							brg1 = rg1;
							brg2 = rg2;
						}
						phr1 = null;
					}
					tm.setMatchedTokens(phr2);
					if (tm.match()) {
						phr2 = tm.getMatchedFragment().getMatchedTokens();
						if (phr0 == null) {
							goodNPMI = bigrams.get(bgs[pos - 1]);
							phr0 = phr2;
							brg0 = brg2 = rg2;
							brg1 = rg1;
						}
					}
					else {
						if (phr0 == null) {
							brg1 = rg1;
							brg2 = rg2;
						}
						phr2 = null;
					}
				}
			}
			if (phr1 != null && phr2 != null) {
				// A split with two matching halves exists.
				if (checkGoodPhrase) {
					// The good phrase is discarded when its score, scaled by NPMIfactor percent,
					// exceeds the score of the two-halves split.
					betterNPMI = bigrams.get(bgs[pos - 1]);
					if (goodNPMI * (float)doc.getPreferences().NPMIfactor / 100.0F > betterNPMI) phr0 = null;
				}
				else phr0 = null;
				if (phr0 == null) {
					// Use the split with two matching halves.
					if (rg1.left <= 0) leftContext = null;
					else leftContext = source.get(rg1.left - 1).lemma;
					if (rg1.right >= s - 1) rightContext = null;
					else rightContext = source.get(rg1.right + 1).lemma;
					collectTermCandidate(phr1, docID, sr, leftContext, rightContext);
					collect_NPMI3(tokenList, source, docID, sr, bgs, rg1);
					if (rg2.left <= 0) leftContext = null;
					else leftContext = source.get(rg2.left - 1).lemma;
					if (rg2.right >= s - 1) rightContext = null;
					else rightContext = source.get(rg2.right + 1).lemma;
					collectTermCandidate(phr2, docID, sr, leftContext, rightContext);
					collect_NPMI3(tokenList, source, docID, sr, bgs, rg2);
				}
				else {
					// Use the good phrase's split: register the good phrase, then recurse
					// into its range and into the other half of that split.
					if (brg0.left <= 0) leftContext = null;
					else leftContext = source.get(brg0.left - 1).lemma;
					if (brg0.right >= s - 1) rightContext = null;
					else rightContext = source.get(brg0.right + 1).lemma;
					collectTermCandidate(phr0, docID, sr, leftContext, rightContext);
					collect_NPMI3(tokenList, source, docID, sr, bgs, brg0);
					if (brg0 == brg2) collect_NPMI3(tokenList, source, docID, sr, bgs, brg1);
					else collect_NPMI3(tokenList, source, docID, sr, bgs, brg2);
				}
			}
			else if (phr0 != null) {
				// No split had two matching halves, but one half matched at some point.
				if (brg0.left <= 0) leftContext = null;
				else leftContext = source.get(brg0.left - 1).lemma;
				if (brg0.right >= s - 1) rightContext = null;
				else rightContext = source.get(brg0.right + 1).lemma;
				collectTermCandidate(phr0, docID, sr, leftContext, rightContext);
				collect_NPMI3(tokenList, source, docID, sr, bgs, brg0);
				if (brg0 == brg2) collect_NPMI3(tokenList, source, docID, sr, bgs, brg1);
				else collect_NPMI3(tokenList, source, docID, sr, bgs, brg2);
			}
			else {
				// Nothing matched: fall back to the first split tried.
				collect_NPMI3(tokenList, source, docID, sr, bgs, frg1);
				collect_NPMI3(tokenList, source, docID, sr, bgs, frg2);
			}
		}
	}
	
	/**
	 * Replaces every bigram count in {@code bigrams} with its normalised
	 * pointwise mutual information:
	 * {@code log(p(xy) / (p(x) * p(y))) / -log(p(xy))}, where {@code p(xy)} is
	 * the bigram count divided by {@code nbigrams} and {@code p(x)},
	 * {@code p(y)} are unigram counts divided by {@code ntok}.
	 * Stops early when the run is cancelled.
	 */
	public void calcNPMI()
	{
		for (Map.Entry<Bigram, Float> entry : bigrams.entrySet()) {
			if (cancelled) break;

			Bigram pair = entry.getKey();
			float pPair = entry.getValue() / nbigrams;
			float pFirst = (float)unigrams.get(pair.first) / (float)ntok;
			float pSecond = (float)unigrams.get(pair.second) / (float)ntok;
			float npmi = (float)(Math.log(pPair / (pFirst * pSecond)) / (-Math.log(pPair)));

			entry.setValue(npmi);
		}
	}
	
	/**
	 * Prepares all input files for processing.
	 * <p>
	 * When UD is used and the document has not fixed the determiner mode
	 * ({@code doc.acceptDET() == -1}), the mode is derived from the
	 * preferences; with {@code detHandling == 1} and {@code detectDeterminers}
	 * set, the decision is postponed until the determiner/noun ratio has been
	 * measured over all files. Each file is then passed to
	 * {@link #preprocessFile(File)} and replaced in {@code searchFiles} by the
	 * file that method returns. Processing stops at the first cancellation or
	 * error.
	 * <p>
	 * A {@code null} file array is treated as "no files", consistently with
	 * {@code processFiles()}.
	 */
	public void preprocessFiles()
	{
		Preferences prefs = doc.getPreferences();
		
		if (prefs.useUD && doc.acceptDET() == -1) {
			if (prefs.detHandling == 1) {
				if (prefs.detectDeterminers) checkDetRatio = true;
				else acceptDET = 1;
			}
			else if (prefs.detHandling == 2) acceptDET = 2;
			else acceptDET = 0;
		}
		// Guard against a missing file list instead of failing with a NullPointerException.
		if (searchFiles != null) {
			for (int i = 0; i < searchFiles.length; i++) {
				if (cancelled || error) break;
				searchFiles[i] = preprocessFile(searchFiles[i]);
			}
		}
		if (!error && checkDetRatio) {
			if (nouns > 0) {
				// Percentage of determiners relative to head tokens, rounded up.
				int r = (int)(Math.ceil(100.0F * (float)dets / (float)nouns));
				
				if (r > prefs.detRatio) acceptDET = 1;
				else acceptDET = 0;
			}
			else acceptDET = 0;
		}
	}
	
	/**
	 * Prepares a single input file.
	 * <p>
	 * If the reader does not recognise the file format, the file is run
	 * through the parser, whose output is expected next to the input under the
	 * same base name with the extension {@code .conllu}; an existing output
	 * file is reused when {@code reuseTaggedFiles} is set. When the
	 * determiner ratio is being measured, the tokens are scanned and the
	 * {@code dets} / {@code nouns} counters updated; a token that is not a
	 * {@code UDToken} sets the error flag and ends the scan.
	 * <p>
	 * NOTE(review): the scan opens the reader's current file, which is still
	 * {@code file} even when a {@code .conllu} file was just produced
	 * (processFile() switches the reader to the tagged file explicitly) --
	 * confirm this is intended.
	 *
	 * @param file input file
	 * @return {@code file} itself, or the {@code .conllu} file when the format was unknown
	 */
	public File preprocessFile(File file)
	{
		File result = file;

		corpusReader.setCurrentFile(file);
		corpusReader.checkFormat();

		int format = corpusReader.getFormat();

		if (format == CorpusReader.UNKNOWN_FORMAT) {
			String baseName = file.getName();
			int dot = baseName.lastIndexOf(".");

			if (dot > 0) baseName = baseName.substring(0, dot);

			String dir = file.getAbsoluteFile().getParent();

			result = new File(dir + File.separator + baseName + ".conllu");

			// Tagging is skipped only when reuse is enabled and the output already exists.
			boolean reuse = doc.getPreferences().reuseTaggedFiles && result.exists();

			if (!reuse) runParser(doc.getPreferences().language, file);
		}
		if (checkDetRatio) {
			reportPreprocessing(result.getName());
			corpusReader.openFile();
			for (;;) {
				Token t = corpusReader.getNextToken();

				if (t != null && !t.stop()) {
					if (!(t instanceof UDToken)) {
						error = true;
						break;
					}

					UDToken tok = (UDToken)t;

					if (tok.UDRel.equals("det")) dets++;
					if (headPOS(tok)) nouns++;
				}
				if (t == null || cancelled) break;
			}
			corpusReader.closeFile();
			doc.changeProgress(doc.getPreferences().useUD ? 2 : 1);
		}
		return result;
	}
	
	/**
	 * Runs the sentence parser on a file. The parser is only started when its
	 * construction reported no error. If the parser reports an error, or
	 * finishes without having parsed the file, the problem is reported and the
	 * document is asked to cancel. The {@code parser} field is cleared before
	 * returning.
	 *
	 * @param language not used by this method
	 * @param file     file to parse
	 */
	public void runParser(String language, File file)
	{
		parser = new SentenceParser(doc.getPreferences(), file.getAbsolutePath());

		reportTagging(file.getName());

		boolean constructedOk = (parser.getError() == null);

		if (constructedOk) parser.run();
		doc.changeProgress(doc.getPreferences().useUD ? 2 : 1);

		boolean failed = (parser.getError() != null);

		if (failed) parser.report();
		else if (!parser.isParsed()) {
			parser.report("Error occured while parsing.");
			failed = true;
		}
		if (failed) doc.cancel();
		parser = null;
	}
	
	/**
	 * Processes every input file in order, stopping as soon as the run is
	 * cancelled. {@code analyzedFiles} is reset first and stays {@code null}
	 * when there is no file array.
	 */
	public void processFiles()
	{
		analyzedFiles = null;
		if (searchFiles == null) return;

		analyzedFiles = new LinkedList<FileDescr>();

		File[] files = searchFiles;

		for (int i = 0; i < files.length && !cancelled; i++) {
			processFile(files[i]);
		}
	}
	
	/**
	 * Analyses one file. When the reader does not recognise the format, the
	 * sibling file with the same base name and the extension {@code .conllu}
	 * is read instead (as CoNLL-U); that file is deleted afterwards unless
	 * {@code reuseTaggedFiles} is set, and the reader is pointed back at the
	 * original file. The file is recorded in {@code analyzedFiles} with the
	 * originally detected format, and progress is reported before and after
	 * the analysis.
	 *
	 * @param file file to analyse
	 */
	public void processFile(File file)
	{
		File taggedFile = null;

		corpusReader.setCurrentFile(file);
		corpusReader.checkFormat();

		int format = corpusReader.getFormat();

		if (format == CorpusReader.UNKNOWN_FORMAT) {
			String baseName = file.getName();
			int dot = baseName.lastIndexOf(".");

			if (dot > 0) baseName = baseName.substring(0, dot);

			String dir = file.getAbsoluteFile().getParent();

			taggedFile = new File(dir + File.separator + baseName + ".conllu");
			corpusReader.setCurrentFile(taggedFile, CorpusReader.CONLLU_FORMAT);
		}

		// Progress before the analysis.
		if (doc.getPreferences().useUD) report(termMap.size(), 0.0F);
		else report(maxTerms.size());

		corpusReader.openFile();
		analyze();
		corpusReader.closeFile();

		if (taggedFile != null) {
			boolean keep = doc.getPreferences().reuseTaggedFiles;

			if (!keep) taggedFile.delete();
			corpusReader.setCurrentFile(file, format);
		}
		analyzedFiles.add(new FileDescr(file, format));
		fileID++;

		// Progress after the analysis.
		if (doc.getPreferences().useUD) report(corpusReader.getProcessedFileName(), termMap.size());
		else report(maxTerms.size());
	}
	
	/**
	 * Forwards a "tagging" progress notification for the given file name to the document.
	 *
	 * @param fName name of the file being tagged
	 */
	public void reportTagging(String fName)
	{
		doc.reportTagging(fName);
	}
	
	/**
	 * Forwards a "preprocessing" progress notification for the given file name to the document.
	 *
	 * @param fName name of the file being preprocessed
	 */
	public void reportPreprocessing(String fName)
	{
		doc.reportPreprocessing(fName);
	}
	
	/**
	 * Reports a count to the document together with the name of the file the
	 * corpus reader is currently processing.
	 *
	 * @param count value to report
	 */
	public void report(int count)
	{
		doc.report(corpusReader.getProcessedFileName(), count);
	}
	
	/**
	 * Reports a count for an explicitly named file to the document.
	 *
	 * @param fName file name to report
	 * @param count value to report
	 */
	public void report(String fName, int count)
	{
		doc.report(fName, count);
	}
	
	/**
	 * Reports a count and a progress value to the document.
	 *
	 * @param count    value to report
	 * @param progress progress value (callers in this class pass values between 0.0 and 1.0)
	 */
	public void report(int count, float progress)
	{
		doc.report(count,  progress);
	}
	
	/**
	 * Forwards a three-value progress report to the document unchanged.
	 *
	 * @param processed first value passed to the document
	 * @param max       second value passed to the document
	 * @param value     third value passed to the document
	 */
	public void report(int processed, int max, float value)
	{
		doc.report(processed, max, value);
	}
	
	/**
	 * Reads the currently opened file token by token and searches each
	 * sentence for term candidates.
	 * <p>
	 * Tokens are accumulated until a token for which {@code stop()} is true
	 * arrives; the accumulated sentence is then passed to the UD or the
	 * template-based search (with file/position arguments when an index is
	 * being built). When the NPMI method is enabled, unigram and bigram
	 * counts are updated for every non-stop token. Progress is reported every
	 * 1000 loop iterations and a garbage collection is requested every 25000
	 * reports. A sentence still pending at end of input is searched as well,
	 * unless the run was cancelled.
	 */
	public void analyze()
	{
		Token t;
		// Tokens of the sentence being accumulated; null between sentences.
		LinkedList<Token> tokenList = null;
		int count1 = 0, count2 = 0;
		Preferences prefs = doc.getPreferences();
		
		do {
			t = corpusReader.getNextToken();
			if (t != null) {
				if (t.stop()) {
					// Sentence boundary: search the accumulated sentence, if any.
					if (tokenList != null) {
						nsent++;
						if (prefs.makeIndex)	{
							if (prefs.useUD) searchUD(tokenList, fileID, corpusReader.getSentenceStart(), corpusReader.getSentenceLength());
							else search(tokenList, fileID, corpusReader.getSentenceStart(), corpusReader.getSentenceLength());
						}
						else {
							if (prefs.useUD) searchUD(tokenList);
							else search(tokenList);
						}
						tokenList = null;
						corpusReader.initMultiWordTokens();
					}
				}
				else {
					if (prefs.useNPMIMethod) {
						// Count the lemma ...
						Integer c = unigrams.get(t.lemma);
						
						if (c == null) unigrams.put(t.lemma, 1);
						else unigrams.put(t.lemma, c + 1);
						
						Token previous = corpusReader.getPreviousToken();
						
						// ... and the bigram formed with the token the reader reports as previous.
						if (previous != null) {
							Bigram bigram = new Bigram(previous.lemma, t.lemma);
							Float f = bigrams.get(bigram);
							
							if (f == null) bigrams.put(bigram, 1.0F);
							else bigrams.put(bigram, f + 1.0F);
							nbigrams++;
						}
					}
					if (tokenList == null) tokenList = new LinkedList<Token>();
					tokenList.add(t);
					ntok++;
				}
			}
			// Periodic progress report and occasional GC request.
			if (++count1 >= 1000) {
				count1 = 0;
				if (++count2 >= 25000) {
					count2 = 0;
					System.gc();
				}
				if (prefs.useUD) report(corpusReader.getProcessedFileName(), termMap.size());
				else report(maxTerms.size());
			}
		} while (t != null && !cancelled);
		// Input ended without a closing stop token: search the pending sentence.
		if (!cancelled && tokenList != null) {
			nsent++; 
			if (prefs.makeIndex) {
				if (prefs.useUD) searchUD(tokenList, fileID, corpusReader.getSentenceStart(), corpusReader.getSentenceLength());
				else search(tokenList, fileID, corpusReader.getSentenceStart(), corpusReader.getSentenceLength());
			}
			else {
				if (prefs.useUD) searchUD(tokenList);
				else search(tokenList);
			}
			if (prefs.useUD) report(corpusReader.getProcessedFileName(), termMap.size());
			else report(maxTerms.size());
		}
	}
	
	/**
	 * Tells whether the lemma of a token is on the stop-word list.
	 *
	 * @param t token to check
	 * @return {@code true} if a stop-word list exists and contains the token's lemma
	 */
	public boolean isStopWord(Token t)
	{
		if (stopWords == null) return false;
		return stopWords.contains(t.lemma);
	}
	
	/**
	 * Tells whether a phrase spans a whole multi-word token, i.e. whether for
	 * some entry of {@code mwtList} the phrase contains a token whose index
	 * equals the entry's start token and, later in the phrase, a token whose
	 * index equals its end token.
	 *
	 * @param phrase  tokens of the phrase, in order
	 * @param mwtList multi-word tokens of the sentence; may be {@code null}
	 * @return {@code true} if such a multi-word token exists
	 */
	public boolean containsMWT(LinkedList<UDToken> phrase, LinkedList<MultiWordToken> mwtList)
	{
		if (mwtList == null) return false;

		for (MultiWordToken mwt : mwtList) {
			boolean startSeen = false;
			Iterator<UDToken> it = phrase.iterator();

			while (it.hasNext()) {
				UDToken tok = it.next();

				// The end is only accepted on a token after the one matching the start.
				if (startSeen && tok.index == mwt.endToken()) return true;
				if (tok.index == mwt.startToken()) startSeen = true;
			}
		}
		return false;
	}
	
	/**
	 * Template-based search without indexing: filters the sentence when stop
	 * words or compound prepositions are configured, then appends every
	 * fragment found by the matcher to {@code maxTerms}. The reader's current
	 * document number is attached when the document uses document IDs.
	 *
	 * @param tokenList tokens of one sentence
	 */
	public void search(LinkedList<Token> tokenList)
	{
		boolean needsFiltering = !(stopWords == null && compPreps == null);

		if (needsFiltering) filterTokens(tokenList);
		matcher.setTokens(tokenList);
		for (boolean more = matcher.find(); more; more = matcher.find()) {
			MatchedFragment fragment = matcher.getMatchedFragment();

			if (doc.useDocID()) fragment.setDocID(corpusReader.getDocNumber());
			maxTerms.add(fragment);
		}
	}
	
	/**
	 * Template-based search with indexing: like the single-argument variant,
	 * but every fragment additionally gets a sentence reference built from the
	 * given position. A plain {@code SentenceRef} is used for file 0, a
	 * {@code SentenceRefEx} carrying the file index otherwise.
	 *
	 * @param tokenList tokens of one sentence
	 * @param fid       index of the file the sentence comes from
	 * @param start     start position of the sentence as reported by the reader
	 * @param len       length of the sentence as reported by the reader
	 */
	public void search(LinkedList<Token> tokenList, int fid, long start, int len)
	{
		boolean needsFiltering = !(stopWords == null && compPreps == null);

		if (needsFiltering) filterTokens(tokenList);
		matcher.setTokens(tokenList);
		for (boolean more = matcher.find(); more; more = matcher.find()) {
			MatchedFragment fragment = matcher.getMatchedFragment();
			SentenceRef ref = (fid == 0) ? new SentenceRef(start, len, 0) : new SentenceRefEx(fid, start, len, 0);

			fragment.setRef(ref);
			if (doc.useDocID()) fragment.setDocID(corpusReader.getDocNumber());
			maxTerms.add(fragment);
		}
	}
	
	/**
	 * Extracts term candidates from one UD-annotated sentence and registers them, 
	 * without recording sentence references.
	 * 
	 * @param tokenList tokens of the sentence; the elements are cast to UDToken downstream
	 */
	public void searchUD(LinkedList<Token> tokenList)
	{
		processUDSentence(tokenList, false, 0, 0L, 0);
	}
	
	/**
	 * Extracts term candidates from one UD-annotated sentence and registers them.
	 * When the makeIndex preference is set, every registered occurrence also gets 
	 * a sentence reference built from fid, start, len and the index of the first phrase token.
	 * 
	 * @param tokenList tokens of the sentence; the elements are cast to UDToken downstream
	 * @param fid file identifier; 0 yields a plain SentenceRef, any other value a SentenceRefEx
	 * @param start value passed on to the sentence reference as its start
	 * @param len value passed on to the sentence reference as its length
	 */
	public void searchUD(LinkedList<Token> tokenList, int fid, long start, int len)
	{
		processUDSentence(tokenList, true, fid, start, len);
	}
	
	/**
	 * Shared implementation of both searchUD variants: filters the tokens, builds the 
	 * dependency structure, selects the candidate phrases and records each of them as 
	 * a term occurrence (context and form).
	 * 
	 * @param tokenList tokens of the sentence
	 * @param withRefs whether sentence references may be attached (still subject to prefs.makeIndex)
	 * @param fid file identifier used for the sentence reference; ignored when withRefs is false
	 * @param start start value used for the sentence reference; ignored when withRefs is false
	 * @param len length value used for the sentence reference; ignored when withRefs is false
	 */
	private void processUDSentence(LinkedList<Token> tokenList, boolean withRefs, int fid, long start, int len)
	{
		if (stopWords != null || compPreps != null || doc.acceptDET() >= 0) filterTokens(tokenList);
		
		LinkedList<UDToken> obligatoryNodes = new LinkedList<UDToken>();
		LinkedList<UDToken> termNodes = getTermTokens(tokenList, obligatoryNodes);
		UDStructure struct = createUDStructure(tokenList, termNodes, obligatoryNodes);
		LinkedList<PhraseWithContext> accepted = selectTermCandidates(struct, termNodes, obligatoryNodes, tokenList);
		LinkedList<MultiWordToken> mwtList = corpusReader.getMultiWordTokens();
		
		if (accepted == null) return;
		
		Preferences prefs = doc.getPreferences();
		
		for (PhraseWithContext cntxph : accepted) {
			LinkedList<UDToken> phrase = cntxph.phrase();
			String context = cntxph.context();
			LinkedList<Token> ph = null;
			boolean continuous = isContinuous(phrase);
			
			// When the phrase covers a multi-word token, key and form are built from the replaced token list.
			if (containsMWT(phrase, mwtList)) ph = CorpusReader.replaceMWT(phrase, mwtList);
			
			KeyLen keylen = (ph == null) ? getKeyFromTokens(phrase) : getKeyFromTokens(ph);
			
			if (prefs.ignoreCase) keylen.key = keylen.key.toLowerCase();
			if (commonTerms != null && commonTerms.contains(keylen.key)) continue;
			
			Term term = obtainUDTerm(keylen, continuous, prefs.makeGroups);
			boolean inner = context != null;
			
			if (withRefs && prefs.makeIndex) {
				int position = phrase.getFirst().index;
				SentenceRef ref;
				
				if (fid == 0) ref = new SentenceRef(start, len, position);
				else ref = new SentenceRefEx(fid, start, len, position);
				term.addSentenceRef(ref);
			}
			term.addContext(context, null);
			if (ph == null) term.addForm(Form.createFromTokens(phrase), corpusReader.getDocNumber(), inner, prefs.collectAllForms);
			else term.addForm(Form.createFromTokens(ph), corpusReader.getDocNumber(), inner, prefs.collectAllForms);
		}
	}
	
	/**
	 * Returns the term registered under the given key, creating and registering it when missing.
	 * 
	 * With allowDiscontinuities set, all terms live in termMap. Otherwise a term met only in 
	 * discontinuous occurrences is kept in waitingMap, and it is moved to termMap by its first 
	 * continuous occurrence.
	 * 
	 * @param keylen key and length of the term
	 * @param continuous whether the current occurrence is continuous
	 * @param makeGroups whether a new term is created as TermEx rather than Term
	 * @return the existing or newly created term, never null
	 */
	private Term obtainUDTerm(KeyLen keylen, boolean continuous, boolean makeGroups)
	{
		String key = keylen.key;
		Term term;
		
		if (!allowDiscontinuities && continuous) {
			// A continuous occurrence promotes a term that has been waiting so far.
			term = waitingMap.remove(key);
			if (term != null) {
				termMap.put(key, term);
				return term;
			}
		}
		term = termMap.get(key);
		if (term != null) return term;
		
		boolean waiting = !allowDiscontinuities && !continuous;
		
		if (waiting) {
			term = waitingMap.get(key);
			if (term != null) return term;
		}
		if (makeGroups) term = new TermEx(key, keylen.len);
		else term = new Term(key, keylen.len);
		if (waiting) waitingMap.put(key, term);
		else termMap.put(key, term);
		return term;
	}
	
/*	public void writeAnnotatedSentence(PrintWriter pw, LinkedList<Token> sentence, LinkedList<MultiWordToken> mwtList)
	{
		if (mwtList != null) sentence = CorpusReader.replaceMWT(sentence, mwtList);
		for (Token t : sentence) {
			pw.print(t.form);
			pw.print(" ");
		}
		pw.println();
	}
	
	public void writeTerm(PrintWriter pw, String term, LinkedList<? extends Token> phrase, LinkedList<Token> sentence, LinkedList<MultiWordToken> mwtList)
	{	
		if (mwtList != null) {
			sentence = CorpusReader.replaceMWT(sentence, mwtList);
			phrase = CorpusReader.replaceMWT(phrase, mwtList);
		}
		pw.print(term);
		pw.print("\t");
		for (Token tph : phrase) {
			int i = 0;
			
			for (Token ts : sentence) {
				if (tph == ts) {
					pw.print(i);
					pw.print("\t");
					break;
				}
				i++;
			}
		}
		pw.println();
	}*/
	
	/**
	 * Collects the tokens of a sentence that may take part in a term.
	 * 
	 * A token is collected when headPOS or nonHeadPOS accepts it, or when it is a non-skipped 
	 * PRON whose tag contains "reflex=yes". In addition, tokens glued together with a hyphen 
	 * (no space on either side of "-") form a group; when a token followed by a space is reached, 
	 * all members of the pending group are collected and those whose UDLink falls within 
	 * the index range of the group are appended to obligatoryNodes.
	 * 
	 * @param tokenList tokens of the sentence; every element is cast to UDToken
	 * @param obligatoryNodes output list receiving the group members linked inside their group
	 * @return the collected tokens
	 */
	public LinkedList<UDToken> getTermTokens(LinkedList<Token> tokenList, LinkedList<UDToken> obligatoryNodes)
	{
		LinkedList<UDToken> termNodes = new LinkedList<UDToken>();
		LinkedList<UDToken> hyphGroup = new LinkedList<UDToken>();
		UDToken previous = null;
		Iterator<Token> tokens = tokenList.iterator();
		
		while (tokens.hasNext()) {
			UDToken current = (UDToken)tokens.next();
			
			if (headPOS(current) || nonHeadPOS(current)) {
				termNodes.add(current);
				if (previous != null && previous.form.equals("-") && !previous.spaceAfter) hyphGroup.add(current);
			}
			else if (!current.skip() && current.UDPos.equals("PRON") && current.UDTag.contains("reflex=yes")) {
				termNodes.add(current);
			}
			else if (current.form.equals("-")) {
				// The hyphen joins its neighbours only when there is no space on either side of it.
				if (previous != null && !(previous.spaceAfter || current.spaceAfter)) {
					hyphGroup.add(previous);
					hyphGroup.add(current);
				}
			}
			else if (previous != null && previous.form.equals("-") && !previous.spaceAfter) {
				hyphGroup.add(current);
			}
			
			// A space closes the pending hyphenated group.
			if (current.spaceAfter && !hyphGroup.isEmpty()) {
				int from = hyphGroup.getFirst().index;
				int to = hyphGroup.getLast().index;
				
				for (UDToken member : hyphGroup) {
					if (!termNodes.contains(member)) termNodes.add(member);
					if (from <= member.UDLink && member.UDLink <= to) obligatoryNodes.add(member);
				}
				hyphGroup.clear();
			}
			previous = current;
		}
		return termNodes;
	}
	
	/**
	 * Decides whether a token passes the determiner filter selected by the acceptDET field.
	 * 
	 * Tokens whose UDPos is not "DET" always pass. For determiners: mode 1 accepts all of them, 
	 * mode 2 accepts those whose tag contains "definite=", any other mode rejects them.
	 * 
	 * @param n token to test
	 * @return true if the token is accepted
	 */
	public boolean acceptDET(UDToken n)
	{
		if (!n.UDPos.equals("DET")) return true;
		if (acceptDET == 1) return true;
		if (acceptDET == 2) return n.UDTag.contains("definite=");
		return false;
	}
	
	/**
	 * Builds the structure that maps term nodes to their dependents and prunes termNodes.
	 * 
	 * First, a token is attached to the token standing at the (1-based) position given by its 
	 * UDLink, provided that this governor is a term node and the dependent has an obligatory 
	 * or facultative relation or belongs to obligatoryNodes.
	 * Then, repeatedly until nothing changes: a term node that has an obligatory dependent 
	 * outside termNodes is removed from termNodes, while non-obligatory dependents outside 
	 * termNodes are dropped from the dependents list.
	 * 
	 * @param tokenList tokens of the sentence; every element is cast to UDToken
	 * @param termNodes candidate term nodes; modified in place by the pruning step
	 * @param obligatoryNodes nodes treated as obligatory regardless of their relation
	 * @return the structure holding the dependents of each governor
	 */
	public UDStructure createUDStructure(LinkedList<Token> tokenList, LinkedList<UDToken> termNodes, LinkedList<UDToken> obligatoryNodes)
	{
		UDStructure struct = new UDStructure();
		
		for (Token dependentToken : tokenList) {
			UDToken dependent = (UDToken)dependentToken;
			int position = 0;
			
			for (Token governorToken : tokenList) {
				UDToken governor = (UDToken)governorToken;
				
				position++;
				if (governor == dependent || dependent.UDLink != position) continue;
				if (!termNodes.contains(governor)) continue;
				if (obligatoryRel(dependent) || facultativeRel(dependent) || obligatoryNodes.contains(dependent)) {
					struct.add(governor, dependent);
				}
			}
		}
		
		boolean changed;
		
		do {
			LinkedList<UDToken> incomplete = new LinkedList<UDToken>();
			
			changed = false;
			for (UDToken node : termNodes) {
				LinkedList<UDToken> dependents = struct.getRelatedNodes(node);
				
				if (dependents == null) continue;
				
				Iterator<UDToken> it = dependents.iterator();
				
				while (it.hasNext()) {
					UDToken dependent = it.next();
					boolean required = obligatoryRel(dependent) || obligatoryNodes.contains(dependent);
					
					if (termNodes.contains(dependent)) continue;
					if (required) {
						// The governor cannot stand without this dependent, so it has to go as well.
						incomplete.add(node);
						changed = true;
					}
					else it.remove();
				}
			}
			// Removal is postponed to keep termNodes intact while it is being iterated.
			for (UDToken node : incomplete) termNodes.remove(node);
		} while (changed);
		return struct;
	}
	
	/**
	 * Builds the candidate phrases of a sentence from the dependency structure.
	 * 
	 * Nodes are processed bottom-up: a node is handled once struct holds no related nodes 
	 * for it. Its phrases (the node alone and the combinations with the phrases of its 
	 * already handled dependents) are registered for the node it is related to, and they 
	 * are accepted when the node passes headPOS. Afterwards the accepted phrases are sorted 
	 * and trimmed, maximal phrases that do not match any accepted phrase are cleared, 
	 * and the result goes through filterNER.
	 * 
	 * @param struct dependents of each node; its lists are modified in place
	 * @param termNodes nodes to process; emptied by this method
	 * @param obligatoryNodes nodes treated as obligatory when composing and trimming phrases
	 * @param tokenList not used by this method
	 * @return the accepted phrases after NER filtering
	 */
	public LinkedList<PhraseWithContext> 
		selectTermCandidates(UDStructure struct, LinkedList<UDToken> termNodes, LinkedList<UDToken> obligatoryNodes, LinkedList<Token> tokenList)
	{
		SubphrasesIndex phIndex = new SubphrasesIndex();
		LinkedList<PhraseWithContext> acceptedPhrases = new LinkedList<PhraseWithContext>();
		ListIterator<UDToken> it;
		
		// NOTE(review): every pass must find at least one node without related nodes, 
		// otherwise this loop does not terminate (e.g. cyclic links) - confirm the input guarantees it.
		while (!termNodes.isEmpty()) {
			it = termNodes.listIterator();
			while (it.hasNext()) {
				UDToken t = it.next();
				LinkedList<UDToken> list = struct.getRelatedNodes(t);
				
				// Only nodes whose dependents have all been handled (or that never had any) are processed.
				if (list == null || list.isEmpty()) {
					LinkedList<Pair<UDToken, LinkedList<PhraseWithContext>>> sub = phIndex.get(t);
					LinkedList<UDToken> ph = new LinkedList<UDToken>();
					LinkedList<UDToken> maxph = new LinkedList<UDToken>();
					LinkedList<PhraseWithContext> allPhrases = new LinkedList<PhraseWithContext>();
					
					// Start from the single-token phrase made of the node itself.
					ph.add(t);
					maxph.add(t);
					allPhrases.add(new PhraseWithContext(ph, maxph));
					if (sub != null) {
						adjustMaxTermLength(sub);
						
						ListIterator<Pair<UDToken, LinkedList<PhraseWithContext>>> subIt = sub.listIterator();
						
						allPhrases = composePhrases(subIt, allPhrases, maxph, obligatoryNodes);
					}
					// Hand the phrases over to the first node that lists t as related and detach t from it.
					for (UDToken ut : termNodes) {
						LinkedList<UDToken> related = struct.getRelatedNodes(ut);
						if (related != null && related.contains(t)) {
							phIndex.add(ut,  new Pair<UDToken, LinkedList<PhraseWithContext>>(t, allPhrases));
							related.remove(t);
							break;
						}
					}
					for (PhraseWithContext cntxph : allPhrases) cntxph.setMaxPhrase(maxph);
					if (headPOS(t)) acceptedPhrases.addAll(allPhrases);
					it.remove();
				}
			}
		}
		// Put the tokens of every accepted phrase in their natural order and trim them (see trimPhrase).
		for (PhraseWithContext cntxph : acceptedPhrases) {
			LinkedList<UDToken> phrase = cntxph.phrase();
			
			Collections.sort(phrase);
			trimPhrase(phrase, obligatoryNodes);
			// The list was modified in place, so this stores the same object again.
			cntxph.setPhrase(phrase);
		}
		
		LinkedList<LinkedList<UDToken>> allMaxPhrases = new LinkedList<LinkedList<UDToken>>();
		
		// Each distinct maximal phrase is trimmed once and looked up among the accepted phrases;
		// when it is not found, the maximal phrase of the inspected phrase is cleared.
		for (PhraseWithContext cntxph1 : acceptedPhrases) {
			LinkedList<UDToken> maxPhrase = cntxph1.maxPhrase();
			if (maxPhrase != null && !allMaxPhrases.contains(maxPhrase)) {
				// NOTE(review): the size is taken before trimPhrase may shorten maxPhrase, 
				// while the phrases compared below are already trimmed - confirm this is intended.
				int len1 = maxPhrase.size();
				boolean found1 = false;
				
				trimPhrase(maxPhrase, obligatoryNodes);
				allMaxPhrases.add(maxPhrase);
				for (PhraseWithContext cntxph2 : acceptedPhrases) {
					LinkedList<UDToken> phrase = cntxph2.phrase();
					int len2 = phrase.size();
					
					if (len1 == len2) {
						ListIterator<UDToken> it1 = maxPhrase.listIterator();
						ListIterator<UDToken> it2 = phrase.listIterator();
						boolean found2 = true;
						
						// Token-by-token identity comparison, driven by the maximal phrase.
						while (it1.hasNext()) {
							if (it1.next() != it2.next()) {
								found2 = false;
								break;
							}
						}
						if (found2) {
							found1 = true;
							break;
						}
					}
				}
				if (!found1) cntxph1.setMaxPhrase(null);
			}
		}
		return filterNER(acceptedPhrases);
	}
	
	/**
	 * Sets maxTermLength according to the number of facultative dependents: 
	 * 2 when there are at least five of them, MAX_TERM_LEN otherwise.
	 * 
	 * @param sub dependents of a node paired with their phrases
	 */
	public void adjustMaxTermLength(LinkedList<Pair<UDToken, LinkedList<PhraseWithContext>>> sub)
	{
		int facultative = 0;
		Iterator<Pair<UDToken, LinkedList<PhraseWithContext>>> entries = sub.iterator();
		
		while (entries.hasNext()) {
			if (facultativeRel(entries.next().first)) facultative++;
		}
		maxTermLength = (facultative >= 5) ? 2 : MAX_TERM_LEN;
	}
	
	/**
	 * Removes the phrases that do not respect named-entity boundaries.
	 * 
	 * @param phrases phrases to filter; modified in place
	 * @return the same list, without the rejected phrases
	 */
	public LinkedList<PhraseWithContext> filterNER(LinkedList<PhraseWithContext> phrases)
	{
		Iterator<PhraseWithContext> candidates = phrases.iterator();
		
		while (candidates.hasNext()) {
			if (breaksNamedEntity(candidates.next().phrase())) candidates.remove();
		}
		return phrases;
	}
	
	/**
	 * Checks the ner marks ('B', 'I', 'E', anything else) of consecutive phrase tokens.
	 * 
	 * A phrase is rejected when 'B' occurs inside an already opened entity, when 'I' or 'E' 
	 * occurs with no entity opened or more than one index position after the previous 
	 * entity token, when an unmarked token occurs inside an opened entity, or when 
	 * an entity is still opened at the end of the phrase.
	 * 
	 * @param phrase tokens of the phrase
	 * @return true if the phrase has to be rejected
	 */
	private boolean breaksNamedEntity(LinkedList<UDToken> phrase)
	{
		boolean opened = false;
		int last = 0;
		
		for (UDToken token : phrase) {
			if (token.ner == 'B') {
				if (opened) return true;
				opened = true;
				last = token.index;
			}
			else if (token.ner == 'I') {
				if (!opened || token.index > last + 1) return true;
				last = token.index;
			}
			else if (token.ner == 'E') {
				if (!opened || token.index > last + 1) return true;
				opened = false;
			}
			else if (opened) return true;
		}
		return opened;
	}
	
	/**
	 * Recursively combines the phrases built so far with the phrases of each subsequent dependent.
	 * 
	 * For the next dependent delivered by subIt, every phrase is joined with every phrase 
	 * of the dependent, as long as the joint length does not exceed maxTermLength. When the 
	 * dependent is facultative and not in obligatoryNodes, the phrase without the dependent 
	 * is kept as well. Tokens of all dependent phrases are accumulated in maxph. Once subIt 
	 * is exhausted, maxph is sorted and set as the maximal phrase of all resulting phrases.
	 * 
	 * @param subIt iterator over the dependents not yet combined
	 * @param allPhrases phrases built so far
	 * @param maxph accumulated maximal phrase; modified in place
	 * @param obligatoryNodes nodes that may not be left out
	 * @return the resulting phrases; allPhrases itself when the next dependent has no phrase list
	 */
	LinkedList<PhraseWithContext> composePhrases(ListIterator<Pair<UDToken, LinkedList<PhraseWithContext>>> subIt, 
		LinkedList<PhraseWithContext> allPhrases, LinkedList<UDToken> maxph, LinkedList<UDToken> obligatoryNodes)
	{
		if (!subIt.hasNext()) {
			Collections.sort(maxph);
			for (PhraseWithContext complete : allPhrases) complete.setMaxPhrase(maxph);
			return allPhrases;
		}
		
		Pair<UDToken, LinkedList<PhraseWithContext>> dependent = subIt.next();
		LinkedList<PhraseWithContext> variants = dependent.second;
		
		if (variants == null) return allPhrases;
		
		LinkedList<PhraseWithContext> extended = new LinkedList<PhraseWithContext>();
		
		for (PhraseWithContext base : allPhrases) {
			LinkedList<UDToken> baseTokens = base.phrase();
			
			for (PhraseWithContext variant : variants) {
				LinkedList<UDToken> variantTokens = variant.phrase();
				
				if (baseTokens.size() + variantTokens.size() <= maxTermLength) {
					LinkedList<UDToken> joined = new LinkedList<UDToken>(baseTokens);
					LinkedList<PhraseWithContext> parts = (base.subPhrases() == null) 
						? new LinkedList<PhraseWithContext>() 
						: new LinkedList<PhraseWithContext>(base.subPhrases());
					
					joined.addAll(variantTokens);
					parts.add(variant);
					for (UDToken token : variantTokens) {
						if (!maxph.contains(token)) maxph.add(token);
					}
					extended.add(new PhraseWithContext(joined, maxph, parts));
				}
				if (!obligatoryNodes.contains(dependent.first) && facultativeRel(dependent.first)) {
					// The phrase without the optional dependent is kept, but only once.
					boolean present = false;
					Iterator<PhraseWithContext> kept = extended.iterator();
					
					while (!present && kept.hasNext()) {
						present = kept.next().phrase() == baseTokens;
					}
					if (!present) extended.add(base);
				}
			}
		}
		return composePhrases(subIt, extended, maxph, obligatoryNodes);
	}
	
	
	/**
	 * Removes determiners and head-attached adpositions from the phrase, in place.
	 * 
	 * The phrase is walked from its beginning. The walk ends at the first token that belongs 
	 * to obligatoryNodes, or at the first ADP whose governor within the phrase is not the head 
	 * of the phrase. On the way every DET is removed, and so is every ADP whose governor within 
	 * the phrase is the head of the phrase (both looked up in the current state of the phrase); 
	 * tokens with any other POS are left in place and the walk goes on.
	 * 
	 * @param phrase phrase to trim; modified in place
	 * @param obligatoryNodes tokens that may not be removed
	 */
	public void trimPhrase(LinkedList<UDToken> phrase, LinkedList<UDToken> obligatoryNodes)
	{
		Iterator<UDToken> walker = phrase.iterator();
		
		while (walker.hasNext()) {
			UDToken token = walker.next();
			
			if (obligatoryNodes.contains(token)) break;
			if (token.UDPos.equals("DET")) {
				walker.remove();
				continue;
			}
			if (token.UDPos.equals("ADP")) {
				UDToken governor = getHeadOf(token, phrase);
				UDToken phraseHead = getHeadOf(phrase);
				
				if (governor != phraseHead) break;
				walker.remove();
			}
		}
	}
	
	/**
	 * Tells whether the phrase has no gaps, i.e. no token whose index exceeds the index 
	 * of the preceding token by more than one. The check against the preceding token 
	 * is made only when that preceding index is positive.
	 * 
	 * @param phrase tokens of the phrase
	 * @return true if no gap has been found
	 */
	public boolean isContinuous(LinkedList<UDToken> phrase)
	{
		int last = 0;
		
		for (UDToken token : phrase) {
			if (last > 0 && token.index > last + 1) return false;
			last = token.index;
		}
		return true;
	}
	
	/**
	 * Finds the head of a phrase: the first token whose UDLink points outside the range 
	 * delimited by the indices of the first and the last token of the phrase.
	 * 
	 * @param phrase tokens of the phrase; must not be empty
	 * @return the head token, or null when every token is linked inside the range
	 * @throws NoSuchElementException if the phrase is empty
	 */
	public static UDToken getHeadOf(LinkedList<UDToken> phrase)
	{
		int from = phrase.getFirst().index;
		int to = phrase.getLast().index;
		Iterator<UDToken> it = phrase.iterator();
		
		while (it.hasNext()) {
			UDToken candidate = it.next();
			boolean linkedInside = from <= candidate.UDLink && candidate.UDLink <= to;
			
			if (!linkedInside) return candidate;
		}
		return null;
	}
	
	/**
	 * Finds the governor of a token within a phrase: the first phrase token whose index 
	 * equals the UDLink of the given token.
	 * 
	 * @param t token whose governor is looked for
	 * @param phrase tokens to search
	 * @return the governor, or null when it is not in the phrase
	 */
	public static UDToken getHeadOf(UDToken t, LinkedList<UDToken> phrase)
	{
		int governorIndex = t.UDLink;
		Iterator<UDToken> it = phrase.iterator();
		
		while (it.hasNext()) {
			UDToken candidate = it.next();
			
			if (candidate.index == governorIndex) return candidate;
		}
		return null;
	}

	/**
	 * Tells whether a token may be the head of a term: it is not skipped and it is either 
	 * a VERB whose tag contains "verbform=ger" or its POS is listed in HEAD_POS.
	 * 
	 * @param t token to test
	 * @return true if the token qualifies as a head
	 */
	public static boolean headPOS(UDToken t)
	{
		if (t.skip()) return false;
		
		boolean gerund = t.UDPos.equals("VERB") && t.UDTag.contains("verbform=ger");
		
		return gerund || onTheList(t.UDPos, HEAD_POS);
	}
	
	/**
	 * Tells whether a token is not skipped and its POS is listed in NON_HEAD_POS.
	 * 
	 * @param t token to test
	 * @return true if the token qualifies as a non-head term element
	 */
	public static boolean nonHeadPOS(UDToken t)
	{
		return !t.skip() && onTheList(t.UDPos, NON_HEAD_POS);
	}
	
	/**
	 * Tells whether a token is not skipped and its relation is listed in OBLIGATORY_REL.
	 * 
	 * @param t token to test
	 * @return true if the relation of the token is obligatory
	 */
	public static boolean obligatoryRel(UDToken t)
	{
		return !t.skip() && onTheList(t.UDRel, OBLIGATORY_REL);
	}
	
	/**
	 * Tells whether a token is not skipped and its relation is listed in FACULTATIVE_REL.
	 * 
	 * @param t token to test
	 * @return true if the relation of the token is facultative
	 */
	public static boolean facultativeRel(UDToken t)
	{
		return !t.skip() && onTheList(t.UDRel, FACULTATIVE_REL);
	}
	
	/**
	 * Tells whether the relation of a token is listed in HEAD_PHRASE_REL. 
	 * The skip state of the token is not consulted here.
	 * 
	 * @param t token to test
	 * @return true if the relation is on the list
	 */
	public static boolean headPhraseRel(UDToken t)
	{
		String relation = t.UDRel;
		
		return onTheList(relation, HEAD_PHRASE_REL);
	}
	
	/**
	 * Tells whether a string equals one of the elements of an array.
	 * 
	 * @param s string to look for; null is never found
	 * @param list elements to compare with
	 * @return true if some element equals s
	 */
	public static boolean onTheList(String s, String[] list)
	{
		for (int i = 0; i < list.length; i++) {
			if (list[i].equals(s)) return true;
		}
		return false;
	}
	
	/**
	 * Marks the tokens that are to be skipped during term search.
	 * 
	 * Three independent steps are applied, each only when configured: tokens matched by 
	 * the compound-preposition matcher, tokens recognized as stop words, and (in UD mode, 
	 * when the acceptDET field is not negative) tokens rejected by acceptDET.
	 * 
	 * @param tokenList tokens of the sentence; in UD mode every element is cast to UDToken
	 */
	public void filterTokens(LinkedList<Token> tokenList)
	{
		if (compPreps != null) {
			compprepMatcher.setTokens(tokenList);
			while (compprepMatcher.find()) {
				for (MatchedToken matched : compprepMatcher.getMatchedFragment().getMatchedTokens()) {
					matched.token.markAsSkipToken();
				}
			}
		}
		if (stopWords != null) {
			Iterator<Token> it = tokenList.iterator();
			
			while (it.hasNext()) {
				Token token = it.next();
				
				if (isStopWord(token)) token.markAsSkipToken();
			}
		}
		
		boolean filterDeterminers = doc.getPreferences().useUD && acceptDET >= 0;
		
		if (filterDeterminers) {
			for (Token token : tokenList) {
				if (!acceptDET((UDToken)token)) token.markAsSkipToken();
			}
		}
	}
	
	/**
	 * Requests cancellation: forwards the request to the parser, if there is one, 
	 * and raises the cancelled flag.
	 */
	public void cancel()
	{
		if (parser != null) {
			parser.cancel();
		}
		this.cancelled = true;
	}
	
	/**
	 * @return true once cancel() has been called
	 */
	public boolean isCancelled()
	{
		return this.cancelled;
	}
	
	/**
	 * @return the terms array held by this engine (the array itself, not a copy); may be null
	 */
	public Term[] getTerms()
	{
		return this.terms;
	}
	
	/**
	 * @return the map of terms by key held by this engine (the map itself, not a copy)
	 */
	public HashMap<String, Term> getTermMap()
	{
		return this.termMap;
	}
	
	/**
	 * @return the current value of the sentence counter
	 */
	public int getNumberOfSentences()
	{
		return this.nsent;
	}
	
	/**
	 * @return the current value of the token counter
	 */
	public int getNumberOfTokens()
	{
		return this.ntok;
	}
	
	/**
	 * @return the length of the terms array, or 0 when there is no array yet
	 */
	public int getNumberOfTerms()
	{
		return (terms == null) ? 0 : terms.length;
	}
	
	/**
	 * @return the list of analyzed file descriptors held by this engine (the list itself, not a copy)
	 */
	public LinkedList<FileDescr> getAnalyzedFiles()
	{
		return this.analyzedFiles;
	}
	
	/**
	 * Mutable holder of a term key and the length that goes with it.
	 */
	private class KeyLen
	{
		
		/** Term key; reassigned by callers, e.g. when the key is lower-cased. */
		public String key;
		/** Length associated with the key. */
		public int len;
		
		/**
		 * @param termKey initial key
		 * @param termLen initial length
		 */
		public KeyLen(String termKey, int termLen)
		{
			key = termKey;
			len = termLen;
		}
		
	}
	
	/**
	 * Orders bigrams in ascending order of the values stored for them in the bigrams map 
	 * of the enclosing engine.
	 */
	private class BigramComp implements Comparator<Bigram>
	{

		/**
		 * Compares two bigrams by their values in the bigrams map.
		 * 
		 * Float.compare is used rather than the relational operators: with the operators 
		 * NaN compares as equal to every value, which makes the ordering non-transitive 
		 * and violates the Comparator contract.
		 * 
		 * @param b1 first bigram; must have an entry in the bigrams map
		 * @param b2 second bigram; must have an entry in the bigrams map
		 * @return a negative number, zero or a positive number as the value of b1 
		 *         is less than, equal to or greater than the value of b2
		 * @throws NullPointerException if either bigram has no value in the map
		 */
		public int compare(Bigram b1, Bigram b2) 
		{
			float f1 = bigrams.get(b1), f2 = bigrams.get(b2);

			return Float.compare(f1, f2);
		}
		
	}
	
	/**
	 * Keeps, for each token index, the list of tokens related to the token with that index.
	 */
	private class UDStructure
	{
		
		private HashMap<Integer, LinkedList<UDToken>> struct;
		
		public UDStructure()
		{
			struct = new HashMap<Integer, LinkedList<UDToken>>();
		}
		
		/**
		 * Appends t2 to the tokens related to t1.
		 * 
		 * @param t1 token whose index is the key
		 * @param t2 token to append
		 */
		public void add(UDToken t1, UDToken t2)
		{
			LinkedList<UDToken> related = struct.get(t1.index);
			
			if (related != null) {
				related.add(t2);
				return;
			}
			related = new LinkedList<UDToken>();
			related.add(t2);
			struct.put(t1.index, related);
		}
		
		/**
		 * @param t token whose index is looked up
		 * @return the internal list of tokens related to t (not a copy), or null when nothing was added for it
		 */
		public LinkedList<UDToken> getRelatedNodes(UDToken t)
		{
			return struct.get(t.index);
		}
		
	}
	/**
	 * A phrase together with the maximal phrase it is related to and, optionally, 
	 * the phrases it has been composed of.
	 */
	private class PhraseWithContext
	{
		
		private LinkedList<UDToken> ph;
		private LinkedList<UDToken> maxph;
		private LinkedList<PhraseWithContext> subPhrases;
		
		/**
		 * Creates a phrase with no sub-phrases.
		 */
		public PhraseWithContext(LinkedList<UDToken> ph, LinkedList<UDToken> maxph)
		{
			this(ph, maxph, null);
		}
		
		/**
		 * @param ph tokens of the phrase
		 * @param maxph tokens of the maximal phrase; may be null
		 * @param subPhrases phrases this one has been composed of; may be null
		 */
		public PhraseWithContext(LinkedList<UDToken> ph, LinkedList<UDToken> maxph, LinkedList<PhraseWithContext> subPhrases)
		{
			this.ph = ph;
			this.maxph = maxph;
			this.subPhrases = subPhrases;
		}
		
		public void setPhrase(LinkedList<UDToken> ph)
		{
			this.ph = ph;
		}
		
		public LinkedList<UDToken> phrase()
		{
			return this.ph;
		}
		
		public LinkedList<UDToken> maxPhrase()
		{
			return this.maxph;
		}
		
		public LinkedList<PhraseWithContext> subPhrases()
		{
			return this.subPhrases;
		}
		
		/**
		 * Sets the maximal phrase of this phrase and, recursively, of all its sub-phrases.
		 * 
		 * @param maxph the new maximal phrase; may be null
		 */
		public void setMaxPhrase(LinkedList<UDToken> maxph)
		{
			this.maxph = maxph;
			if (subPhrases == null) return;
			for (PhraseWithContext nested : subPhrases) nested.setMaxPhrase(maxph);
		}
		
		/**
		 * Builds the context string from the lemmas of the maximal-phrase tokens that are 
		 * not part of this phrase.
		 * 
		 * For every run of outside tokens that is followed by a token of the phrase, 
		 * the lemma of the last token of the run is appended; for a run that closes 
		 * the maximal phrase, the lemma of its first token is appended. The lemmas are 
		 * concatenated without any separator.
		 * 
		 * @return the context, or null when there is no maximal phrase or nothing was appended
		 */
		public String context()
		{
			if (maxph == null) return null;
			
			StringBuilder collected = new StringBuilder();
			String nearestBefore = null;
			String firstOfRun = null;
			
			for (UDToken token : maxph) {
				if (!ph.contains(token)) {
					if (nearestBefore == null) firstOfRun = token.lemma;
					nearestBefore = token.lemma;
				}
				else if (nearestBefore != null) {
					collected.append(nearestBefore);
					nearestBefore = null;
					firstOfRun = null;
				}
			}
			if (firstOfRun != null) collected.append(firstOfRun);
			return (collected.length() == 0) ? null : collected.toString();
		}
		
	}
	
	/**
	 * Keeps, for each token index, the list of (token, phrases) pairs registered 
	 * for the token with that index.
	 */
	private class SubphrasesIndex
	{
		
		private HashMap<Integer, LinkedList<Pair<UDToken, LinkedList<PhraseWithContext>>>> phIndex;
		
		public SubphrasesIndex()
		{
			phIndex = new HashMap<Integer, LinkedList<Pair<UDToken, LinkedList<PhraseWithContext>>>>();
		}
		
		/**
		 * Appends a pair to the entries registered for t.
		 * 
		 * @param t token whose index is the key
		 * @param pair entry to append
		 */
		public void add(UDToken t, Pair<UDToken, LinkedList<PhraseWithContext>> pair)
		{
			LinkedList<Pair<UDToken, LinkedList<PhraseWithContext>>> entries = phIndex.get(t.index);
			
			if (entries != null) {
				entries.add(pair);
				return;
			}
			entries = new LinkedList<Pair<UDToken, LinkedList<PhraseWithContext>>>();
			entries.add(pair);
			phIndex.put(t.index, entries);
		}
		
		/**
		 * @param t token whose index is looked up
		 * @return the internal list of entries registered for t (not a copy), or null when there are none
		 */
		public LinkedList<Pair<UDToken, LinkedList<PhraseWithContext>>> get(UDToken t)
		{
			return phIndex.get(t.index);
		}
		
	}
	
}
