''' 
a wrapper around XLE that analyses the text to be parsed with Morfeusz and generates a lexicon
authors: bartosz zaborowski (bz233728@students.mimuw.edu.pl), agnieszka patejuk (aep@ipipan.waw.pl)

This program is free software. It comes without any warranty, to
the extent permitted by applicable law. You can redistribute it
and/or modify it under the terms of the Do What The Fuck You Want
To Public License, Version 2, as published by Sam Hocevar. See
http://sam.zoy.org/wtfpl/COPYING for more details.

'''


from common2xle import *
from morftagtrans import *
import sys
import subprocess
from subprocess import Popen, PIPE
import readline
import os

# Operating modes; one of them is stored in `mode` below.
# INTERAKTYWNIE is chosen by the "-i" switch, WSADOWO (batch) otherwise.
INTERAKTYWNIE = 1
WSADOWO = 2

# change this path accordingly
SCIEZKA_DO_XLE = "/opt/xle/bin/xle"

# arguments

# Two invocations are accepted:
#   input_text output_text output_lexicon   (batch)
#   -i output_lexicon                       (interactive)
if len(sys.argv) != 4 and (len(sys.argv) != 3 or sys.argv[1] != "-i"):
	print("use:")
	print("for batch parsing:")
	print("\tpython "+sys.argv[0]+" input_text output_text output_lexicon\n")
	print("for interactive use with XLE (you need to point to output_lexicon in the grammar config file):")
	print("\tpython "+sys.argv[0]+" -i output_lexicon\n\n")
	print("required files:\n")
	print(SLOWNIK_CZASOWNIKOW+" -- converted valence dictionary of verbs\n")
	print(SLOWNIK_NIEWLASCIWYCH+" -- converted valence dictionary of quasi verbs\n")
	print(SLOWNIK_RZECZOWNIKOW+" -- converted valence dictionary of nouns\n")
	print(SLOWNIK_PRZYMIOTNIKOW+" -- converted valence dictionary of adjectives\n")
	print(SLOWNIK_SKROTOW+" -- dictionary of abbreviation expansions\n")
	print(LEX_CORE+" -- manually created lexicon,")
	print("\twords defined there are stored in "+LEX_CORE_ALT+" rather than in output_lexicon\n")
	print("dicts2xle/gfjp_slowwyj.pl -- dictionary used for filtering Morfeusz output\n")
	# sys.exit() rather than the site-provided exit(), which is meant for the
	# interactive shell; a bad invocation reports a non-zero status.
	sys.exit(1)

# Report files filled in at the end of a batch run.  They are opened (and
# thereby truncated) only after the arguments were accepted, so that a
# usage error does not wipe the reports of a previous run.
pozaslownikiem = open("morf-oov", 'w+')
interpunkcja = open("morf-punct", 'w+')

if sys.argv[1] == "-i":
	mode = INTERAKTYWNIE
	LEKSYKON_WYJSCIOWY = sys.argv[2]
else:
	mode = WSADOWO
	LEKSYKON_WYJSCIOWY = sys.argv[3]
# function library

def anyaglt(interps):
	'''Reduce the readings to a single one if any of them has an 'aglt' segment.

	interps is a sequence of interpretations, each a sequence of segments
	indexed as segm[0], segm[1], segm[2]; segm[2] is a colon-separated tag
	whose first field is the class name.  Segments whose segm[1] is None
	are not inspected.

	Returns a one-element list holding the result of aglttok(interps) when
	some segment is tagged 'aglt', otherwise interps itself, unchanged.
	'''
	has_aglt = any(
		segm[1] is not None and segm[2].split(':')[0] == 'aglt'
		for interp in interps
		for segm in interp
	)
	if has_aglt:
		return [aglttok(interps)]
	return interps

def aglttok(interps):
	'''Pick one interpretation out of readings that involve an agglutinate.

	Each interpretation is a sequence of segments; segm[0] and segm[1] are
	compared against word forms below and segm[2] is a colon-separated tag
	whose first field names the class.

	First choice: the first interpretation in which an 'aglt' segment
	directly follows an accepted host:
	  * 'praet' whose segm[0] is not u'udzia\\u0142',
	  * 'winien',
	  * 'comp' whose segm[1] is not 'kiedy',
	  * 'qub' whose segm[1] is 'by'.
	Second choice: the first interpretation with no 'aglt' segment at all.

	Returns the chosen interpretation, or None when neither rule applies.

	An 'aglt' in the first position has no preceding segment and is never
	accepted (indexing i-1 used to wrap round to the last segment), and
	segments without a tag (segm[2] is None) are skipped instead of
	raising AttributeError.
	'''
	def klasa(segm):
		# class name of a segment, or None for a segment without a tag
		tag = segm[2]
		if tag is None:
			return None
		return tag.split(':')[0]

	for interp in interps:
		# start at 1: only a segment with a predecessor can be attached
		for i in range(1, len(interp)):
			if klasa(interp[i]) != 'aglt':
				continue
			host = interp[i-1]
			host_class = klasa(host)
			if host_class == 'praet' and host[0] != u'udzia\u0142':
				return interp
			if host_class == 'winien':
				return interp
			if host_class == 'comp' and host[1] != 'kiedy':
				return interp
			if host_class == 'qub' and host[1] == 'by':
				return interp
	for interp in interps:
		if not any(segm[1] is not None and klasa(segm) == 'aglt' for segm in interp):
			return interp
	return None

def tokeny(interps):
	'''Return the segm[0] forms of the first reading kept by anyaglt().

	interps is the analysis of one word: a sequence of interpretations,
	each a sequence of segments.  None and an empty sequence both yield
	an empty list (an empty sequence used to raise IndexError).

	NOTE(review): aglttok() can return None, in which case anyaglt()
	yields [None] and the iteration below raises TypeError -- confirm
	whether such analyses can occur in practice.
	'''
	if not interps:
		return []
	return [segm[0] for segm in anyaglt(interps)[0]]

def tokenize(line):
	'''Split line on runs of whitespace and return the list of pieces.'''
	pieces = line.split()
	return pieces

def przetworz_zdanie(line, localdct):
	'''Analyse one sentence and collect its lexicon entries.

	line     -- the sentence as a UTF-8 encoded byte string
	localdct -- dictionary updated in place: maps segm[0] of every segment
	            to the list of distinct segments (as lists) seen for it

	Every whitespace-separated word is passed to analyse(); the entries
	come from the translated and expanded analysis, the output tokens
	from the untranslated one (via tokeny()).

	Returns the tokens joined with single spaces.
	'''
	tokens = []
	for word in tokenize(line.decode('utf-8')):
		an = analyse(word, False, False, False)
		# tags are translated and expanded
		for reading in expand_analyses(tagtrans(an)):
			for segm in reading:
				# tuple to list conversion (for gfjpcat)
				entry = list(segm)
				known = localdct.setdefault(entry[0], [])
				if entry not in known:
					known.append(entry)
		# to avoid doubled tokens after translation
		tokens.extend(tokeny(an))
	return " ".join(tokens)

def czytaj_polecenie():
	'''Prompt with "% " and return the line typed, or None on end of input.'''
	try:
		return raw_input("% ")
	except EOFError:
		return None


# batch mode

# Convert every non-blank line of the input text (sys.argv[1]), write the
# results to the output text (sys.argv[2]), save the collected lexicon and
# the two report files, then leave the program.
if mode == WSADOWO:
	localdct = {}

	# try/finally so both files are closed even if processing a line fails
	zrodlo = open(sys.argv[1], 'r')
	try:
		wyjscie = open(sys.argv[2], 'w')
		try:
			for line in zrodlo:
				zdanie = line.strip()
				if zdanie:
					wyjscie.write(przetworz_zdanie(zdanie, localdct).encode('utf-8')+"\n\n")
		finally:
			wyjscie.close()
	finally:
		zrodlo.close()

	zapisz_slownik(LEKSYKON_WYJSCIOWY, localdct)
	# punct and oov are not defined in this file -- presumably they are
	# filled in by the modules star-imported above; verify there.
	for item in punct:
		interpunkcja.write(item.encode("utf-8") + "\n")
	interpunkcja.close()
	for item in oov:
		pozaslownikiem.write(item.encode("utf-8") + "\n")
	pozaslownikiem.close()
	# sys.exit() rather than the site-provided exit() helper
	sys.exit(0)

# interactive mode

# Commands are read from the user and piped to an XLE child process.
# "parse {sentence}" commands are preprocessed first: the sentence goes
# through przetworz_zdanie() and the lexicon file is rewritten before XLE
# is asked to parse it.  Everything else is forwarded as typed.

# there must be a lexicon file for XLE to load the grammar
zapisz_slownik(LEKSYKON_WYJSCIOWY, {})

# expanduser("~") gives $HOME when it is set and still works when it is not,
# whereas os.environ["HOME"] would raise KeyError
histfile = os.path.join(os.path.expanduser("~"), ".pyxlemorf_history")
try:
	readline.read_history_file(histfile)
except IOError:
	# no readable history file -- start with an empty history
	pass

xle = Popen([SCIEZKA_DO_XLE], stdin=PIPE, close_fds=True)

line = czytaj_polecenie()
while line is not None:
	l = line.strip()
	idx1 = l.find("{")
	idx2 = l.find("}")
	# Rewrite only the bare "parse" command with a braced sentence.  Other
	# commands that merely start with "parse" (e.g. parse-testfile) and
	# parse commands without braces are forwarded untouched instead of
	# being cut up by the slice below.
	if l[0:5] == "parse" and l[5:6] in (" ", "\t", "{") and 0 <= idx1 < idx2:
		zdanie = l[idx1+1:idx2]
		localdct = {}
		wynik = przetworz_zdanie(zdanie, localdct)
		zapisz_slownik(LEKSYKON_WYJSCIOWY, localdct)
		xle.stdin.write(("parse {"+wynik+"}\n").encode("utf-8"))
	else:
		# czytaj_polecenie() returns the line without its terminator
		# (raw_input strips it); put it back so XLE sees a complete command
		xle.stdin.write(line+"\n")
	line = czytaj_polecenie()

# xle.stdin.write("close-all\n")
xle.stdin.write("exit\n")
xle.stdin.close()

readline.write_history_file(histfile)

