''' 
Shared module used by the XLE wrapper and by the tool that builds the lexicon from the Skladnica/NKJP XML files.
Authors: Bartosz Zaborowski (bz233728@students.mimuw.edu.pl), Agnieszka Patejuk (aep@ipipan.waw.pl)

This program is free software. It comes without any warranty, to
the extent permitted by applicable law. You can redistribute it
and/or modify it under the terms of the Do What The Fuck You Want
To Public License, Version 2, as published by Sam Hocevar. See
http://sam.zoy.org/wtfpl/COPYING for more details.

'''

import re
import string
from morfeusz_nkjp import *

# Header line written at the top of every lexicon file produced by
# zapisz_slownik().
NAGLOWEK_SLOWNIKA = "POLFIE POLISH LEXICON (1.0)"

# Existing core lexicon; it is scanned at import time to collect the head
# words stored in ``wyjatki``.
LEX_CORE = "POLFIE-lex-core-auto.lfg"
# Output file that receives the generated entries of tokens found in
# ``wyjatki`` (see saveEntries()).
LEX_CORE_ALT = "lex-core-auto-alt"

# Input dictionaries; the first five are read by makeValDict(), the last two
# by makeBrevDict() and makeCompDict() respectively.
SLOWNIK_CZASOWNIKOW = "dicts2xle/dict-val-verbs"
SLOWNIK_NIEWLASCIWYCH = "dicts2xle/dict-val-quasi"
SLOWNIK_RAMEK_DOMYSLNYCH = "dicts2xle/dict-val-default"
SLOWNIK_RZECZOWNIKOW = "dicts2xle/dict-val-nouns"
SLOWNIK_PRZYMIOTNIKOW = "dicts2xle/dict-val-adjs"
SLOWNIK_SKROTOW = "dicts2xle/gfjp_slowskr.pl"
SLOWNIK_COMP = "dicts2xle/compclasses"

# Part-of-speech tags for which walencyjnosc() looks up verbal/quasi-verbal
# valence frames.
valbase = ["fin", "praet", "inf", "pact", "ppas", "pcon", "pant", "impt", "imps", "pred", "ger", "winien", "bedzie"]

# Frame names that valPos() keeps for passive participles ("ppas").
passivisable = ["TRANS", "TRANSCOR", "DITRANS", "DITRANSCOR", "DITRANSP", "DITRANSPCOR", "DITRANSCORP", "PREDDITRANSP", "DITRANSTHPCOR", "DISUB", "PREDCOPULAR", "TRITRANS", "TRITRANSP", "TRITRANSSUB", "TRITRANSCOR", "TRITRANSPCOR", "PREDTRITRANS", "TRITRANSPP", "TRITRANSPSUB", "TRITRANSTHPSUB", "ADVNP", "ADVNPNP", "ADVADVNP"]

# Diagnostics accumulated while the lexicon is generated:
# oov   -- lemmas for which walencyjnosc() found no valence entry
# punct -- tokens whose tag is "interp" (collected in zapisz_slownik())
# segf  -- tokens with more than 52 entries (collected in saveEntries())
oov = []
punct = []
segf = []

def extraInfo(lst, basename, suffix):
	"""Dump a list of unicode strings to the file ``<basename>-<suffix>``.

	Every item is written UTF-8 encoded on a line of its own.  When ``lst``
	is empty nothing is written and no file is created.

	lst      -- items to write (unicode strings)
	basename -- output path without the suffix
	suffix   -- label appended to ``basename`` after a hyphen
	"""
	if not lst:
		return
	# Binary mode, because the items are encoded by hand; ``with`` closes
	# the file even when encoding one of the items fails.
	with open(basename + "-" + suffix, "wb") as out:
		for item in lst:
			out.write(item.encode("utf-8") + b"\n")

def anySegf(sent):
	"""Tell whether any word of ``sent`` is listed in the global ``segf``.

	``sent`` is split on whitespace and every word is decoded from UTF-8
	before it is looked up.  Returns True or False.
	"""
	return any(token.decode("utf-8") in segf for token in sent.split())

def eliminateSF(infile):
	"""Split the lines of ``infile`` into two files according to anySegf().

	Blank lines are ignored.  Every other line is stripped and written,
	followed by an empty line, to ``<infile>-SEGF`` when anySegf() reports a
	hit and to ``<infile>-noSEGF`` otherwise.  Both output files are always
	created (and truncated), even if they end up empty.
	"""
	# Nested ``with`` blocks make sure all three handles are closed, also
	# when reading or the anySegf() check raises.
	with open(infile + '-noSEGF', 'w') as okfile:
		with open(infile + '-SEGF', 'w') as segfile:
			with open(infile, 'r') as source:
				for line in source:
					line = line.strip()
					if not line:
						continue
					target = segfile if anySegf(line) else okfile
					target.write(line + '\n\n')

def makeValDict(dictfile):
	"""Load a valence dictionary file into a dict of frame lists.

	Only lines starting with '(' are used.  The key is the second
	whitespace-separated token inside the parentheses, lower-cased and cut
	at the first underscore; the value is the list of all such lines for
	that key, each stripped, upper-cased and prefixed with '@', in file
	order.

	dictfile -- path of the UTF-8 encoded dictionary
	Returns a dict mapping unicode keys to lists of unicode strings.
	"""
	outdict = {}
	# The file is read as bytes and decoded explicitly; ``with`` closes it
	# even if a line is malformed.
	with open(dictfile, 'rb') as source:
		for raw in source:
			line = raw.decode("utf-8")
			if not line.startswith('('):
				continue
			entry = line.strip()
			baza = entry[1:-1].split()[1].lower().split('_')[0]
			outdict.setdefault(baza, []).append("@" + entry.upper())
	return outdict

# Valence dictionaries, loaded once at import time.  Each maps a lower-cased
# lemma to its list of "@(...)" frame strings (see makeValDict()).
valverb = makeValDict(SLOWNIK_CZASOWNIKOW)
valquasi = makeValDict(SLOWNIK_NIEWLASCIWYCH)
valdef = makeValDict(SLOWNIK_RAMEK_DOMYSLNYCH)
valnoun = makeValDict(SLOWNIK_RZECZOWNIKOW)
valadj = makeValDict(SLOWNIK_PRZYMIOTNIKOW)

def makeCompDict(file):
	"""Load the class table: one ``class<TAB>member member ...`` per line.

	Every whitespace-separated member in the second tab-separated field is
	mapped to the class name in the first field; a member listed more than
	once keeps the class of its last occurrence.  Blank lines are skipped.

	file -- path of the UTF-8 encoded table
	Returns a dict mapping member (unicode) to class name (unicode).
	Raises IndexError for a non-blank line without a tab-separated second
	field.
	"""
	dct = {}
	# Read bytes and decode explicitly; ``with`` guarantees the handle is
	# closed (the file used to be left open).
	with open(file, 'rb') as source:
		for raw in source:
			line = raw.decode('utf-8').strip()
			if not line:
				# a stray empty line is not an error
				continue
			parts = line.split('\t')
			compclass = parts[0]
			for member in parts[1].split():
				dct[member] = compclass
	return dct

# Loaded at import time: maps every member word listed in SLOWNIK_COMP to
# its class name (see makeCompDict()); used by choosemacro() for "comp".
compdict = makeCompDict(SLOWNIK_COMP)

def cprep(pred):
    """Return ``pred`` with every run of whitespace replaced by one underscore.

    Leading and trailing whitespace is dropped.
    """
    words = pred.split()
    return "_".join(words)

def stopOrNot(brev, stop):
    """Return ``brev`` followed by a full stop when ``stop`` is "pun".

    For any other value of ``stop`` the abbreviation is returned unchanged.
    """
    return brev + "." if stop == "pun" else brev

def splitter(line):
    """Return the comma-separated arguments of one abbreviation-dictionary line.

    Everything from the first '%' on is discarded, the rest is stripped,
    its first eight and last two characters are cut off, and what remains is
    split on commas (the pieces are not stripped).
    """
    before_comment = line.split('%')[0]
    arguments = before_comment.strip()[8:-2]
    return arguments.split(',')

def makeBrevEntry(line):
    """Turn one abbreviation-dictionary line into ``[brev, pred, tags]``.

    brev -- second field without its first two and last characters, with a
            full stop appended when the third field is "pun"
    pred -- first field without its first and last characters, whitespace
            replaced by underscores
    tags -- list of everything expand_tags() yields for the last field; the
            value "ini" stands for the full singular noun paradigm
    """
    fields = splitter(line)
    full_form = cprep(fields[0][1:-1])
    abbreviation = stopOrNot(fields[1][2:-1], fields[2].strip())
    # The fourth field is not used, but looking it up rejects lines with
    # fewer than four fields (IndexError).
    oblig = fields[3].strip()
    tag_spec = fields[-1].strip()
    if tag_spec == "ini":
        tag_spec = "subst:sg:nom.gen.dat.acc.inst.loc:m1.m2.m3.f.n"
    return [abbreviation, full_form, list(expand_tags(tag_spec))]

def makeBrevDict(path):
    """Load the abbreviation dictionary.

    Blank lines, lines starting with '%' and lines containing '???' are
    skipped; every other line is parsed with makeBrevEntry().

    path -- path of the UTF-8 encoded dictionary
    Returns a dict mapping the abbreviated form to ``[full form, tags]``:
    the full form comes from the first line seen for that abbreviation and
    ``tags`` collects the tags of all its lines without duplicates, in
    order of first appearance.
    """
    brevdict = {}
    # Read bytes and decode explicitly; ``with`` guarantees the handle is
    # closed (the file used to be left open).
    with open(path, 'rb') as source:
        for raw in source:
            line = raw.decode("utf-8")
            if not line.strip() or line.startswith("%") or "???" in line:
                continue
            form, full, tags = makeBrevEntry(line)
            known = brevdict.setdefault(form, [full, []])[1]
            for tag in tags:
                if tag not in known:
                    known.append(tag)
    return brevdict

# Loaded at import time: maps an abbreviated form to [full form, tags]
# (see makeBrevDict()); consulted by zapisz_slownik() for "brev" tokens.
brevdict = makeBrevDict(SLOWNIK_SKROTOW)

# Collect the head words of the core lexicon file LEX_CORE into ``wyjatki``.
# A head word is the first token of a line that does not start with
# whitespace; the scan stops at the "----" terminator line.
wyjatki = {}
first = True
# ``with`` closes the handle even if a line cannot be decoded.
with open(LEX_CORE, 'r') as lex_core:
	for line in lex_core:
		if len(line.strip()) == 0:
			continue
		if first:
			# the first non-blank line is skipped, it is not an entry
			first = False
			continue
		if line.strip() == "----":
			break
		l = line.decode("utf-8")
		if l[0] in string.whitespace:
			# continuation line of the previous entry
			continue
		wyjatki[l.split()[0]] = True

def valPos(entries, pos):
	"""Filter valence entries according to the part of speech.

	For "ppas" only the entries whose frame name (first token without its
	two leading characters) is listed in ``passivisable`` are returned, as a
	new list.  For every other ``pos`` the ``entries`` object itself is
	returned untouched.
	"""
	if pos != "ppas":
		return entries
	return [frame for frame in entries if frame.split()[0][2:] in passivisable]

def walencyjnosc(haslo, elements):
	"""Return the valence entries for lemma ``haslo``.

	haslo    -- lemma; it is lower-cased before any lookup
	elements -- sequence whose first item is the part-of-speech tag and
	            whose second item (optional) holds the remaining tags

	For the parts of speech in ``valbase`` the verb and/or quasi-verb
	dictionaries are consulted.  If they give nothing, the lemma is recorded
	in ``oov`` and the default frames from ``valdef['insertlemma']`` are
	instantiated for it.  The result is passed through valPos().
	For "subst" and "adj" the list stored in ``valnoun`` / ``valadj`` is
	returned when the lemma is known.  In all other cases the result is an
	empty list.
	"""
	haslo = haslo.lower()
	pos = elements[0]
	if pos in valbase:
		# default
		dcts = [valverb]
		# CHECK: if valverb's nonempty and valquasi also, use both? (WIADOMO)
		if pos == "pred":
			# some have only normal entries (TO)
			if haslo in valquasi:
				dcts = [valquasi]
		# analytic future, WIDAC, etc
		elif pos == "inf":
			dcts = [valverb, valquasi]
		elif pos in ("fin", "praet"):
			# The tags are only needed here; a missing or too short tag
			# list simply means "not third person / neuter singular".
			tag = elements[1] if len(elements) > 1 else []
			# TODO: this won't work properly with morfeusz
			if len(tag) > 1 and tag[0] == "sg" and tag[1] in ('ter', 'n'):
				dcts = [valverb, valquasi]
		entries = []
		for dct in dcts:
			entries.extend(dct.get(haslo, []))
		if not entries:
			if haslo not in oov:
				oov.append(haslo)
			entries = [valence.replace('INSERTLEMMA', haslo.upper())
				for valence in valdef['insertlemma']]
		return valPos(entries, pos)
	if pos == "subst" and haslo in valnoun:
		return valnoun[haslo]
	if pos == "adj" and haslo in valadj:
		return valadj[haslo]
	return []

def trsl(fulltag):
	"""Split a colon-separated tag and translate each part with tlumacz().

	A tag matching ``num:...:nom:...:rec`` first has every "nom" replaced
	by "acc".  Returns the list of translated parts.
	"""
	# nonagreeing numerals are *NOT* nominative
	if re.match("num:.*:nom:.*:rec", fulltag):
		fulltag = fulltag.replace("nom", "acc")
	return [tlumacz(part) for part in fulltag.split(":")]

# Tag values that tlumacz() rewrites; everything else is passed through.
_TAG_TRANSLATIONS = {
	"pri": "1",
	"sec": "2",
	"ter": "3",
	"n1": "n",
	"n2": "n",
	"p2": "n",
	"p3": "n",
	"p1": "m1",
}

def tlumacz(tag):
	"""Translate a single tag value.

	Person tags pri/sec/ter become "1"/"2"/"3", the genders n1, n2, p2 and
	p3 become "n", p1 becomes "m1"; any other tag is returned unchanged.
	"""
	return _TAG_TRANSLATIONS.get(tag, tag)

def choosemacro(elements, base):
	"""Pick the (lower-case) macro name for a translated tag.

	elements -- list of tag parts; the first one is the part of speech
	base     -- lemma, needed for nouns/adjectives, conjunctions and "comp"

	The choice depends on the part of speech and on how many further tag
	parts there are; when no special case applies the part of speech itself
	is returned.  Names for "conj", "preconj" and "comp" come back
	upper-cased from conjmacro() / compmacro().
	"""
	pos = elements[0]
	ntags = len(elements[1:])
	if pos == "adv":
		if ntags == 1:
			return "adv-deg"
	elif pos == "ppron12":
		return "ppron-acc" if ntags > 4 else "ppron"
	elif pos == "ppron3":
		variants = {4: "ppron", 5: "ppron-acc", 6: "ppron-pprep"}
		if ntags in variants:
			return variants[ntags]
	elif pos == "prep":
		if ntags > 1:
			return "prep-voc"
	elif pos == "praet":
		if ntags > 3:
			return "praet-aglt"
	elif pos in ("subst", "adj"):
		# valence of nouns and adjectives
		if walencyjnosc(base, elements) != []:
			return pos + "-core"
	elif pos in ("conj", "preconj"):
		return conjmacro(base, pos).upper()
	elif pos == "comp":
		return compmacro(base, compdict).upper()
	return pos

def conjmacro(base, pos):
	"""Build the macro name ``<pos>-<suffix>`` for a (pre)conjunction.

	base -- lemma of the conjunction
	pos  -- 'conj' or 'preconj'

	The suffix is the lemma itself or the lemma of the group it belongs to
	('ani' for ani/ni, 'a' for a/ale/lecz, 'i' for oraz); lemmas that are
	not listed for the given ``pos`` get the suffix 'unknown'.
	"""
	suffix = 'unknown'
	if base in ('albo', u'b\u0105d\u017a', 'i', 'lub'):
		suffix = base
	elif base in ('ani', 'ni'):
		suffix = 'ani'
	elif pos == 'conj':
		if base in ('a', 'ale', 'lecz'):
			suffix = 'a'
		elif base in ('czy', 'czyli', 'mianowicie', 'natomiast', 'jak', u'za\u015b'):
			suffix = base
		elif base == 'oraz':
			suffix = 'i'
	elif pos == 'preconj':
		if base in ('tak', u'zar\xf3wno', 'nie'):
			suffix = base
	return pos + '-' + suffix

def compmacro(base, dct):
	"""Return the macro name (plus argument) for a "comp" lemma.

	base -- lemma
	dct  -- dict mapping lemmas to their class name

	A lemma found in ``dct`` gives ``comp-<class>``; an unknown lemma gives
	``comp <lemma>``, i.e. the generic macro with the lemma as its argument.
	"""
	# ``in`` replaces dict.has_key(), which no longer exists in Python 3.
	if base in dct:
		return 'comp-' + dct[base]
	return 'comp ' + base

def macro(interp):
	"""Build the upper-cased macro call ``@(NAME [LEMMA] [TAGS...])``.

	interp -- [form, lemma, tag]; only the lemma and the tag are used

	The macro name comes from choosemacro().  The lemma is included unless
	the part of speech is in ``valbase`` or is one of conj/comp/preconj.
	The translated tag parts after the part of speech follow, separated by
	single spaces.
	"""
	base = interp[1]
	elements = trsl(interp[2])
	pos = elements[0]
	parts = ["@(" + choosemacro(elements, base)]
	if pos not in valbase and pos not in ("conj", "comp", "preconj"):
		parts.append(base)
	parts.extend(elements[1:])
	return " ".join(parts).upper() + ")"

def cat(interp):
	"""Return the category label for an interpretation.

	interp -- [form, lemma, tag]

	Normally this is the upper-cased part of speech (the tag up to its first
	colon).  A few "qub" lemmas get a category of their own: 'nie' -> NEG,
	'sie' (with ogonek) -> RM, and niech/niechze/niechaj/niechajze/by -> MM.
	"""
	postag = interp[2].split(':')[0]
	if postag == 'qub':
		base = interp[1]
		if base == 'nie':
			return 'NEG'
		if base == u'si\u0119':
			return 'RM'
		if base in ('niech', u'niech\u017ce', 'niechaj', u'niechaj\u017ce', 'by'):
			return 'MM'
	return postag.upper()

def gfjpcat(dct):
	"""Adjust conj/comp/preconj readings of a token dictionary in place.

	dct -- dict mapping a key to a list of [form, lemma, tag] lists

	* some lemmas tagged 'conj' are retagged as 'comp';
	* the adverbs 'wowczas' and 'wtedy' get an additional 'comp' reading;
	* albo/badz/i/lub/ani/ni tagged 'conj' get an additional 'preconj'
	  reading;
	* tak/zarowno/nie tagged 'conj' are retagged as 'preconj'.

	Additional readings are appended as [key, lemma, tag] and only when not
	already present, so calling the function twice on the same dictionary
	does not duplicate them.  Returns None.
	"""
	for key in dct:
		interps = dct[key]
		# Walk over a snapshot: new readings are appended to ``interps``
		# inside the loop.
		for interp in list(interps):
			base = interp[1]
			if base in (u'dop\xf3ty', u'p\xf3ty', 'jednak', 'to', 'przeto', u'wi\u0119c', 'zatem', u'tote\u017c'):
				if interp[2] == 'conj':
					interp[2] = 'comp'
			if base in (u'w\xf3wczas', 'wtedy'):
				if interp[2] == 'adv':
					extra = [key, base, 'comp']
					if extra not in interps:
						interps.append(extra)
			if interp[2] == 'conj':
				if base in ('albo', u'b\u0105d\u017a', 'i', 'lub', 'ani', 'ni'):
					# adding the PRECONJ interp
					extra = [key, base, 'preconj']
					if extra not in interps:
						interps.append(extra)
				if base in ('tak', u'zar\xf3wno', 'nie'):
					# replacing the CONJ interp with PRECONJ
					interp[2] = 'preconj'
	
def zapisz_slownik(plik_slownika, dct, razem_z_altem = True):
	"""Write the lexicon generated from ``dct`` to ``plik_slownika``.

	plik_slownika -- path of the main lexicon file to (over)write
	dct           -- dict mapping a key to a non-empty list of
	                 [form, lemma, tag] interpretations; it is modified in
	                 place by gfjpcat()
	razem_z_altem -- when true, LEX_CORE_ALT is written as well and receives
	                 the entries of tokens listed in ``wyjatki``

	Both files start with NAGLOWEK_SLOWNIKA and end with a "----" line.
	Tokens tagged "interp" are only recorded in ``punct``; interpretations
	without a lemma are skipped; "brev" tokens are expanded through
	``brevdict`` (abbreviations missing from it are left out).
	"""
	leksykon = open(plik_slownika, 'w+')
	# Stays None when the alternative lexicon is not wanted, so that it can
	# still be handed to saveEntries() (it used to be an unbound local).
	leksykon_alt = None
	try:
		leksykon.write(NAGLOWEK_SLOWNIKA + "\n\n")
		if razem_z_altem:
			leksykon_alt = open(LEX_CORE_ALT, 'w+')
			leksykon_alt.write(NAGLOWEK_SLOWNIKA + "\n\n")

		# replacing/adding some dictionary entries (mostly conj and comp)
		gfjpcat(dct)
		for slowo in dct.values():
			if not slowo:
				continue
			token = slowo[0][0]
			wpisy = []
			skrotykropkowe = []
			haslozkropka = ""
			for interp in slowo:
				if interp[2] == "interp":
					# attention, check
					if interp[0] not in punct:
						punct.append(interp[0])
					continue
				if interp[1] is None:
					continue
				tagparts = interp[2].split(':')
				if tagparts[0] != 'brev':
					makeEntry(interp, wpisy)
					continue
				# Abbreviation: entries for the form with a full stop are
				# collected separately and saved under that form.
				brevform = stopOrNot(token, tagparts[1])
				if brevform == token:
					dest = wpisy
				else:
					dest = skrotykropkowe
					haslozkropka = brevform
				if brevform not in brevdict:
					brevform = brevform.lower()
				# added: some brev entries are missing (such as a. = albo), these won't be included
				if brevform in brevdict:
					fullform = brevdict[brevform][0]
					for exptag in brevdict[brevform][1]:
						makeEntry([brevform, fullform, exptag], dest)
			if wpisy:
				saveEntries(token, wpisy, leksykon, leksykon_alt, razem_z_altem)
			if skrotykropkowe:
				saveEntries(haslozkropka, skrotykropkowe, leksykon, leksykon_alt, razem_z_altem)

		leksykon.write("----\n\n")
		if leksykon_alt is not None:
			leksykon_alt.write("----\n\n")
	finally:
		# close both files even if generating an entry failed
		leksykon.close()
		if leksykon_alt is not None:
			leksykon_alt.close()

def formatVal(vallist):
	"""Format alternatives as ``{ a | b | ... }``.

	vallist -- list of strings; they are joined with " | " and wrapped in
	           "{ " and " }".
	"""
	return "{ " + " | ".join(vallist) + " }"

def makeEntry(interp, wpisy):
	"""Append the lexicon entry (or entries) for ``interp`` to ``wpisy``.

	interp -- [form, lemma, tag]
	wpisy  -- list of entry strings collected for the current token; it is
	          extended in place and never receives the same string twice

	An entry has the form ``CAT * [valence] macro``.  When valence frames
	exist for a noun or adjective ("-CORE" macro), a reduced entry without
	valence is added as well.  Forms of the lemma 'byc' (with acute c) other
	than aglt/impt additionally get an AUX entry.
	"""
	kategoria = cat(interp) + " * "
	posmakro = macro(interp)
	tag = interp[2].split(':')
	walencje = sorted(set(walencyjnosc(interp[1], [tag[0], tag[1:]])))
	wpis = kategoria
	if walencje:
		# handles reduced entries of nouns and adjectives with valence entries
		if "-CORE" in posmakro:
			reduced = kategoria + posmakro.replace("-CORE", "")
			if reduced not in wpisy:
				wpisy.append(reduced)
		if len(walencje) > 1:
			wpis += formatVal(walencje)
		else:
			wpis += walencje[0]
		wpis += "\n\t\t"
	wpis += posmakro
	# to avoid repeated entries
	if wpis not in wpisy:
		wpisy.append(wpis)
	# for AUX entries (CHECK: number, POS for BEDZIE)
	if interp[1] == u'by\u0107' and tag[0] in ("bedzie", "fin", "praet", "inf"):
		auxmakro = posmakro
		if tag[0] in ("fin", "praet"):
			# all forms, because TO uses agreeing AUX (ludzie to sa wilki, panstwo to jestem ja)
			auxmakro = posmakro.replace("@(FIN", "@(FIN-AUX").replace("@(PRAET", "@(PRAET-AUX")
		wpisaux = "AUX * " + auxmakro
		# AUX entries go through the same duplicate check as all others
		if wpisaux not in wpisy:
			wpisy.append(wpisaux)

def saveEntries(token, wpisy, leksykon, leksykon_alt, razem_z_altem):
	"""Write all entries of one token as a single lexicon record.

	token         -- head word of the record
	wpisy         -- list of entry strings; nothing is written if it is empty
	leksykon      -- open file for tokens not listed in ``wyjatki``
	leksykon_alt  -- open file for tokens listed in ``wyjatki``; only used
	                 when ``razem_z_altem`` is true
	razem_z_altem -- whether the alternative lexicon is being written

	The record is ``token<TAB>entry;<NL><TAB>entry...`` terminated by a
	full stop and an empty line, written UTF-8 encoded.  Commas, full stops
	and apostrophes inside it are escaped with a backquote.  A token with
	more than 52 entries is recorded in ``segf``.
	"""
	if not wpisy:
		return
	# NOTE(review): 52 is presumably the number of entries above which the
	# downstream tool fails ("segf") -- confirm.
	if len(wpisy) > 52 and token not in segf:
		segf.append(token)
	caly = token + "\t" + ";\n\t".join(wpisy)
	# "," and "." : to avoid problems with floating point in lemma
	# "'"         : to avoid problems with invisible entries
	for znak in (",", ".", "'"):
		caly = caly.replace(znak, "`" + znak)
	caly += ".\n\n"
	if token in wyjatki:
		if razem_z_altem:
			leksykon_alt.write(caly.encode("utf-8"))
	else:
		leksykon.write(caly.encode("utf-8"))
