# -*- coding: utf-8 -*- 

''' 
common module used by the XLE wrapper and by lexicon generation from Skladnica/NKJP XML files
authors: bartosz zaborowski (bz233728@students.mimuw.edu.pl), agnieszka patejuk (aep@ipipan.waw.pl)

This program is free software. It comes without any warranty, to
the extent permitted by applicable law. You can redistribute it
and/or modify it under the terms of the Do What The Fuck You Want
To Public License, Version 2, as published by Sam Hocevar. See
http://sam.zoy.org/wtfpl/COPYING for more details.

'''

import io
import re
import string

from morfeusz_nkjp import *
from frame_maker_lib import *

# header line written at the top of every lexicon file produced by zapisz_slownik
NAGLOWEK_SLOWNIKA = "POLFIE POLISH LEXICON (1.0)"

# LEX_CORE is read at import time to collect the entries stored in `wyjatki`;
# LEX_CORE_ALT is the output file that receives entries whose token is in `wyjatki`
LEX_CORE = "POLFIE-lex-core-auto.lfg"
LEX_CORE_ALT = "lex-core-auto-alt"

# variables used by Walenty
PATH_TO_WALENTY = "slowal_20140608_gotowe_tymczasowy_sprawdzone_SPLIT_noORargsL.txt"
# passed to makeValDictByPos; set to True to obtain reduced frames
makereduced = True
# passed to makeValDictByPos; set to True to obtain passive frames
makepassive = True

# side outputs written next to the Walenty file
# NOTE(review): neither handle is closed explicitly in this module
debugging = open(PATH_TO_WALENTY+'-debug', 'w')
mwes = open(PATH_TO_WALENTY+'-mwe', 'w')

# makeValDictByPos, printArgDict, unlikegfdict, debug and fixed are not defined
# in this file -- presumably provided by the star imports above (TODO confirm)
WalentyFramesByPos = makeValDictByPos(PATH_TO_WALENTY, makereduced, makepassive)

printArgDict(unlikegfdict, PATH_TO_WALENTY+'-stats_unlikes')

# dump the de-duplicated, sorted debug messages, one per line
for line in sorted(list(set(debug))):
    debugging.write(line+'\n')

# dump the sorted entries of `fixed`, one per line
for mwe in sorted(fixed):
    mwes.write(mwe+'\n')
# end of variables used by Walenty

# input dictionaries: noun valence, adjective valence, abbreviations, complementiser classes
SLOWNIK_RZECZOWNIKOW = "dicts2xle/dict-val-nouns"
SLOWNIK_PRZYMIOTNIKOW = "dicts2xle/dict-val-adjs"
SLOWNIK_SKROTOW = "dicts2xle/gfjp_slowskr.pl"
SLOWNIK_COMP = "dicts2xle/compclasses"

# POS tags for which walencyjnosc looks valence up in WalentyFramesByPos
valbase = ["fin", "praet", "inf", "pact", "ppas", "pcon", "pant", "impt", "imps", "pred", "ger", "winien", "bedzie"]

# lemmas with a POS from valbase for which walencyjnosc found no valence entry
oov = []
# oov_brev = []
# abbreviation form -> number of times it was missing from brevdict (filled by zapisz_slownik)
oov_brev = {}
# forms whose tag is "interp" (filled by zapisz_slownik)
punct = []
# tokens that received more than 52 entries (filled by saveEntries, read by anySegf)
segf = []

# warnings from makeEntry are written here; the handle stays open for the module's lifetime
log = open('event_log', 'w')

def extraInfo(lst, basename, suffix):
	"""Write the items of *lst* to the file ``<basename>-<suffix>``, one per line.

	Each item is encoded with ``.encode("utf-8")`` before being written.
	When *lst* is empty no file is created.
	"""
	if len(lst) > 0:
		# the context manager makes sure the output is flushed and closed
		with open(basename+"-"+suffix, "w") as out:
			for item in lst:
				out.write(item.encode("utf-8") + "\n")

def extraInfoDct(dct, basename, suffix):
	"""Write *dct* to the file ``<basename>-<suffix>`` as ``value<TAB>key`` lines.

	Values are rendered with ``str()``, keys are encoded with
	``.encode("utf-8")``.  When *dct* is empty no file is created.
	"""
	if len(dct) > 0:
		# the context manager makes sure the output is flushed and closed
		with open(basename+"-"+suffix, "w") as out:
			for key in dct:
				out.write(str(dct[key]) + '\t' + key.encode("utf-8") + "\n")

def anySegf(sent):
	"""Return True when any whitespace-separated word of *sent* is listed in ``segf``.

	Every word is decoded from UTF-8 before the membership test.
	"""
	return any(word.decode("utf-8") in segf for word in sent.split())

def eliminateSF(infile):
	"""Split the non-empty lines of *infile* into two sibling files.

	Lines for which anySegf() is false go to ``<infile>-noSEGF``, the
	remaining ones to ``<infile>-SEGF``.  Every line is stripped and written
	followed by an empty line.
	"""
	# outputs are opened before the input, as before; all three handles are
	# now closed even when reading or writing fails
	with open(infile+'-noSEGF', 'w') as okfile:
		with open(infile+'-SEGF', 'w') as segfile:
			with open(infile, 'r') as source:
				for line in source:
					line = line.strip()
					if not line:
						continue
					if anySegf(line):
						segfile.write(line+'\n\n')
					else:
						okfile.write(line+'\n\n')

def makeValDict(dictfile):
	"""Read a valence dictionary file into ``{key: ["@(...)", ...]}``.

	Only lines starting with ``(`` are used.  The key is the second
	whitespace-separated token inside the parentheses, lowercased and cut at
	the first underscore; the value list holds the stripped lines prefixed
	with ``@``, in file order.

	dictfile -- path of a UTF-8 encoded dictionary file
	Raises IndexError for a parenthesised line with fewer than two tokens.
	"""
	outdict = {}
	# io.open decodes once at the I/O boundary and the context manager closes
	# the handle even when a malformed line raises
	with io.open(dictfile, 'r', encoding='utf-8') as source:
		for line in source:
			if line[0] != '(':
				continue
			entry = line.strip()
			baza = entry[1:-1].split()[1].lower().split('_')[0]
			outdict.setdefault(baza, []).append("@"+entry)
	return outdict

# valence entries for nouns and adjectives, loaded once at import time;
# walencyjnosc looks them up by lowercased lemma
valnoun = makeValDict(SLOWNIK_RZECZOWNIKOW)
valadj = makeValDict(SLOWNIK_PRZYMIOTNIKOW)

def makeCompDict(file):
	"""Read a complementiser-class file into ``{member: class}``.

	Every non-blank line has the form ``class<TAB>member member ...``; each
	member is mapped to its class (a later line overrides an earlier one).
	Blank lines are skipped.

	file -- path of a UTF-8 encoded file
	Raises IndexError for a non-blank line without a tab-separated member field.
	"""
	dct = {}
	with io.open(file, 'r', encoding='utf-8') as source:
		for line in source:
			line = line.strip()
			if not line:
				# a blank line used to crash on parts[1]
				continue
			parts = line.split('\t')
			compclass = parts[0]
			for member in parts[1].split():
				dct[member] = compclass
	return dct

# complementiser -> complementiser class, loaded once at import time (used by choosemacro)
compdict = makeCompDict(SLOWNIK_COMP)

def cprep(pred):
    """Return *pred* with every run of whitespace replaced by a single underscore.

    Leading and trailing whitespace is dropped.
    """
    words = pred.split()
    return "_".join(words)

def stopOrNot(brev, stop):
    """Return *brev* followed by a full stop when *stop* is "pun", else *brev* unchanged."""
    return brev + "." if stop == "pun" else brev

def splitter(line):
    """Return the comma-separated fields of one abbreviation dictionary line.

    Everything from the first ``%`` on is discarded, the rest is stripped,
    its first eight and last two characters are cut off, and what remains is
    split on commas (the fields themselves are not stripped).
    """
    before_comment = line.partition('%')[0]
    payload = before_comment.strip()[8:-2]
    return payload.split(',')

def makeBrevEntry(line):
    """Parse one abbreviation dictionary line into ``[form, full_form, tags]``.

    form      -- the abbreviation, with a trailing full stop when the third
                 field is "pun" (see stopOrNot)
    full_form -- the first field without its outer characters, whitespace
                 turned into underscores (see cprep)
    tags      -- list built from expand_tags() applied to the last field;
                 the value "ini" stands for the tag set spelled out below
    """
    fields = splitter(line)
    pred = cprep(fields[0][1:-1])
    brev = stopOrNot(fields[1][2:-1], fields[2].strip())
    # the fourth field is read but its value is not used; the lookup still
    # makes lines with fewer than four fields fail with IndexError
    oblig = fields[3].strip()
    tags = fields[-1].strip()
    if tags == "ini":
        tags = "subst:sg:nom.gen.dat.acc.inst.loc:m1.m2.m3.f.n"
    return [brev, pred, list(expand_tags(tags))]

def makeBrevDict(path):
    """Read the abbreviation dictionary into ``{form: [full_form, [tag, ...]]}``.

    Blank lines, lines starting with ``%`` and lines containing ``???`` are
    skipped.  For a form that occurs on several lines the full form of its
    first line is kept and the tags of all lines are merged, without
    duplicates and in order of first appearance.

    path -- path of a UTF-8 encoded dictionary file
    """
    brevdict = {}
    # the context manager closes the file even if a malformed line raises
    with open(path, 'r') as source:
        for line in source:
            line = line.decode("utf-8")
            if not line.strip():
                continue
            # plain string tests replace the regexes "%.*" and ".*\?\?\?.*"
            if line.startswith("%") or "???" in line:
                continue
            form, full, tags = makeBrevEntry(line)
            if form not in brevdict:
                brevdict[form] = [full, []]
            known_tags = brevdict[form][1]
            for tag in tags:
                if tag not in known_tags:
                    known_tags.append(tag)
    return brevdict

# abbreviation form -> [full form, expanded tags], loaded once at import time
brevdict = makeBrevDict(SLOWNIK_SKROTOW)

lex_core = open(LEX_CORE, 'r')

# wyjatki: first token of every non-indented line of LEX_CORE;
# saveEntries routes entries for these tokens to the alternative lexicon
wyjatki = {}
first = True
for line in lex_core:
	if len(line.strip()) == 0:
		continue
	# the first non-empty line is skipped (presumably the lexicon header -- TODO confirm)
	if first:
		first = False
		continue
	# "----" ends the part of the file that is scanned
	if line.strip() == "----":
		break
	l = line.decode("utf-8")
	# lines starting with whitespace are skipped; only the first token of the others is recorded
	if l[0] in string.whitespace:
		continue
	wyjatki[l.split()[0]] = True

lex_core.close()

def walencyjnosc(haslo, elements):
	"""Return the list of valence entries for lemma *haslo*.

	haslo    -- lemma; it is lowercased before any lookup
	elements -- sequence whose first item is the POS tag; the remaining
	            items are not inspected

	For a POS listed in ``valbase`` the entries come from
	``WalentyFramesByPos``: the 'ppas' frames when the POS is 'ppas', the
	'default' frames otherwise; entries consisting only of whitespace are
	dropped.  When nothing is found the lemma is recorded in ``oov``.
	For 'subst' and 'adj' the entries come from ``valnoun`` / ``valadj``.
	An empty list is returned when there is no entry.
	"""
	haslo = haslo.lower()
	pos = elements[0]
	if pos in valbase:
		# Walenty: {{key: {Walenty_schema: {'default': […], 'ppas': […]}}, …}, …}
		kind = 'ppas' if pos == 'ppas' else 'default'
		entries = []
		if haslo in WalentyFramesByPos:
			schemas = WalentyFramesByPos[haslo]
			for schema in schemas:
				frames = schemas[schema]
				if kind in frames:
					for valence in frames[kind]:
						if valence.strip():
							entries.append(valence)
		if entries:
			return entries
		if haslo not in oov:
			oov.append(haslo)
	if pos == "subst" and haslo in valnoun:
		return valnoun[haslo]
	if pos == "adj" and haslo in valadj:
		return valadj[haslo]
	return []

def trsl(fulltag):
	"""Split a colon-separated tag and translate every part with tlumacz().

	Returns the list of translated parts, in order.
	"""
	# nonagreeing numerals are *NOT* nominative
	if re.match("num:.*:nom:.*:rec", fulltag):
		fulltag = fulltag.replace("nom", "acc")
	return [tlumacz(part) for part in fulltag.split(":")]

# tag values that are renamed by tlumacz(); everything else is passed through
_TLUMACZ_MAP = {
	"pos": "positive",
	"com": "comparative",
	"sup": "superlative",
	"pri": "1",
	"sec": "2",
	"ter": "3",
	"n1": "n",
	"n2": "n",
	"p2": "n",
	"p3": "n",
	"p1": "m1",
}

def tlumacz(tag):
	"""Translate a single tag value; values without a translation are returned as they are."""
	return _TLUMACZ_MAP.get(tag, tag)

def choosemacro(elements, base):
	"""Pick the (lowercase) macro name for a translated tag and its lemma.

	elements -- translated tag parts, POS first (as produced by trsl)
	base     -- lemma

	The plain POS is returned unless one of the special cases below applies.
	For 'comp' the result of compmacro() is returned; it may contain a space
	followed by the lemma.
	"""
	pos = elements[0]
	tag_count = len(elements[1:])

	# start of entries which have *only* lexicalised POS macros
	if pos == "qub":
		if base in ('nie', 'by', u'się'):
			return pos+"-"+base
		if base in ('niech', 'niechaj', u'niechże', u'niechajże'):
			return pos+"-"+'niech'
		if base in ('czy', u'czyż', u'czyżby', 'azali', u'azaliż', 'li'):
			return pos+"-"+'czy'
		return pos
	if (pos, base) in (("num", 'tyle'), ("padj", 'taki'), ("padv", 'tak'), ("psubst", 'to')):
		return pos+"-"+base
	# end of entries which have *only* lexicalised POS macros

	if pos == "adv":
		if tag_count != 1:
			return pos
		if base == 'bardzo':
			return pos+"-"+base
		return "adv-deg"
	if pos == "ppron12":
		if tag_count > 4:
			return "ppron-acc"
		return "ppron"
	if pos == "ppron3":
		# the macro depends on how many tag parts follow the POS
		return {4: "ppron", 5: "ppron-acc", 6: "ppron-pprep"}.get(tag_count, pos)
	if pos == "prep":
		if tag_count > 1:
			return "prep-voc"
		return pos
	if pos == "praet":
		if tag_count > 3:
			return "praet-aglt"
		return pos
	# valence of nouns and adjectives
	if pos in ("subst", "adj"):
		if walencyjnosc(base, elements) != []:
			return pos+"-core"
		return pos
	if pos in ("conj", "preconj"):
		return conjmacro(base, pos)
	if pos == "comp":
		return compmacro(base, compdict)
	return pos

def conjmacro(base, pos):
	"""Return the macro name ``<pos>-<suffix>`` for a conjunction or preconjunction.

	The suffix is the lemma itself, the representative of its group
	('ani', 'a', 'i'), or 'unknown' when the lemma is not listed for *pos*.
	"""
	if base in ('albo', u'bądź', 'i', 'lub'):
		suffix = base
	elif base in ('ani', 'ni'):
		suffix = 'ani'
	elif pos == 'conj' and base in ('a', 'ale', 'lecz'):
		suffix = 'a'
	elif pos == 'conj' and base in ('czy', 'czyli', 'mianowicie', 'natomiast', 'jak', u'zaś'):
		suffix = base
	elif pos == 'conj' and base == 'oraz':
		suffix = 'i'
	elif pos == 'preconj' and base in ('tak', u'zarówno', 'nie'):
		suffix = base
	else:
		suffix = 'unknown'
	return pos+'-'+suffix

def compmacro(base, dct):
	"""Return the macro name for complementiser *base*.

	dct maps complementisers to their class.  A known complementiser gives
	``comp-<class>``; an unknown one gives ``comp <base>`` (with a space, so
	that the lemma becomes an argument of the generic macro).
	"""
	# `in` replaces the deprecated dict.has_key()
	if base in dct:
		return 'comp-'+dct[base]
	return 'comp '+base

def makeHead(macro):
	"""Return the opening of a macro call: ``@(`` plus *macro* with its first word uppercased.

	Words are separated by single spaces in the result; the closing
	parenthesis is not added here.
	"""
	words = macro.split()
	return "@(" + " ".join([words[0].upper()] + words[1:])

def macro(interp):
	"""Build the complete POS macro call ``@(NAME [lemma] [tag parts...])`` for *interp*.

	interp -- sequence (form, lemma, tag); only the lemma and the tag are used

	The lemma is left out for POS tags listed in ``valbase``, for conj, comp
	and preconj, and for the qub lemmas listed below.
	"""
	base = interp[1]
	elements = trsl(interp[2])
	pos = elements[0]
	pieces = [makeHead(choosemacro(elements, base))]
	if pos not in valbase:
		if pos == 'qub':
			with_base = base not in ['nie', 'by', u'się', 'niech', 'niechaj', u'niechże', u'niechajże', 'czy', u'czyż', u'czyżby', 'azali', u'azaliż', 'li']
		else:
			with_base = pos not in ["conj", "comp", "preconj"]
		if with_base:
			pieces.append(base)
	# the translated tag parts (everything after the POS) become macro arguments
	pieces.extend(elements[1:])
	return " ".join(pieces)+")"

def cat(interp):
	"""Return the category label for *interp* (form, lemma, tag).

	The label is the uppercased POS, except for the qub lemmas listed below,
	which get their own labels.
	"""
	base = interp[1]
	postag = interp[2].split(':')[0]
	if postag == 'qub':
		special = (
			('NEG', ('nie',)),
			('RM', (u'się',)),
			('MM', ('niech', u'niechże', 'niechaj', u'niechajże', 'by')),
			('QUB[int]', ('czy', u'czyż', u'czyżby', 'azali', u'azaliż', 'li')),
		)
		for label, lemmas in special:
			if base in lemmas:
				return label
	return postag.upper()

def gfjpcat(dct):
	"""Adjust the interpretations in *dct* in place (mostly conj/comp/preconj).

	dct -- maps a key to a list of interpretations [form, lemma, tag]

	For some lemmas the tag of an interpretation is replaced, for others an
	extra interpretation [key, lemma, new_tag] is appended to the list.
	Nothing is returned.
	"""
	for key in dct:
		interps = dct[key]
		# iterate over a snapshot: new interpretations are appended to the
		# list itself, which must not be iterated while it grows
		for interp in list(interps):
			lemma = interp[1]
			if lemma in [u'dopóty', u'póty', 'jednak', 'to', 'przeto', u'więc', 'zatem', u'toteż']:
				if interp[2] == 'conj':
					interp[2] = 'comp'
			if lemma in [u'wówczas', 'wtedy']:
				if interp[2] == 'adv':
					interps.append([key, lemma, 'comp'])
			if lemma == 'ten':
				if interp[2].split(':')[0] == 'adj':
					# adding the PADJ interp (WORKAROUND)
					interps.append([key, lemma, interp[2].replace('adj', 'padj')])
			if interp[2] == 'conj':
				if lemma in ['albo', u'bądź', 'i', 'lub', 'ani', 'ni']:
					# adding the PRECONJ interp
					interps.append([key, lemma, 'preconj'])
				if lemma in ['tak', u'zarówno', 'nie']:
					# replacing the CONJ interp with PRECONJ
					interp[2] = 'preconj'
	
def zapisz_slownik(plik_slownika, dct, razem_z_altem = True):
	"""Write the lexicon built from *dct* to *plik_slownika*.

	plik_slownika -- path of the main lexicon file (overwritten)
	dct           -- maps a key to a non-empty list of interpretations
	                 [form, lemma, tag]; it is first adjusted in place by gfjpcat()
	razem_z_altem -- when true, entries whose token is listed in ``wyjatki``
	                 are written to the LEX_CORE_ALT file; when false those
	                 entries are not written at all (see saveEntries)

	Side effects on module state: forms tagged "interp" are collected in
	``punct``, abbreviation forms missing from ``brevdict`` are counted in
	``oov_brev``.
	"""
	leksykon = open(plik_slownika, 'w+')
	# stays None when no alternative lexicon is requested, so the name is
	# always bound when it is handed to saveEntries
	leksykon_alt = None
	try:
		leksykon.write(NAGLOWEK_SLOWNIKA + "\n\n")
		if razem_z_altem:
			leksykon_alt = open(LEX_CORE_ALT, 'w+')
			leksykon_alt.write(NAGLOWEK_SLOWNIKA + "\n\n")

		# replacing/adding some dictionary entries (mostly conj and comp)
		gfjpcat(dct)
		for slowo in dct.itervalues():
			token = slowo[0][0]
			wpisy = []
			# entries of abbreviations written with a full stop go under a separate headword
			skrotykropkowe = []
			haslozkropka = ""
			for interp in slowo:
				if interp[2] == "interp":
					# attention, check
					if interp[0] not in punct:
						punct.append(interp[0])
					continue
				if interp[1] is None:
					continue
				tagparts = interp[2].split(':')
				if tagparts[0] != 'brev':
					makeEntry(interp, wpisy)
					continue
				brevform = stopOrNot(token, tagparts[1])
				if brevform == token:
					dest = wpisy
				else:
					dest = skrotykropkowe
					haslozkropka = brevform
				if brevform not in brevdict:
					brevform = brevform.lower()
				# added: some brev entries are missing (such as a. = albo), these won't be included
				if brevform in brevdict:
					fullform = brevdict[brevform][0]
					expandedtags = brevdict[brevform][1]
					for exptag in expandedtags:
						makeEntry([brevform, fullform, exptag], dest)
				else:
					# some abbreviations have no entry: record them!
					oov_brev[brevform] = oov_brev.get(brevform, 0) + 1
			if len(wpisy) > 0:
				saveEntries(token, wpisy, leksykon, leksykon_alt, razem_z_altem)
			if len(skrotykropkowe) > 0:
				saveEntries(haslozkropka, skrotykropkowe, leksykon, leksykon_alt, razem_z_altem)

		leksykon.write("----\n\n")
		if leksykon_alt is not None:
			leksykon_alt.write("----\n\n")
	finally:
		# close both files even when building an entry fails
		leksykon.close()
		if leksykon_alt is not None:
			leksykon_alt.close()

def formatVal(vallist):
	"""Join several valence entries into one disjunction: ``{ a`` / ``|`` / ``b }``.

	The entries are separated by a line containing only ``|``.
	"""
	return "{ " + "\n|\n".join(vallist) + " }"

def cutOgonki(text):
	"""Replace the lowercase Polish letters with diacritics by their plain ASCII letters.

	Uppercase letters are left as they are.
	"""
	pairs = (
		(u'ą', 'a'), (u'ć', 'c'), (u'ę', 'e'), (u'ł', 'l'), (u'ń', 'n'),
		(u'ó', 'o'), (u'ś', 's'), (u'ż', 'z'), (u'ź', 'z'),
	)
	for accented, plain in pairs:
		text = text.replace(accented, plain)
	return text

# some items which *only* have lexicalised macros are put in choosemacro (qub, adv, conj, preconj, comp)
def lexicalised_macro(base, pos):
    """Return *base* when the lemma has its own lexicalised template for *pos*, else None."""
    lexicalised = {
        # removed jakiś and ten (no defined templates)
        'padj': ('czyj', 'jaki', u'który'),
        # added którędy, jak (there are defined templates)
        'padv': ('czemu', 'dlaczego', u'dokąd', 'gdzie', 'kiedy', 'nigdy', u'odkąd', u'skąd', u'którędy', 'jak'),
        # removed coś, ktoś (no defined templates)
        'psubst': ('co', 'cokolwiek', 'kto', 'ktokolwiek', 'nic', 'nikt'),
        'num': ('ile',),
        'prep': ('bez',),
    }
    if base in lexicalised.get(pos, ()):
        return base
    return None

def makeEntry(interp, wpisy):
	"""Build the lexicon entry text for *interp* and append it to *wpisy*.

	interp -- sequence (form, lemma, tag)
	wpisy  -- list of entry strings collected for one headword; modified in place

	An entry is ``CATEGORY * [valence] POS-macro [lexicalised macro]``.
	When valence entries exist and the POS macro is a ``-CORE`` one, a reduced
	entry without valence (and without ``-CORE``) is added as well.  Some
	forms of the lemma "być" additionally get an AUX entry.  A warning is
	written to ``log`` when a POS from ``valbase`` has no valence entries.
	"""
	tagparts = interp[2].split(':')
	pos = tagparts[0]
	prefix = cat(interp) + " * "
	# to avoid problems with invisible entries
	posmakro = macro(interp).replace("'", "`'")
	walencje = sorted(set(walencyjnosc(interp[1], [pos, tagparts[1:]])))

	if pos in valbase and not walencje:
		warning = 'no (appropriate) valence macros: '+"\t".join(interp)
		log.write(warning.encode("utf-8")+"\n")

	wpis = prefix
	if walencje:
		# handles reduced entries of nouns and adjectives with valence entries
		if re.search(".*-CORE", posmakro):
			reduced = prefix + posmakro.replace("-CORE", "")
			if reduced not in wpisy:
				wpisy.append(reduced)
		if len(walencje) > 1:
			wpis += formatVal(walencje)
		else:
			wpis += walencje[0]
		wpis += "\n\t\t"
	wpis += posmakro

	# adding lexicalised templates (if any)
	lexicalised = lexicalised_macro(interp[1], pos)
	if lexicalised:
		wpis += "\n\t\t" + "@("+pos.upper()+"-"+lexicalised.upper()+")"

	# to avoid repeated entries
	if wpis not in wpisy:
		wpisy.append(wpis)

	# for AUX entries (CHECK: number, POS for BEDZIE)
	if interp[1] != u'być':
		return
	if pos in ("bedzie", "inf"):
		wpisy.append("AUX * " + posmakro)
	elif pos in ("fin", "praet"):
		# all forms, because TO uses agreeing AUX (ludzie to sa wilki, panstwo to jestem ja)
		wpisy.append("AUX * " + posmakro.replace("@(FIN", "@(FIN-AUX").replace("@(PRAET", "@(PRAET-AUX"))

def saveEntries(token, wpisy, leksykon, leksykon_alt, razem_z_altem):
	"""Write all entries of one headword as a single lexicon record.

	token          -- headword
	wpisy          -- list of entry strings for the headword
	leksykon       -- open file of the main lexicon
	leksykon_alt   -- open file of the alternative lexicon; only used when
	                  razem_z_altem is true
	razem_z_altem  -- whether the alternative lexicon is being written

	The record is ``token<TAB>entry;<NL><TAB>entry...`` terminated by a full
	stop and an empty line, encoded as UTF-8.  Headwords listed in
	``wyjatki`` go to the alternative lexicon (or nowhere when it is not
	being written), all others to the main lexicon.  Nothing is written for
	an empty *wpisy*.
	"""
	if not wpisy:
		return
	# headwords with more than 52 entries are remembered in segf
	# (presumably because such records crash XLE -- TODO confirm)
	if len(wpisy) > 52 and token not in segf:
		segf.append(token)
	# to avoid problems with invisible entries
	token = token.replace("'", "`'")
	caly = token+"\t"+";\n\t".join(wpisy)
	# to avoid problems with floating point in lemma
	caly = caly.replace(",", "`,")
	caly = caly.replace(".", "`.")
	caly += ".\n\n"
	if token in wyjatki:
		if razem_z_altem:
			leksykon_alt.write(caly.encode("utf-8"))
	else:
		leksykon.write(caly.encode("utf-8"))
