#!/usr/bin/python -tt
# coding: utf-8

'''

This script converts treebanks from the Skladnica format to TigerXML format.
It doesn't preserve all data since the trees representing semantically
incorrect sentence structures are omitted.

Usage:

  python forest2tiger.py <treebank_directory> [output_file.xml]

The default output file name is tigerxmloutput.xml

@author: Lukasz Kozuchowski
'''

import sys
import os
#import datetime
import lxml.etree as ET



def usage():
    '''Print the command-line help text and terminate the program.

    Raises:
        SystemExit: always, with exit status 0.
    '''
    # Single-argument print(...) is valid under both Python 2 and Python 3.
    print('''usage: python forest2tiger.py <treebank_directory> [<outputfilename>]
The default output file name is tigerxmloutput.xml''')
    # sys.exit instead of the exit() helper: exit() is injected by the site
    # module for interactive use and is not available under "python -S".
    sys.exit(0)

def abort(msg):
    '''Print an error message followed by "Aborting" and terminate.

    Args:
        msg: text describing the failure; printed on its own line.

    Raises:
        SystemExit: always, with exit code -1.
    '''
    # Single-argument print(...) is valid under both Python 2 and Python 3.
    print(msg)
    print('Aborting')
    # sys.exit instead of the exit() helper: exit() is injected by the site
    # module for interactive use and is not available under "python -S".
    sys.exit(-1)

# Attribute names that may appear on TigerXML nodes. The order matters:
# skladnica2tigersentence() pre-fills every non-terminal feature in this
# order, which fixes the attribute order of each <nt> element.
# ('rodzaj' used to be listed twice; the duplicate had no effect on output.)
features = [
    'cat', 'tag', 'rodzaj', 'przypadek', 'liczba', 'osoba', 'rekcja',
    'klasa', 'zap', 'poz', 'neg', 'dest', 'ink', 'orth', 'base', 'wyroznik',
    'aspekt', 'czas', 'tryb', 'stopien', 'przyim', 'tfw', 'typ', 'ozn',
    'tfz', 'kor', 'akom', 'typc', 'typn', 'kszt',
]

# Features that are written on terminal (<t>) nodes only and therefore are
# not pre-filled with "--" on non-terminal (<nt>) nodes.
terminal_features = set(['orth', 'base', 'tag'])


def skladnica2tigersentence(doctree, unique_id):
    '''Convert a Skladnica document tree to a TigerXML sentence.

    Args:
        doctree: parsed Skladnica document (lxml tree). Each <terminal> and
            <nonterminal> element must have a parent carrying a "nid"
            attribute; parents of terminals must also carry an integer
            "from" attribute, and parents of nonterminals a <children>
            element.
        unique_id: string used as the sentence id and as the prefix of every
            node id, which has the form "<unique_id>_<nid>".

    Returns:
        An <s> element holding one <graph> with a <terminals> and a
        <nonterminals> section.

    Raises:
        KeyError, AttributeError, TypeError, ValueError: when the input tree
            lacks the elements, attributes or text described above.
    '''

    # A hack - we need attribute names in simple ASCII.
    ascii_feat_names = {
        u'wyróżnik': 'wyroznik',
        u'stopień': 'stopien',
    }

    def new_feat_name(fname):
        '''Return the ASCII attribute name to use for feature fname.'''
        return ascii_feat_names.get(fname, fname)

    def node_id(nid):
        '''Return the sentence-unique TigerXML id for Skladnica node nid.'''
        return unique_id + '_' + nid

    # Elements are built through the element API rather than by pasting
    # values into XML source text, so characters such as &, < and " in
    # word forms or ids are escaped correctly on serialization.
    # Attributes are set one by one to keep their order deterministic.
    s = ET.Element('s')
    s.set('id', unique_id)
    g = ET.SubElement(s, 'graph')
    g.set('root', node_id('0'))  # 0 is the nid for root in Skladnica
    terminals = ET.SubElement(g, 'terminals')

    # we need to sort terminals by the from attribute (location in sentence)
    terminals_list = sorted(
        doctree.iter('terminal'),
        key=lambda x: int(x.getparent().attrib['from']))

    for el in terminals_list:
        orth = el.find('orth').text
        base = el.find('base').text
        if base is None:  # it happens
            base = ' '
        tag = el.find('f[@type="tag"]').text
        terminal_nid = el.getparent().attrib['nid']

        try:
            t = ET.Element('t')
            t.set('id', node_id(terminal_nid))
            t.set('orth', orth)
            t.set('base', base)
            t.set('tag', tag)
        except Exception:
            # %r keeps the diagnostic printable whatever the values contain,
            # so reporting cannot itself fail and hide the real error.
            print('**** unique_id: %r orth=%r tag=%r base=%r ****'
                  % (unique_id, orth, tag, base))
            raise
        terminals.append(t)

    nonterminals = ET.SubElement(g, 'nonterminals')

    for el in doctree.iter('nonterminal'):
        node = el.getparent()
        children = node.find('children')

        new_nt = ET.Element('nt')
        new_nt.set('id', node_id(node.attrib['nid']))

        # "--" stands for "None"/"Blank"
        for f in features:
            if f not in terminal_features:
                new_nt.set(f, '--')

        for f in el.findall('f'):
            # "_" in Skladnica becomes "--"; the input tree is not modified.
            value = '--' if f.text == '_' else f.text
            new_nt.set(new_feat_name(f.attrib['type']), value)

        new_nt.set('cat', el.find('category').text)

        for c in children.findall('child'):
            edge = ET.SubElement(new_nt, 'edge')
            # get() tolerates a child that has attributes but no "head".
            edge.set('label', 'HD' if c.get('head') == 'true' else '--')
            edge.set('idref', node_id(c.attrib['nid']))

        nonterminals.append(new_nt)

    return s






def main(argv):
    '''Convert a Skladnica treebank directory into one TigerXML file.

    Every file found under the treebank directory is parsed. Files whose
    base-answer type is not "FULL" are skipped. In the remaining ones the
    unchosen nodes are removed and the tree is converted with
    skladnica2tigersentence(). The TigerXML header is read from
    "header.xml" in the current working directory.

    Args:
        argv: command-line arguments, either
            [program, treebank_directory] or
            [program, treebank_directory, output_file].

    Raises:
        SystemExit: through usage() on bad arguments and through abort() on
            read, parse or write errors.
    '''

    def not_chosen(el):
        '''Check if Skladnica tree node was unchosen.'''
        if el.tag == 'children':
            return 'chosen' not in el.attrib
        if el.tag == 'node':
            # get() treats a node without the attribute as unchosen
            # instead of raising KeyError.
            return el.get('chosen') != 'true'
        return False

    def tigerXML_stub():
        '''Create a TigerXML stub: <corpus> with the header and empty body.'''
        stub = ET.Element('corpus')
        stub.set('id', 'Skladnica-frazowa')

        try:
            # Bytes are read so the parser honours the file's own
            # encoding declaration.
            with open('header.xml', 'rb') as headerfile:
                header_source = headerfile.read()
        except (IOError, OSError):
            print('Error reading header.xml')
            raise

        try:
            header = ET.XML(header_source)
        except ET.LxmlError:
            print('Error parsing header.xml')
            raise

        stub.append(header)
        ET.SubElement(stub, 'body')
        return stub

    def load_forest(filename):
        '''Parse one treebank file, aborting on read or parse failure.'''
        try:
            with open(filename, 'rb') as inputfile:
                try:
                    return ET.parse(inputfile)
                except ET.LxmlError:
                    abort('Error parsing file ' + filename)
        except (IOError, OSError):
            abort('Error reading file ' + filename)

    def base_answer_type(doctree, filename):
        '''Return the base-answer type of a forest, aborting if absent.'''
        try:
            return doctree.find('answer-data').find('base-answer').attrib['type']
        except (AttributeError, KeyError):
            abort('Error: base-answer type not found in ' + filename)

    def report(*parts):
        '''Print parts separated by single spaces.

        This reproduces the spacing of a Python 2 print statement with
        comma-separated items while staying valid Python 3.
        '''
        print(' '.join(parts))

    if not 2 <= len(argv) <= 3:
        usage()

    treebank_dir = argv[1]
    outputfilename = argv[2] if len(argv) == 3 else 'tigerxmloutput.xml'

    if not os.path.isdir(treebank_dir):
        usage()

    report('Converting ', treebank_dir, 'to ', outputfilename)

    # Opened before any processing so an unwritable destination is reported
    # immediately; binary mode because ET.tostring() returns bytes.
    try:
        outputfile = open(outputfilename, 'wb')
    except (IOError, OSError):
        abort('Error opening output file: ' + outputfilename)

    with outputfile:
        new_tree = tigerXML_stub()
        body = new_tree.find('body')
        treebank_path = os.path.abspath(treebank_dir)

        for root, dirs, files in os.walk(treebank_dir):
            # Sorting makes the traversal, and so the order of sentences in
            # the output, independent of the filesystem's listing order.
            dirs.sort()

            for f in sorted(files):
                filename = os.path.join(root, f)
                doctree = load_forest(filename)

                if base_answer_type(doctree, filename) != 'FULL':
                    report('Skipping ', filename, ' (base-answer type != FULL)')
                    continue

                report('Processing ', filename)

                # removing unchosen nodes; the iterator is materialized
                # first because the tree is modified while walking it
                for el in list(doctree.iter()):
                    if not_chosen(el):
                        el.getparent().remove(el)

                # Sentence id: relative directory with the path separators
                # dropped, then "_" and the file name.
                rel_dir = os.path.relpath(root, treebank_path)
                unique_sentence_name = rel_dir.replace(os.sep, '') + '_' + f

                try:
                    new_sentence = skladnica2tigersentence(doctree, unique_sentence_name)
                except Exception:
                    message = ' '.join(('Error processing ', filename))
                    print(message)
                    sys.stderr.write(message + '\n')
                    raise

                body.append(new_sentence)

        try:
            outputfile.write(ET.tostring(new_tree, pretty_print=True))
        except (IOError, OSError):
            abort('Problem writing to ' + outputfilename)

    report('\nConverting finished successfully. Result saved as ', outputfilename)


# Script entry point: convert only when this file is executed directly, not
# when it is imported. main() returns None, so the exit status stays 0.
if __name__ == '__main__':
    sys.exit(main(sys.argv))
