Locked History Actions

attachment:forest2tiger.py of Składnica

Attachment 'forest2tiger.py'

Download

   1 #!/usr/bin/python -tt
   2 # coding: utf-8
   3 
   4 '''
   5 
   6 This script converts treebanks from the Skladnica format to TigerXML format.
   7 It doesn't preserve all data since the trees representing semantically
   8 incorrect sentence structures are omitted.
   9 
  10 Usage:
  11 
  12   python forest2tiger.py <treebank_directory> [output_file.xml]
  13 
  14 The default output file name is tigerxmloutput.xml
  15 
  16 @author: Lukasz Kozuchowski
  17 '''
  18 
  19 import sys
  20 import os
  21 #import datetime
  22 import lxml.etree as ET
  23 
  24 
  25 
  26 def usage():
  27     '''Print help and exit program.'''
  28     print '''usage: python forest2tiger.py <treebank_directory> [<outputfilename>]
  29 The default output file name is tigerxmloutput.xml'''
  30     exit(0)
  31 
  32 def abort(msg):
  33     '''Print error message and exit program.'''
  34     print msg
  35     print 'Aborting'
  36     exit(-1)
  37 
  38 features =  [ 'cat', 'tag', 'rodzaj', 'przypadek', 'rodzaj', 'liczba', 'osoba', 'rekcja', 'klasa', 'zap', 'poz', 'neg', 'dest', 'ink', 'orth', 'base', 'wyroznik', 'aspekt', 'czas', 'tryb', 'stopien', 'przyim', 'tfw', 'typ', 'ozn', 'tfz', 'kor', 'akom', 'typc', 'typn', 'kszt' ]
  39     
  40 #random_values_feats = ['orth', 'base', 'tag']
  41 terminal_features = set(['orth', 'base', 'tag'])
  42 
  43 
  44 def skladnica2tigersentence(doctree,unique_id):
  45     '''Convert a Skladnica document tree to a TigerXML sentence.'''
  46     
  47     def new_feat_name(fname):
  48         ''' A hack - we need attribute names in simple ASCII. '''
  49         if fname == u'wyróżnik':
  50             return 'wyroznik'
  51         elif fname == u'stopień':
  52             return 'stopien'
  53 
  54         return fname
  55 
  56    
  57     s = ET.XML('<s id="'+unique_id+'"></s>')
  58     g = ET.XML('<graph root="'+unique_id+'_0"></graph>') #0 is the nid for root in Skladnica
  59     s.append(g)
  60     terminals = ET.XML('<terminals></terminals>')
  61     g.append(terminals)
  62 
  63     
  64     terminals_list = list(doctree.getiterator('terminal'))
  65     # we need to sort terminals by the from attribute (location in sentence)
  66     terminals_list.sort(key=lambda x: int(x.getparent().attrib['from']))
  67     
  68     for el in terminals_list:
  69         orth = el.find('orth').text
  70         orth = orth.replace('"', '&quot;')
  71         base = el.find('base').text
  72         if base == None: # it happens
  73             base = ' '
  74         base = base.replace('"', '&quot;')
  75         tag = el.find('f[@type="tag"]').text
  76 
  77         terminal_nid = el.getparent().attrib['nid']
  78 
  79         try:
  80             terminals.append(ET.XML('<t id="'+unique_id+'_'+terminal_nid+'" orth="'+orth+'" base="'+base+'" tag="'+tag+'" />'))
  81         except:
  82             print '**** unique_id: ',unique_id,' orth= ', orth,  ' tag: ', tag, ' base: ', base, ' ****'
  83             raise
  84 
  85 
  86     nonterminals = ET.XML('<nonterminals></nonterminals>')
  87     g.append(nonterminals)
  88 
  89     for el in doctree.getiterator('nonterminal'):
  90 
  91         children = el.getparent().find('children')
  92         nonterminal_nid = el.getparent().attrib['nid']
  93         
  94         new_nt = ET.XML('<nt id="'+unique_id+'_'+nonterminal_nid+'"></nt>')
  95 
  96         # "--" stands for "None"/"Blank"
  97         for f in features:
  98             if f not in terminal_features:
  99                 new_nt.attrib[f] = '--'
 100 
 101         for f in el.findall('f'):            
 102             if f.text == '_': f.text = '--' # "--" stands for "None"/"Blank"
 103             new_nt.attrib[new_feat_name(f.attrib['type'])] = f.text
 104 
 105         cat = el.find('category').text
 106         new_nt.attrib['cat'] = cat
 107 
 108         for c in children.findall('child'):
 109             new_nt.append(ET.XML('<edge label="'+
 110                          ('HD' if c.attrib and c.attrib['head'] == 'true'
 111                           else '--')
 112                          +'" idref="'+unique_id+'_'+c.attrib['nid']+'" />'))
 113 
 114         nonterminals.append(new_nt)
 115 
 116     return s
 117 
 118 
 119 
 120 
 121 
 122 
 123 def main(argv):
 124 
 125     def not_chosen(el):
 126         '''Check if Skladnica tree node was unchosen.'''
 127 
 128         return (el.tag == 'children' and not ('chosen' in el.attrib)) or (el.tag == 'node' and el.attrib['chosen'] != 'true')
 129 
 130 
 131     def tigerXML_stub():
 132         '''Create a TigerXML stub.'''
 133         stub = ET.XML('<corpus id="Skladnica-frazowa"></corpus>')
 134 
 135         try:
 136             inputfile = open('header.xml', 'r')
 137         except:
 138             print 'Error reading header.xml'
 139             raise
 140 
 141         try:
 142             header = ET.XML(inputfile.read())
 143         except:
 144             print 'Error parsing header.xml'
 145             raise
 146 
 147         stub.append(header)
 148         stub.append(ET.XML('<body/>'))
 149 
 150         return stub
 151 
 152 
 153 
 154 
 155     if not 2 <= len(argv) <= 3:
 156         usage()
 157 
 158     treebank_dir = argv[1]
 159     outputfilename = argv[2] if len(argv) == 3 else 'tigerxmloutput.xml'
 160 
 161 
 162     if not os.path.isdir(treebank_dir):
 163         usage()
 164 
 165     print 'Converting ', treebank_dir, 'to ', outputfilename
 166 
 167     try:
 168         outputfile = open(outputfilename, 'w')
 169     except:
 170         abort('Error opening output file: ' + outputfilename)
 171 
 172     
 173     new_tree = tigerXML_stub()
 174     body = new_tree.find('body')
 175 
 176     for root, dirs, files in os.walk(treebank_dir):
 177 
 178         for f in files:
 179             filename = os.path.join(root, f)
 180 
 181             try:
 182                 inputfile = open(filename, 'r')
 183             except:
 184                 abort('Error reading file ' + filename)
 185 
 186             try:
 187                 doctree = ET.parse(inputfile)
 188             except:
 189                 abort('Error parsing file '+ filename)
 190 
 191             try:
 192                 valid_forest =  doctree.find('answer-data').find('base-answer').attrib['type']
 193             except:
 194                 abort('Error: base-answer type not found in ' + filename)
 195   
 196             if valid_forest != 'FULL':
 197                 print 'Skipping ', filename ,' (base-answer type != FULL)'
 198                 continue
 199 
 200             print 'Processing ', filename
 201   
 202             # removing unchosen nodes
 203             for el in list(doctree.getiterator()):
 204                 if not_chosen(el):
 205                     parent = el.getparent()
 206                     parent.remove(el)
 207 
 208             treebank_path = os.path.abspath(treebank_dir)
 209             unique_sentence_name = ''.join(os.path.relpath(root, treebank_path).split('/'))+'_'+f            
 210 
 211             try:
 212                 new_sentence = skladnica2tigersentence(doctree, unique_sentence_name)
 213             except:
 214                 print 'Error processing ', filename
 215                 print >> sys.stderr, 'Error processing ', filename
 216                 raise
 217 
 218             body.append(new_sentence)
 219 
 220     
 221 
 222     try:
 223         outputfile.write(ET.tostring(new_tree, pretty_print=True))    
 224     except:
 225         abort('Problem writing to ' + outputfilename)
 226 
 227     print '\nConverting finished successfully. Result saved as ', outputfilename
 228 
 229 
 230 
# Script entry point: when this file is executed directly (rather than
# imported), run main() with the full command-line argument list.
if __name__ == '__main__':
    main(sys.argv)

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
  • [get | view] (2014-12-29 14:14:52, 45023.3 KB) [[attachment:Dendrarium.tar.gz]]
  • [get | view] (2014-12-29 14:14:50, 110423.0 KB) [[attachment:Składnica-frazowa-0.5+TigerSearch.iso.bz2]]
  • [get | view] (2014-12-29 14:14:50, 5508.1 KB) [[attachment:Składnica-frazowa-0.5-TigerXML.xml.gz]]
  • [get | view] (2014-12-29 14:14:54, 43688.2 KB) [[attachment:Składnica-frazowa-0.5.tar.bz2]]
  • [get | view] (2014-12-29 14:14:53, 23842.6 KB) [[attachment:Składnica-frazowa-130718.tar.bz2]]
  • [get | view] (2014-12-29 14:14:53, 30879.6 KB) [[attachment:Składnica-frazowa-140813.tar.bz2]]
  • [get | view] (2015-03-26 17:21:23, 31221.4 KB) [[attachment:Składnica-frazowa-150326.tar.bz2]]
  • [get | view] (2015-09-17 15:01:37, 31221.5 KB) [[attachment:Składnica-frazowa-150917.tar.bz2]]
  • [get | view] (2015-11-20 19:11:46, 31224.1 KB) [[attachment:Składnica-frazowa-151120.tar.bz2]]
  • [get | view] (2016-08-02 17:37:52, 31226.4 KB) [[attachment:Składnica-frazowa-160802.tar.bz2]]
  • [get | view] (2016-09-22 14:09:19, 31448.0 KB) [[attachment:Składnica-frazowa-160922.tar.bz2]]
  • [get | view] (2016-12-15 11:34:01, 30608.9 KB) [[attachment:Składnica-frazowa-161214.tar.bz2]]
  • [get | view] (2017-09-25 14:53:38, 31719.6 KB) [[attachment:Składnica-frazowa-170921.tar.bz2]]
  • [get | view] (2017-09-28 16:52:09, 32046.4 KB) [[attachment:Składnica-frazowa-170928.tar.bz2]]
  • [get | view] (2017-10-03 19:02:59, 32252.6 KB) [[attachment:Składnica-frazowa-171003.tar.bz2]]
  • [get | view] (2017-11-06 18:07:10, 34011.2 KB) [[attachment:Składnica-frazowa-171106 (11k).tar.bz2]]
  • [get | view] (2017-12-20 15:15:30, 34754.4 KB) [[attachment:Składnica-frazowa-171220 (11500).tar.bz2]]
  • [get | view] (2018-07-23 14:56:19, 90031.6 KB) [[attachment:Składnica-frazowa-180723.tar.gz]]
  • [get | view] (2019-03-07 15:28:11, 35288.1 KB) [[attachment:Składnica-frazowa-190307.tar.bz2]]
  • [get | view] (2019-04-15 13:49:17, 35696.3 KB) [[attachment:Składnica-frazowa-190415.tar.bz2]]
  • [get | view] (2020-03-19 18:50:18, 35701.0 KB) [[attachment:Składnica-frazowa-200319.tar.bz2]]
  • [get | view] (2023-07-27 10:57:37, 49177.1 KB) [[attachment:Składnica-frazowa-230724.tar.bz2]]
  • [get | view] (2014-12-29 14:14:53, 4.7 KB) [[attachment:Składnica-frazowa.xsd]]
  • [get | view] (2014-12-29 14:14:52, 757.5 KB) [[attachment:Składnica-zależnościowa-0.5.conll.gz]]
  • [get | view] (2014-12-29 14:14:46, 29.1 KB) [[attachment:Słownik-walencyjny.txt.gz]]
  • [get | view] (2014-12-29 14:14:52, 75.7 KB) [[attachment:Wyszukiwarka-drzew-sieciowa.tar.gz]]
  • [get | view] (2014-12-29 14:14:52, 11832.5 KB) [[attachment:automatic_pos_morph_v2.zip]]
  • [get | view] (2014-12-29 14:14:53, 6.6 KB) [[attachment:forest2tiger.py]]
  • [get | view] (2014-12-29 14:14:50, 455.7 KB) [[attachment:Świgra_1.5.zip]]
 All files | Selected Files: delete move to page

You are not allowed to attach a file to this page.