Attachment 'forest2tiger.py'
Download 1 #!/usr/bin/python -tt
2 # coding: utf-8
3
4 '''
5
6 This script converts treebanks from the Skladnica format to TigerXML format.
7 It doesn't preserve all data since the trees representing semantically
8 incorrect sentence structures are omitted.
9
10 Usage:
11
12 python forest2tiger.py <treebank_directory> [output_file.xml]
13
14 The default output file name is tigerxmloutput.xml
15
16 @author: Lukasz Kozuchowski
17 '''
18
19 import sys
20 import os
21 #import datetime
22 import lxml.etree as ET
23
24
25
def usage():
    '''Print help and exit program.

    Terminates the interpreter with exit status 0 by raising SystemExit.
    '''
    print('usage: python forest2tiger.py <treebank_directory> [<outputfilename>]\n'
          'The default output file name is tigerxmloutput.xml')
    # sys.exit instead of the bare ``exit`` helper: the latter is injected by
    # the ``site`` module and is not guaranteed to exist in every interpreter.
    sys.exit(0)
31
def abort(msg):
    '''Print error message and exit program.

    msg -- text describing the failure; printed followed by 'Aborting'.

    Terminates the interpreter with exit status -1 by raising SystemExit.
    '''
    print(msg)
    print('Aborting')
    # sys.exit instead of the bare ``exit`` helper: the latter is injected by
    # the ``site`` module and is not guaranteed to exist in every interpreter.
    sys.exit(-1)
37
# Attribute names given to every <nt> element, in the order in which they are
# written.  Each name is listed exactly once.
features = [
    'cat', 'tag', 'rodzaj', 'przypadek', 'liczba', 'osoba', 'rekcja',
    'klasa', 'zap', 'poz', 'neg', 'dest', 'ink', 'orth', 'base',
    'wyroznik', 'aspekt', 'czas', 'tryb', 'stopien', 'przyim', 'tfw',
    'typ', 'ozn', 'tfz', 'kor', 'akom', 'typc', 'typn', 'kszt',
]

# Features that are written on terminals (<t>) only; they are skipped when
# the default "--" attributes of a nonterminal are filled in.
terminal_features = set(['orth', 'base', 'tag'])
42
43
def skladnica2tigersentence(doctree, unique_id):
    '''Convert a Skladnica document tree to a TigerXML sentence.

    doctree   -- parsed Skladnica document (lxml tree); it is only read.
    unique_id -- string used as the sentence id and as the prefix of every
                 terminal / nonterminal id ("<unique_id>_<nid>").

    Returns the new <s> element containing <graph>, <terminals> and
    <nonterminals>.

    Elements are built through the lxml element API rather than by pasting
    values into XML source text, so attribute values containing characters
    such as '"', '&' or '<' are escaped by the serializer.

    Any exception raised while building a terminal is re-raised after a
    diagnostic line has been printed.
    '''

    # A hack - we need attribute names in simple ASCII.
    ascii_feature_names = {
        u'wyróżnik': 'wyroznik',
        u'stopień': 'stopien',
    }

    def new_feat_name(fname):
        '''Return the ASCII attribute name used for feature *fname*.'''
        return ascii_feature_names.get(fname, fname)

    s = ET.Element('s')
    s.set('id', unique_id)
    g = ET.SubElement(s, 'graph')
    g.set('root', unique_id + '_0')  # 0 is the nid for root in Skladnica
    terminals = ET.SubElement(g, 'terminals')

    # we need to sort terminals by the from attribute (location in sentence)
    terminals_list = sorted(
        doctree.iter('terminal'),
        key=lambda term: int(term.getparent().attrib['from']))

    for el in terminals_list:
        orth = el.find('orth').text
        base = el.find('base').text
        if base is None:  # it happens
            base = ' '
        tag = el.find('f[@type="tag"]').text

        terminal_nid = el.getparent().attrib['nid']

        try:
            new_t = ET.Element('t')
            new_t.set('id', unique_id + '_' + terminal_nid)
            new_t.set('orth', orth)
            new_t.set('base', base)
            new_t.set('tag', tag)
        except Exception:
            # Show the offending values, then let the caller see the error.
            print('**** unique_id:  %s  orth=  %s  tag:  %s  base:  %s  ****'
                  % (unique_id, orth, tag, base))
            raise
        terminals.append(new_t)

    nonterminals = ET.SubElement(g, 'nonterminals')

    for el in doctree.iter('nonterminal'):
        node = el.getparent()
        children = node.find('children')

        new_nt = ET.Element('nt')
        new_nt.set('id', unique_id + '_' + node.attrib['nid'])

        # "--" stands for "None"/"Blank"
        for f in features:
            if f not in terminal_features:
                new_nt.set(f, '--')

        for f in el.findall('f'):
            # "_" in the source means blank; the source tree is left untouched.
            value = '--' if f.text == '_' else f.text
            new_nt.set(new_feat_name(f.attrib['type']), value)

        new_nt.set('cat', el.find('category').text)

        for c in children.findall('child'):
            edge = ET.SubElement(new_nt, 'edge')
            # A child without a "head" attribute is simply not the head.
            edge.set('label', 'HD' if c.get('head') == 'true' else '--')
            edge.set('idref', unique_id + '_' + c.attrib['nid'])

        nonterminals.append(new_nt)

    return s
117
118
119
120
121
122
def _not_chosen(el):
    '''Check if Skladnica tree node was unchosen.

    A <children> element is unchosen when it has no "chosen" attribute; a
    <node> element is unchosen when its "chosen" attribute is missing or is
    not "true".  Every other element counts as chosen.
    '''
    if el.tag == 'children':
        return 'chosen' not in el.attrib
    if el.tag == 'node':
        return el.get('chosen') != 'true'
    return False


def _tigerxml_stub():
    '''Create a TigerXML stub.

    Returns a <corpus> element holding the root element of header.xml (read
    from the current working directory) followed by an empty <body>.
    Read and parse errors are reported on stdout and re-raised.
    '''
    stub = ET.Element('corpus')
    stub.set('id', 'Skladnica-frazowa')

    try:
        with open('header.xml', 'rb') as headerfile:
            raw_header = headerfile.read()
    except (IOError, OSError):
        print('Error reading header.xml')
        raise

    try:
        header = ET.XML(raw_header)
    except ET.XMLSyntaxError:
        print('Error parsing header.xml')
        raise

    stub.append(header)
    ET.SubElement(stub, 'body')

    return stub


def _parse_forest(filename):
    '''Parse one treebank file and return its tree; abort() on failure.'''
    try:
        inputfile = open(filename, 'rb')
    except (IOError, OSError):
        abort('Error reading file ' + filename)

    try:
        return ET.parse(inputfile)
    except (IOError, OSError, ET.XMLSyntaxError):
        abort('Error parsing file ' + filename)
    finally:
        inputfile.close()


def _prune_unchosen(doctree):
    '''Remove every unchosen <node> / <children> element from *doctree*.'''
    # Take a snapshot of the elements before the tree is modified.
    for el in list(doctree.iter()):
        if _not_chosen(el):
            parent = el.getparent()
            if parent is not None:  # the root element cannot be detached
                parent.remove(el)


def _convert_forest_file(filename, sentence_id):
    '''Convert one treebank file to a TigerXML <s> element.

    Returns None (after printing a notice) when the file's base-answer type
    is not "FULL".  Aborts the program when the base-answer type cannot be
    found; re-raises any conversion error after reporting the file name on
    stdout and stderr.
    '''
    doctree = _parse_forest(filename)

    try:
        valid_forest = doctree.find('answer-data').find('base-answer').attrib['type']
    except (AttributeError, KeyError):
        abort('Error: base-answer type not found in ' + filename)

    if valid_forest != 'FULL':
        print('Skipping  %s  (base-answer type != FULL)' % filename)
        return None

    print('Processing  %s' % filename)

    # removing unchosen nodes
    _prune_unchosen(doctree)

    try:
        return skladnica2tigersentence(doctree, sentence_id)
    except Exception:
        message = 'Error processing  %s' % filename
        print(message)
        sys.stderr.write(message + '\n')
        raise


def main(argv):
    '''Convert a Skladnica treebank directory to a single TigerXML file.

    argv -- command line: [program, treebank_directory, optional output file].
            The output file defaults to 'tigerxmloutput.xml'.

    Prints the usage text and exits when the argument count is wrong or the
    treebank directory does not exist.  Every file found under the directory
    is converted and appended to the <body> of the corpus, which is then
    written to the output file.
    '''
    if not 2 <= len(argv) <= 3:
        usage()

    treebank_dir = argv[1]
    outputfilename = argv[2] if len(argv) == 3 else 'tigerxmloutput.xml'

    if not os.path.isdir(treebank_dir):
        usage()

    print('Converting  %s to  %s' % (treebank_dir, outputfilename))

    # Opened before any conversion work so an unwritable target fails early.
    try:
        outputfile = open(outputfilename, 'wb')
    except (IOError, OSError):
        abort('Error opening output file: ' + outputfilename)

    try:
        new_tree = _tigerxml_stub()
        body = new_tree.find('body')
        treebank_path = os.path.abspath(treebank_dir)

        for root, dirs, files in os.walk(treebank_dir):
            # os.walk lists entries in arbitrary order; sorting makes the
            # order of sentences in the output reproducible.
            dirs.sort()

            # Sentence id: directory path relative to the treebank with the
            # separators removed, then "_" and the file name.
            id_prefix = os.path.relpath(root, treebank_path).replace(os.sep, '')

            for f in sorted(files):
                filename = os.path.join(root, f)
                new_sentence = _convert_forest_file(filename, id_prefix + '_' + f)
                if new_sentence is not None:
                    body.append(new_sentence)

        try:
            outputfile.write(ET.tostring(new_tree, pretty_print=True))
        except (IOError, OSError):
            abort('Problem writing to ' + outputfilename)
    finally:
        outputfile.close()

    print('\nConverting finished successfully. Result saved as  %s' % outputfilename)
228
229
230
# Run the conversion only when this file is executed as a script; importing
# it as a module has no side effects.
if __name__ == "__main__":
    main(sys.argv)
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily. You are not allowed to attach a file to this page.