Attachment 'parse.py'
Download 1 import getopt
2 import glob
3 import os
4 import sys
5 import stanza
6 from stanza.resources.common import load_resources_json
7 from stanza.utils.conll import CoNLL
8
9
def parse(input_file, output_file):
    """Run the Stanza pipeline on a text file and write the result to disk.

    The function uses the module-level ``nlp`` pipeline object, so ``nlp``
    must already be assigned when this function is called.

    Args:
        input_file: Path of the text file whose whole content is parsed.
        output_file: Path handed to ``CoNLL.write_doc2conll`` for the output.
    """
    # Decode explicitly as UTF-8: the platform default encoding would garble
    # non-ASCII input (e.g. Polish diacritics) on systems whose locale is not
    # UTF-8.
    with open(input_file, encoding="utf-8") as f:
        doc = nlp(f.read())
    CoNLL.write_doc2conll(doc, output_file)
14
15
# Defaults; the language and the pretokenized flag can be overridden from the
# command line.
language = "pl"
use_pretokenized_text = False
options = "hpl:"
long_options = ["help", "pretokenized", "language="]
processors = "tokenize, pos, lemma, depparse"
usage = (
    f"Usage: {os.path.basename(sys.argv[0])} "
    "[-h|--help] [-p|--pretokenized] [-l|--language LANG] FILE_OR_PATTERN..."
)

# Only getopt.getopt() raises getopt.error, so keep the try body minimal.
try:
    opts, args = getopt.getopt(sys.argv[1:], options, long_options)
except getopt.error as err:
    print(f"Error: {str(err)}", file=sys.stderr)
    print(usage, file=sys.stderr)
    sys.exit(1)

for opt, arg in opts:
    if opt in ("-h", "--help"):
        # Show the usage line and stop before any model download happens.
        print(usage)
        sys.exit(0)
    elif opt in ("-p", "--pretokenized"):
        use_pretokenized_text = True
    elif opt in ("-l", "--language"):
        language = arg

# Every positional argument is treated as a glob pattern. The list is always
# initialised so that running without arguments cannot hit an unbound name.
input_files = []
for pattern in args:
    input_files.extend(glob.glob(pattern))

if not input_files:
    # Nothing to parse: bail out before downloading models and building the
    # pipeline.
    print("Error: no input files given or matched", file=sys.stderr)
    print(usage, file=sys.stderr)
    sys.exit(1)

stanza.download(language)
resources = load_resources_json()
# Add named-entity recognition only when the resources index lists an 'ner'
# entry for the selected language.
if 'ner' in resources[language]:
    processors += ", ner"

nlp = stanza.Pipeline(lang=language, processors=processors, tokenize_pretokenized=use_pretokenized_text)
for f_in in input_files:
    if not os.path.isfile(f_in):
        continue
    # The output is written next to the input, with the extension replaced
    # by '.conllu'.
    dir_name, file_name = os.path.split(f_in)
    file_name = os.path.splitext(file_name)[0] + '.conllu'
    f_out = os.path.join(dir_name, file_name)
    # An input that already has the '.conllu' extension would be overwritten
    # by its own output; skip it instead of destroying it.
    if os.path.exists(f_out) and os.path.samefile(f_in, f_out):
        print(f"Skipping {f_in}: output would overwrite the input file", file=sys.stderr)
        continue
    parse(f_in, f_out)
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily. You are not allowed to attach a file to this page.