# Optional dependency guard: morfeusz2 is imported if present; otherwise a notice
# is printed and the module-level flag imported_Morfeusz is set to False.
try:
  import morfeusz2
except ImportError:
  err_msg='''
  This model uses the morfeusz2 python module for tokenization, tagging and lemmatization.
  We couldn't find it installed on this machine.
  For best performance, please install the morfeusz2 Binding for python from here:
  
  http://morfeusz.sgjp.pl/download/en
  
          '''
  print(err_msg)
  imported_Morfeusz = False
else:
  imported_Morfeusz = True
from spacy.tokens import Doc

class PolishPreProcessor():
  # this class uses morfeusz2 library to work properly
  # it performs tokenization, lemmatization, tagging and morphological analysis
  def __init__(self, nlp):
    """Bind the preprocessor to a pipeline object and, when available, to Morfeusz2.

    nlp -- pipeline object; it is stored on the instance together with its
           ``tagger`` and ``vocab`` attributes.

    The Morfeusz analyser is created only if the module-level flag
    ``imported_Morfeusz`` is true; otherwise ``self.morf`` is not set.
    """
    if imported_Morfeusz:
      self.morf = morfeusz2.Morfeusz(
        generate=False,
        whitespace=morfeusz2.KEEP_WHITESPACES,
        expand_tags=True,
      )
    self.imported_Morfeusz = imported_Morfeusz

    self.nlp = nlp
    self.tagger = nlp.tagger
    self.vocab = nlp.vocab

    # Lower-case NKJP-style tag name -> coarse POS label used for tokens.
    self.tag_map = {
      'adj': 'ADJ', 'adja': 'ADJ', 'adjc': 'ADJ', 'adjp': 'ADJ',
      'adv': 'ADV',
      'aglt': 'AUX',
      'bedzie': 'VERB',
      'brev': 'X',
      'burk': 'ADV',
      'comp': 'SCONJ',
      'conj': 'CCONJ',
      'depr': 'NOUN',
      'fin': 'VERB',
      'ger': 'NOUN',
      'imps': 'VERB', 'impt': 'VERB', 'inf': 'VERB',
      'interj': 'INTJ',
      'interp': 'PUNCT',
      'num': 'NUM', 'numcol': 'NUM',
      'pact': 'VERB', 'pant': 'VERB', 'pcon': 'VERB', 'ppas': 'VERB',
      'ppron12': 'PRON', 'ppron3': 'PRON',
      'praet': 'VERB', 'pred': 'VERB',
      'prep': 'ADP',
      'qub': 'PART',
      'siebie': 'PRON',
      'subst': 'NOUN',
      'winien': 'VERB',
      'xxx': 'X',
      # additional tags besides nkjp
      'part': 'PART',
      'ign': 'X',
      'dig': 'NUM', 'romandig': 'NUM',
      'frag': 'X',
      'pacta': 'VERB',
      'numcomp': 'NUM',
    }


  def disambiguate_tokenization(self, analyses):
    """Choose a single segmentation from Morfeusz analyses and renumber it.

    analyses -- re-iterable sequence of (start, end, interpretation) entries,
                where interpretation[0] is the surface form.

    Returns a dict mapping consecutive positions 0, 1, ... to the list of
    analyses kept at that position, each rewritten as
    (position, position + 1, interpretation). Empty input yields an empty dict.

    Raises KeyError if some start position up to the largest one has no
    analysis recorded for it (for example when only multi-segment,
    non-exception analyses start there); the algorithm assumes that alternative
    segmentations are nested in each other.
    """
    # Nothing to segment: without this guard the renumbering loop below would
    # look up position 0 in an empty dictionary and raise KeyError.
    if not analyses:
      return {}

    # There are some frequently used words which can be segmented in principle, but this happens very rarely
    # We always choose the more finegrained tokenization, with exception of the words listed below
    # This improves tokenization accuracy by 0.1 to 0.5%
    exceptions = {"Coś":('Co', 'ś'), "Ktoś":('Kto', 'ś'), "Kogoś":("Kogo", "ś"), "Kiedyś":('Kiedy', 'ś'), "Gdzieś":('Gdzie', 'ś'),
                "coś":('co', 'ś'), "ktoś":('kto', 'ś'), "kogoś":("kogo", "ś"), "kiedyś":('kiedy', 'ś'), "gdzieś":('gdzie', 'ś')}

    # Creating a start_position -> [analyses] dictionary
    position_to_analyses = {}
    analyses_to_remove = []
    max_index = 0
    for a in analyses:
      start_index = a[0]
      end_index = a[1]
      max_index = max(max_index, start_index)
      form = a[2][0]
      if end_index - start_index > 1:
        # Disambiguation of tokenization:
        # we assume that all differences in segmentation are contained in each
        # other and always oversegment, unless the form is a listed exception.
        if form not in exceptions:
          continue
        # For an exception the unsplit form is kept and its two parts are
        # scheduled for removal.
        dec1, dec2 = exceptions[form]
        analyses_to_remove.extend(
          a2 for a2 in analyses
          if (a2[0] == start_index and a2[2][0] == dec1)
          or (a2[1] == end_index and a2[2][0] == dec2)
        )
      position_to_analyses.setdefault(start_index, []).append(a)

    if analyses_to_remove:
      for p in position_to_analyses:
        position_to_analyses[p] = [a for a in position_to_analyses[p]
                                   if a not in analyses_to_remove]

    # Renumeration: positions emptied by the removal above are skipped, and
    # every kept analysis is given span 1 at its new position.
    n_position_to_analyses = {}
    curr_ind = 0
    for ind in range(max_index + 1):
      kept = position_to_analyses[ind]
      if not kept:
        continue
      n_position_to_analyses[curr_ind] = [(curr_ind, curr_ind + 1, a[2]) for a in kept]
      curr_ind += 1

    return n_position_to_analyses

  def skip_white_space(self, position_to_analyses):
    """Drop whitespace tokens and record which words are followed by a space.

    position_to_analyses -- dict mapping position -> non-empty list of
        (start, end, interpretation) analyses, iterated in insertion order.

    Returns a tuple (non_white_analysis, words, space_afters):
      non_white_analysis -- dict mapping surface form -> list of its analyses
      words              -- surface forms of the non-whitespace tokens, in order
      space_afters       -- one boolean per word, True if whitespace follows it

    Whitespace that precedes the first word has no word to attach to and is
    dropped.
    """
    non_white_analysis = {}
    words = []
    space_afters = []
    # non_white_analysis can be a form -> analysis dictionary because Morfeusz
    # is not context sensitive, so verbatim repetitions should be analyzed
    # identically. Saves a little bit of time for long documents
    for ind in position_to_analyses:
      candidates = position_to_analyses[ind]
      # Because the tokenization is disambiguated, we assume that all
      # interpretations at one position share the same form
      entry = candidates[0]
      form = entry[2][0]
      tagging = entry[2][2].split(':')[0]
      if form == ' ' or tagging == 'sp':
        # Mark the previous word as followed by a space; guard against text
        # that starts with whitespace, where there is no previous word yet.
        if space_afters:
          space_afters[-1] = True
      else:
        space_afters.append(False)
        words.append(form)
        non_white_analysis[form] = list(candidates)
    return (non_white_analysis, words, space_afters)

  def disambiguate_morphology(self, matching_analyses):
    """Return the morphological features on which all analyses agree.

    matching_analyses -- sequence of tuples whose element [3] is a
        colon-separated tag string; the first field of that string is skipped
        and the remaining fields are compared position by position.

    Returns the agreeing fields joined with ':' (possibly an empty string).
    Only as many positions as the shortest feature list has are compared.
    """
    feature_rows = [analysis[3].split(":")[1:] for analysis in matching_analyses]
    shared_features = []
    for column in zip(*feature_rows):
      reference = column[0]
      if all(value == reference for value in column):
        shared_features.append(reference)
    return ":".join(shared_features)

  def disambiguate_tags_and_lemmas(self, non_white_analysis, doc):
    """Select one lemma, tag, POS and morphology string for every token of doc.

    non_white_analysis -- dict mapping surface form -> list of
        (start, end, interpretation) analyses, where interpretation[1] is the
        lemma and interpretation[2] the colon-separated tag string.
    doc -- sequence of tokens exposing ``orth_``, ``tag_`` and ``pos_``; it is
        first passed to ``self.tagger``.

    Returns a list with one (lemma, upper-cased tag, pos, morphology) tuple per
    token:
      * several analyses: the first analysis whose tag equals the tagger's tag
        (lower-cased) supplies lemma, tag and POS, and the morphology is what
        all agreeing analyses share; if none agrees, the tagger's tag and POS
        are used with the surface form as lemma and empty morphology;
      * a single analysis: it is used as is, unless its tag is 'ign', in which
        case the tagger's values are used as above.

    Raises KeyError if a form is missing from non_white_analysis or a selected
    tag is missing from self.tag_map.
    """
    def base_lemma(raw_lemma):
      # Morfeusz lemmas may carry a colon-separated suffix (homonym identifier,
      # e.g. "kot:s1"); only the part before the first colon is the lemma.
      # A lemma that starts with ':' (such as the colon itself) is kept whole
      # instead of being reduced to an empty string.
      stripped = raw_lemma.split(':')[0]
      return stripped if stripped else raw_lemma

    full_analysis = []
    # Disambiguation by tagger: the whole document is tagged first.
    self.tagger(doc)
    for tok in doc:
      form = tok.orth_
      interpretations = non_white_analysis[form]
      if len(interpretations) > 1:
        # Morfeusz is ambiguous:
        # collect the interpretations which agree with the tagger
        tagger_tag = tok.tag_.lower()
        matching_analyses = []
        for interp in interpretations:
          long_tag = interp[2][2]
          nkjp_tag = long_tag.split(':')[0]
          if nkjp_tag != tagger_tag:
            continue
          matching_analyses.append(
            (base_lemma(interp[2][1]), nkjp_tag, self.tag_map[nkjp_tag], long_tag))

        if matching_analyses:
          # Morfeusz does agree with the tagger, we simply choose the first analysis
          lemma, nkjp, pos = matching_analyses[0][:3]
          morph = self.disambiguate_morphology(matching_analyses)
        else:
          # Morfeusz does not agree with the tagger;
          # favouring the tagger gives better results
          lemma, nkjp, pos, morph = form, tok.tag_, tok.pos_, ''
      else:
        # Morfeusz is unambiguous
        interpretation = interpretations[0][2]
        long_tag = interpretation[2]
        if long_tag.upper() == 'IGN':
          # Morfeusz does not recognize the word, and thus the tagger should be favoured
          lemma, nkjp, pos, morph = form, tok.tag_, tok.pos_, ''
        else:
          # Morfeusz recognizes the word and it's not ambiguous so it should be
          # preferred; in this case morphology is likely right
          nkjp, _, morph = long_tag.partition(':')
          # Same lemma clean-up as in the ambiguous branch, so both branches
          # return lemmas without the colon suffix.
          lemma = base_lemma(interpretation[1])
          pos = self.tag_map[nkjp]

      full_analysis.append((lemma, nkjp.upper(), pos, morph))
    return full_analysis

  
  def process(self, text):
    """Analyse text with Morfeusz and return a Doc carrying tags, POS and lemmas.

    The text is analysed by ``self.morf``, the tokenization is disambiguated,
    whitespace tokens are folded into per-word "space after" flags, and a Doc
    is built from the remaining words. Each token then receives its tag
    (with the morphology appended after ':' when non-empty), POS and lemma.
    """
    position_to_analyses = self.disambiguate_tokenization(self.morf.analyse(text))
    non_white_analysis, words, space_afters = self.skip_white_space(position_to_analyses)

    # Saving the tokenization
    doc = Doc(self.vocab, words, space_afters)

    full_analysis = self.disambiguate_tags_and_lemmas(non_white_analysis, doc)

    for tok, (lemma, nkjp_tag, pos_tag, morph) in zip(doc, full_analysis):
      # these assignments may need to be done by the respective setters of spacy.tokens.token
      # the ordering of assignments is important because of this
      tok.tag_ = nkjp_tag
      if morph != "":
        tok.tag_ += ":" + morph
      tok.pos_ = pos_tag
      tok.lemma_ = lemma
      # we also may need to implement the integer coding of features

    # assert doc.text == text #can be false
    # nondestructive tokenization seems impossible, because spacy does not distinguish
    # between different types of whitespace while creating the doc
    return doc
  
  def __call__(self, text):
    """Make the instance callable; delegates to ``self.process(text)`` and returns its result."""
    return self.process(text)
