#!/usr/bin/env ruby

# This file is a part of TaCo: a morphosyntactic tagset converter
#
# License: GPL v. 3
# Author: Bartosz Zaborowski
#

# tagset converter v0.1 by Bartosz Zaborowski
#

require "#{File.dirname(__FILE__)}/corpus"
require "#{File.dirname(__FILE__)}/helpers"
require "#{File.dirname(__FILE__)}/converter_logic"
require "#{File.dirname(__FILE__)}/configuration"

# Loads the source and the target corpus and aligns the source with the target.
#
# srcdir, srcfilename, srctagset - arguments handed to Corpus.from_xml for the
#                                  source corpus
# trgdir, trgfilename, trgtagset - the same for the target corpus
# serialize - when truthy, each corpus is first requested from
#             Corpus.deserialize at "<dir>.serial"; if that raises, the corpus
#             is built with Corpus.from_xml and serialized to that same path
#
# Prints two progress lines to standard output and returns the source corpus
# after `align` has been called on it with the target corpus.
def load_and_align_corpora(srcdir, srcfilename, srctagset, trgdir, trgfilename, trgtagset, serialize)
  # One loading routine shared by both corpora; the source is always handled
  # completely before the target is touched.
  obtain = lambda do |dir, filename, tagset|
    if serialize
      begin
        Corpus.deserialize(dir + ".serial")
      rescue
        # Any failure to read the serialized form falls back to the XML files.
        rebuilt = Corpus.from_xml(dir, filename, tagset)
        rebuilt.serialize(dir + ".serial")
        rebuilt
      end
    else
      Corpus.from_xml(dir, filename, tagset)
    end
  end

  source = obtain.call(srcdir, srcfilename, srctagset)
  target = obtain.call(trgdir, trgfilename, trgtagset)

  puts "corpora loaded, #{source.files.length} + #{target.files.length} files"

  source.align(target)

  file_count = source.files.length
  # Only Tok instances are reported as segments.
  segment_count = source.all_toks.select { |item| item.is_a?(Tok) }.length
  sentence_count = source.all_sents.length
  puts "corpora aligned: #{file_count} files, #{segment_count} segments, #{sentence_count} sentences"

  source
end

# Counts how many tokens of the given sentences carry the expected tag.
#
# sents - sentences; for each one the tokens are read from `aligned_with.toks`
#
# Only elements that are Tok instances are taken into account. A token counts
# as correct when its `golden_tag` equals its `selected_tag`.
#
# Returns a pair: the number of Tok tokens and the number of correct ones.
def evaluate(sents)
  aligned_items = sents.map { |sent| sent.aligned_with.toks }.flatten
  toks = aligned_items.select { |item| item.is_a?(Tok) }
  hits = toks.count { |tok| tok.golden_tag == tok.selected_tag }

  [toks.length, hits]
end

# Runs k-fold cross-validation of a conversion tool on an aligned corpus.
#
# corp       - corpus; must respond to all_toks, all_sents and aligned_with
# toolmodule - module implementing the tool: a NAME constant plus the module
#              methods prepare(sents) and do_a_try(train, test, options, tempdir);
#              do_a_try must return three numbers, which are reported below as
#              training time (s), applying time (s) and memory (MB)
# options    - passed through to do_a_try and echoed in the report
# k          - number of folds
# tempdir    - passed through to do_a_try
#
# Sentences are put, in corpus order, into k consecutive buckets of about
# total_toks / k tokens each; every bucket serves once as the test set while
# the remaining buckets form the training set.
#
# Prints a summary line to standard output and returns the number of correctly
# tagged tokens summed over all folds.
def crossvalidate(corp, toolmodule, options = {}, k, tempdir)
  # Best-effort call: any StandardError (including NoMethodError when the
  # module has no init_threadkeys) is ignored on purpose.
  # NOTE(review): presumably an optional hook of some tool modules - confirm.
  begin
    toolmodule.init_threadkeys @key, @other_key
  rescue StandardError
    # deliberately ignored
  end

  traintime = 0
  testtime = 0
  starttime = Time.now

  total_toks = corp.all_toks.select { |x| x.is_a?(Tok) }.length
  # Clamp to 1: with fewer tokens than folds the integer division yields 0 and
  # the bucket index computation below would raise ZeroDivisionError.
  bucketsize = [total_toks / k, 1].max

  buckets = (1..k).map { [] }

  sents = corp.all_sents

  toolmodule.prepare(sents)

  # Fill the buckets by running token count; the last bucket absorbs whatever
  # the integer division left over.
  tokcntr = 0
  sents.each do |s|
    buckets[[tokcntr / bucketsize, buckets.length - 1].min] << s
    tokcntr += s.toks.select { |x| x.is_a?(Tok) }.length
  end

  total_instances = 0
  total_correct = 0
  total_memory = 0
  k.times do |kth|
    Helpers::DEBUGSTREAM.puts "cross[#{toolmodule::NAME}: #{options.inspect}], fold #{kth} @#{Time.now}", Helpers::INFO

    # Split: bucket kth is the test set, all other buckets are training data.
    train = (0...k).reject { |idx| idx == kth }.map { |idx| buckets[idx] }.flatten
    test = buckets[kth]

    trt, tst, mm = toolmodule.do_a_try(train, test, options, tempdir)
    tot, corr = evaluate test

    total_instances += tot
    total_correct += corr
    total_memory += mm
    traintime += trt
    testtime += tst

    # Reset the state left on the aligned corpus before the next fold.
    corp.aligned_with.clear_selected_tags
    corp.aligned_with.all_sents.each { |s| s.restore }

    Helpers::DEBUGSTREAM.puts "   -> #{corr.to_f/tot.to_f} correct", Helpers::DEBUG
  end

  puts
  # Per-fold time averages are computed in floating point so that integer
  # timings are not truncated.
  puts "#{toolmodule::NAME} @#{total_toks} toks:     #{options.inspect}" +
    "  correctness: #{sprintf("%.4f", total_correct.to_f / total_instances.to_f * 100)}%" +
    "  time: #{sprintf("%.4g", Time.now - starttime)}s " +
    "(each fold #{sprintf("%.4f", traintime.to_f / k)}s training, #{sprintf("%.4f", testtime.to_f / k)}s applying)" +
    "  memory avg: #{total_memory / k}MB"
  total_correct
end


# Prints the command-line help of the evaluation tool to standard output:
# a short banner, the expected argument list and a note on the corpus layout.
def usage
  synopsis = "#{__FILE__} <configuration_file> <source_corpus_dir> <source_corpus_filename> <source_corpus_tagset> <target_corpus_dir> <target_corpus_filename> <target_corpus_tagset>"

  # Empty strings produce the blank separator lines.
  help_lines = [
    "TaCo: morphosyntactic tagset converter - evaluation tool",
    "by Bartosz Zaborowski",
    "License: GPL v.3",
    "",
    "usage:",
    synopsis,
    "",
    "Corpora format supported: XCES.",
    "Corpora for training must be aligned, that is for every file source_dir/a/b/source_filename.xml in source corpora there must be a file target_dir/a/b/target_filename.xml"
  ]

  puts help_lines
end

# Command-line entry point: validate the arguments, load the configuration,
# load and align both corpora, then cross-validate the converter on them.

help_requested = ARGV.any? { |arg| ["-h", "--help"].include?(arg) }

# Exactly seven arguments are required, so an empty argument list is covered
# by the length check as well. Every rejected invocation prints the help text
# and exits with the default status.
if help_requested || ARGV.length != 7
  usage
  exit
end

TMPDIR = "/tmp"

config_file, srcdir, srcfilename, srctagset, trgdir, trgfilename, trgtagset = ARGV

Configuration.setup config_file

@corp = load_and_align_corpora(srcdir, srcfilename, srctagset,
                               trgdir, trgfilename, trgtagset,
                               Configuration.serialize_corpora_on_evaluation)

crossvalidate(@corp, ConverterLogic, Configuration.options,
              Configuration.number_of_folds_on_evaluation, TMPDIR)


