#!/usr/bin/env ruby

# This file is a part of TaCo: a morphosyntactic tagset converter
#
# License: GPL v. 3
# Author: Bartosz Zaborowski
#

require "#{File.dirname(__FILE__)}/corpus"
require "#{File.dirname(__FILE__)}/helpers"
require "#{File.dirname(__FILE__)}/converter_logic"
require "#{File.dirname(__FILE__)}/configuration"

# Trains a conversion model from a pair of corpora and stores it on disk.
#
# Both corpora are loaded with Corpus.from_xml, the source corpus is aligned
# with the target one, and the sentences of the source corpus are handed to
# ConverterLogic::train_model. The resulting model is serialized with Marshal.
# Progress messages are printed to standard output.
#
# modelname                    - path of the model file to write (truncated
#                                if it already exists)
# dir1, filename1, tagsetname1 - source corpus directory, file name, tagset
# dir2, filename2, tagsetname2 - target corpus directory, file name, tagset
# options                      - passed through to ConverterLogic::train_model
#                                (default: {})
# tmpdir                       - passed through to ConverterLogic::train_model
#
# Note: `options` is an optional parameter placed before the required
# `tmpdir`; when only eight arguments are given the eighth is bound to
# `tmpdir` and `options` falls back to {}.
def train_model(modelname, dir1, filename1, tagsetname1, dir2, filename2, tagsetname2, options = {}, tmpdir)
  starttime = Time.now

  corp = Corpus.from_xml(dir1, filename1, tagsetname1)
  puts "source corpus loaded"

  corp2 = Corpus.from_xml(dir2, filename2, tagsetname2)
  puts "target corpus loaded"

  corp.align(corp2)
  puts "corpora aligned"

  sents = corp.all_sents
  model = ConverterLogic::train_model(sents, options, tmpdir)
  puts "model trained, elapsed time: #{Time.now-starttime} s"

  # Marshal output is binary data: open the file in binary mode so that no
  # newline or encoding translation can alter the serialized bytes.
  File.open(modelname, "wb") do |f|
    Marshal.dump(model, f)
  end

  puts "model saved"
end


# Tags a corpus with a previously trained model and saves the result.
#
# The source corpus is loaded with Corpus.from_xml, an untagged clone is
# created for the target file name and tagset, the model is read from
# `modelname` and applied through ConverterLogic::apply_model, and finally
# the clone is saved. Progress messages are printed to standard output.
#
# modelname                    - path of a model file written with Marshal
# dir1, filename1, tagsetname1 - source corpus directory, file name, tagset
# filename2, tagsetname2       - target corpus file name and tagset
def apply_model(modelname, dir1, filename1, tagsetname1, filename2, tagsetname2)
  starttime = Time.now

  corp = Corpus.from_xml(dir1, filename1, tagsetname1)
  puts "source corpus loaded"

  corp2 = corp.clone_untagged(filename2, tagsetname2)
  puts "target corpus initialized"

  # NOTE(review): the model is applied to the sentences of the source corpus
  # while the untagged clone is what gets saved; presumably clone_untagged
  # links the two corpora -- confirm in Corpus.
  sents = corp.all_sents

  # The model is binary Marshal data, so read it in binary mode to avoid any
  # newline or encoding translation.
  # SECURITY: Marshal.load can instantiate arbitrary objects; only load model
  # files that come from a trusted source.
  model = File.open(modelname, "rb") { |f| Marshal.load(f) }
  puts "model loaded, tagging..."

  ConverterLogic::apply_model(model, sents)
  puts "corpus tagged, elapsed time: #{Time.now-starttime} s"

  corp2.save
  puts "target corpus saved"
end

# Prints the program banner and the command-line synopsis for both modes
# (training and tagging) to standard output.
def usage
  help_lines = [
    "TaCo: morphosyntactic tagset converter",
    "by Bartosz Zaborowski",
    "License: GPL v.3",
    "",
    "usage:",
    "training:\n#{__FILE__} -t <configuration_file> <model> <source_corpus_dir> <source_corpus_filename> <source_corpus_tagset> <target_corpus_dir> <target_corpus_filename> <target_corpus_tagset>",
    "",
    "tagging:\n#{__FILE__} <configuration_file> <model> <source_corpus_dir> <source_corpus_filename> <source_corpus_tagset> <target_corpus_filename> <target_corpus_tagset>",
    "",
    "Corpora format supported: XCES.",
    "Corpora for training must be aligned, that is for every file source_dir/a/b/source_filename.xml in source corpora there must be a file target_dir/a/b/target_filename.xml",
    "While tagging target files are placed along corresponding source files."
  ]
  # puts writes every array element on its own line; "" yields a blank line.
  puts help_lines
end

# Command-line entry point.
#
# With no arguments, or with -h/--help anywhere on the command line, the
# usage text is printed and the process exits successfully.
if ARGV.empty? || ARGV.any? { |arg| ["-h", "--help"].include?(arg) }
  usage
  exit
end

if ARGV[0] == "-t"
  # Training mode: "-t" followed by eight positional arguments.
  if ARGV.length != 9
    usage
    # A wrong argument count is an error: report it through a non-zero exit
    # status so that calling scripts can detect the failure.
    exit 1
  end
  # The configuration must be set up before its options/tmpdir are read.
  Configuration.setup ARGV[1]
  train_model ARGV[2], ARGV[3], ARGV[4], ARGV[5], ARGV[6], ARGV[7], ARGV[8], Configuration.options, Configuration.tmpdir
else
  # Tagging mode: seven positional arguments.
  if ARGV.length != 7
    usage
    # Same as above: malformed invocation, signal failure to the caller.
    exit 1
  end
  Configuration.setup ARGV[0]
  apply_model ARGV[1], ARGV[2], ARGV[3], ARGV[4], ARGV[5], ARGV[6]
end


