#### prime-userdict.rb: Library for userdict of PRIME.
#### $Id: userdict.rb,v 1.4 2005/03/07 07:51:33 komatsu Exp $
####
#### Copyright (C) 2003 Hiroyuki Komatsu <komatsu@taiyaki.org>
####     All rights reserved.
####     This is free software with ABSOLUTELY NO WARRANTY.
####
#### You can redistribute it and/or modify it under the terms of 
#### the GNU General Public License version 2.

require 'prime/prime'
require 'prime/taiyaki'
require 'progressbar'

require 'prime/makedict/prime-sary' # For PrimeMakeIndex


# Base class for the writers of PRIME user dictionaries.
#
# It holds the state shared by the subclasses in this file -- the table
# that maps a word to its index number, the table that maps a reading to
# its parts of speech and the list of timestamps -- and persists the POS
# table ("<dictname>-pos") and the timestamps ("<dictname>-ts").
class PrimeUserdict
  # dictname::       base path of the dictionary; suffixes such as "-pos"
  #                  and "-ts" are appended to it.
  # is_interactive:: default for the is_interactive argument of the write_*
  #                  methods; when true a label is handed to each_with_pbar.
  def initialize(dictname, is_interactive = true)
    @output_file = dictname
    @prime =
      Prime.new(['prime/engine/engine-basic', 'prime/engine/engine-userdict'])

    @index_table = {}
    @pos_table   = {}

    ## FIXME: Create a pos table between literal and pos.
    ## FIXME: <komatsu@taiyaki.org> (2003-12-26)
    @pos_table_literal = {}

    @timestamps  = []
    # Dictionary files are created when missing and always appended to.
    @file_option = (File::CREAT|File::WRONLY|File::APPEND)
    @is_interactive = is_interactive
  end

  # Merges @pos_table into "<dictname>-pos".
  #
  # Every line of the existing file (reading followed by tab separated
  # parts of speech) is rewritten with the parts of speech collected in
  # memory added to it; readings that were not in the file are appended
  # afterwards.  The result is built in "<dictname>-pos_tmp" and renamed
  # over the real file once it is complete.  Readings found in the existing
  # file are removed from @pos_table as a side effect.
  def write_pos_table(is_interactive = @is_interactive)
    label    = is_interactive ? "POS TABLE" : nil
    pos_file = @output_file + "-pos"
    tmp_file = @output_file + "-pos_tmp"

    File::open(tmp_file, "w") {|io_output|
      io_output.chmod(0600)
      if File::file?(pos_file) then
        # Block form, so that the input handle is closed again.
        File::open(pos_file, "r") {|io_input|
          io_input.each_with_pbar(label) {|line|
            line.chomp!
            (pron, *pos_list) = line.split(/\t/)
            pos_list |= (@pos_table[pron] or [])
            @pos_table.delete(pron)
            output_pos(io_output, pron, pos_list)
          }
        }
      end
      @pos_table.each {|pron, pos_list|
        output_pos(io_output, pron, pos_list)
      }
    }
    File::rename(tmp_file, pos_file)
  end

  # Writes @timestamps to "<dictname>-ts" as consecutive 32-bit big-endian
  # unsigned integers; the position in the file is the index number.
  def write_timestamps(is_interactive = @is_interactive)
    label = is_interactive ? "TIMESTAMP" : nil
    # Binary mode and IO#write: the payload is packed data, not text.
    File::open(@output_file + "-ts", "wb") {|io|
      @timestamps.each_with_pbar(label) {|timestamp|
        # Index numbers may be sparse, which leaves nil holes in the array.
        # A hole is written as 0 so that the following entries stay aligned.
        io.write([timestamp.to_i].pack('N'))
      }
    }
  end

  # Loads "<dictname>-ts", if it exists, into @timestamps.  Entry i of the
  # file overwrites @timestamps[i]; trailing bytes that do not make up a
  # whole 32-bit value are ignored.
  def load_current_timestamps ()
    ts_file = @output_file + '-ts'
    return unless File::file?(ts_file)

    # Read as bytes and decode in one go.  Character based slicing of a
    # text-mode string miscounts entries whose bytes happen to form
    # multi-byte characters.
    data = File::open(ts_file, 'rb') {|io| io.read() }
    data.unpack('N*').each_with_index {|timestamp, i|
      @timestamps[i] = timestamp
    }
  end

  private

  # Records pos as one of the parts of speech of key, without duplicates.
  def put_pos_table(key, pos)
    @pos_table[key] = (@pos_table[key] or []) | [pos]
  end

  # Remembers index_no as the index number of (pron, pos, literal).
  def set_index(pron, pos, literal, index_no)
    pattern = [pron, pos, literal].join("\t")
    @index_table[pattern] = index_no
  end

  # Returns the index number registered for (pron, pos, literal) or nil.
  def get_index(pron, pos, literal)
    pattern = [pron, pos, literal].join("\t")
    return @index_table[pattern]
  end

  # Writes the word to io unless @prime.check_existence reports it
  # (presumably: unless the word is already in a PRIME dictionary --
  # confirm against the Prime class).
  def output_word_if_new(io, pron, pos, literal)
    unless @prime.check_existence(pron, pos, literal) then
      output_new(io, pron, pos, literal)
    end
  end

  # Writes one tab separated line: index_no, pron, pos, literal, rest.
  # Nested arrays in rest are flattened by Array#join.
  def output_learning(io, pron, pos, literal, index_no, *rest)
    output_line = [index_no, pron, pos, literal, rest].join("\t")
    io.puts(output_line)
  end

  # Writes one tab separated line: pron, pos, literal, rest.
  def output_new(io, pron, pos, literal, *rest)
    output_line = [pron, pos, literal, rest].join("\t")
    io.puts(output_line)
  end

  # Writes one line "prev_index:pron<TAB>prev_index:literal<TAB>index_no".
  # pos is accepted for symmetry with the other output_* methods but is
  # not part of the line.
  def output_cooccurrence(io, prev_index, pron, pos, literal, index_no)
    output_line = format("%d:%s\t%d:%s\t%d",
                         prev_index, pron, prev_index, literal, index_no)
    io.puts(output_line)
  end

  # Writes one tab separated line: key followed by the parts of speech.
  # pos_list may be given as separate arguments or as a single array.
  def output_pos(io, key, *pos_list)
    io.puts([key, pos_list].join("\t"))
  end
end

# Merges a learning file (input_dict) and its cooccurrence file
# (input_dict + "-co") into the user dictionary given to the constructor.
class PrimeUserdictUpdate < PrimeUserdict
  def initialize(dictname, is_interactive = true)
    super
    # Maps an id used in the input files to the index number under which
    # the same (pron, pos, literal) entry was first registered in this run.
    @id_alias = []
  end

  # Runs the whole update: loads the stored timestamps, merges both input
  # files and rewrites the POS table and the timestamp file.
  def update (input_dict)
    load_current_timestamps()
    update_from_learning(input_dict)
    update_from_co(input_dict)
    write_pos_table()
    write_timestamps()
  end

  # Returns true when at least one of the two input files has content.
  # A missing file counts as empty, in line with update_from_learning and
  # update_from_co, which skip missing input files.
  def check_necessity (input_dict)
    return [input_dict, input_dict + '-co'].any? {|path| File::size?(path) }
  end

  # Moves the input files out of the way by renaming them to "*.backup".
  # A file that does not exist is skipped instead of aborting half-way.
  def delete_files (input_dict)
    [input_dict, input_dict + '-co'].each {|path|
      File::rename(path, path + ".backup") if File::file?(path)
    }
  end

  # Appends the entries of input_dict that have not been seen in this run
  # to the dictionary (and to "<dictname>-new" when output_word_if_new
  # decides so), and updates @timestamps and @id_alias for every line.
  # The output files are opened (and thus created) even when input_dict
  # is missing.
  def update_from_learning (input_dict)
    File::open(@output_file, @file_option) {|io_learning|
      File::open(@output_file + "-new", @file_option) {|io_new|
        if File::file?(input_dict) then
          File::open(input_dict, 'r') {|io_input|
            io_input.each {|line|
              merge_learning_line(io_learning, io_new, line)
            }
          }
        end
      }
    }
  end

  # Appends the lines of input_dict + "-co" to "<dictname>-co", replacing
  # the ids by the index numbers recorded in @id_alias where available.
  # Input line format: co_id<TAB>pron<TAB>literal<TAB>id.
  def update_from_co (input_dict)
    co_file = input_dict + '-co'
    File::open(@output_file + '-co', @file_option) {|io_output|
      if File::file?(co_file) then
        File::open(co_file, 'r') {|io_input|
          io_input.each {|line|
            (co_id, pron, literal, id) = line.chomp.split(/\t/)

            id    = id.to_i
            co_id = co_id.to_i

            id    = (@id_alias[id]    or id)
            co_id = (@id_alias[co_id] or co_id)
            pos   = nil  # not part of the cooccurrence line
            output_cooccurrence(io_output, co_id, pron, pos, literal, id)
          }
        }
      end
    }
  end

  private

  # Handles one line of the learning file:
  # id<TAB>pron<TAB>pos<TAB>literal<TAB>timestamp[<TAB>rest...].
  def merge_learning_line(io_learning, io_new, line)
    (id, pron, pos, literal, ts, *rest) = line.chomp.split(/\t/)
    id = id.to_i
    ts = ts.to_i

    index_no = get_index(pron, pos, literal)
    if index_no.nil? then
      # First occurrence in this run: the entry keeps its own id.
      index_no = id
      set_index(pron, pos, literal, index_no)

      output_learning(io_learning, pron, pos, literal, index_no, rest)
      output_word_if_new(io_new, pron, pos, literal)
      put_pos_table(pron, pos)
    end
    # Keep the newest timestamp.  The slot is nil for an index that has no
    # stored timestamp yet, and nil cannot be compared with an Integer.
    @timestamps[index_no] = [(@timestamps[index_no] || 0), ts].max()
    @id_alias[id] = index_no
  end
end

# Converts a text file of "pron<TAB>pos<TAB>literal[<TAB>rest...]" lines
# into the user dictionary files (learning, "-new", "-co", "-pos", "-ts"
# and "_index_no").
class PrimeUserdictConvert < PrimeUserdict
  def initialize (output_file, is_interactive = true)
    super
    # Timestamps handed out by touch_timestamp start 14400 seconds before
    # the current time and grow by one per call.
    @counter_timestamp = Time.new().to_i - 14400
  end
  
  # Converts filename and then writes the POS table, the timestamps and
  # the index counter.
  def convert (filename, is_interactive = @is_interactive)
    convert_internal(filename, is_interactive)
    write_data()
  end

  # Reads filename line by line and appends to the learning, "-new" and
  # "-co" files.  Consecutive lines are linked as cooccurrences; a line
  # without a reading (e.g. an empty line) breaks the chain.
  def convert_internal (filename, is_interactive = @is_interactive)
    prev_pos = nil
    prev_index = nil

    # The label is the tail of the file name.  The slice is nil for names
    # shorter than 13 characters, so fall back to the whole name.
    title = is_interactive ? (filename[-13..-1] || filename) : nil

    with_output_files("", "-new", "-co") {|io_learning, io_new, io_cooccurrence|
      # File::open rather than Kernel#open: the latter treats a name that
      # starts with "|" as a command to run.
      File::open(filename, 'r') {|io_input|
        io_input.each_with_pbar(title) {|line|
          line.chomp!
          (pron, pos, literal, *rest) = line.split(/\t/)
          if pron.nil? then
            prev_index = nil
            next
          end

          (pos, prev_pos) = set_pos(pos, prev_pos)
          literal = (literal or pron)

          index_no = get_index(pron, pos, literal)
          if index_no.nil? then
            index_no = get_index_new()
            set_index(pron, pos, literal, index_no)

            output_learning(io_learning, pron, pos, literal, index_no, rest)
            output_word_if_new(io_new, pron, pos, literal)
            put_pos_table(pron, pos)
          end
          touch_timestamp(index_no)

          unless prev_index.nil? then
            output_cooccurrence(io_cooccurrence,
                                prev_index, pron, pos, literal, index_no)
          end
          prev_index = index_no
        }
      }
    }
  end

  # Writes the POS table, the timestamps and the index counter.
  def write_data(is_interactive = @is_interactive)
    write_pos_table(is_interactive)
    write_timestamps(is_interactive)
    write_index_no(is_interactive)
  end

  # Writes the number of timestamp slots to "<dictname>_index_no".
  # is_interactive is accepted for symmetry with the other write_* methods
  # and is not used.
  def write_index_no(is_interactive = @is_interactive)
    File::open(@output_file + "_index_no", "w") {|io|
      io.print(@timestamps.length)
    }
  end

  private

  # Opens @output_file + suffix for every suffix with @file_option, yields
  # the handles in the given order and closes all of them afterwards, also
  # when the block raises.
  def with_output_files(*suffixes)
    ios = []
    begin
      suffixes.each {|suffix|
        ios << File::open(@output_file + suffix, @file_option)
      }
      yield(*ios)
    ensure
      ios.each {|io| io.close() unless io.closed? }
    end
  end

  # Returns [pos to use for this line, pos to remember for the next line].
  # A line without pos gets "::" + the pos of the preceding line (or "" if
  # there is none) and clears the remembered pos.
  def set_pos(pos, prev_pos)
    if pos.nil? or pos.empty? then
      if prev_pos.nil? then
        pos = ""
      else
        pos = "::" + prev_pos
      end
      prev_pos = nil
    else
      prev_pos = pos
    end
    return [pos, prev_pos]
  end

  # The next free index number: the number of entries registered so far.
  def get_index_new()
    index = @index_table.length
    return index
  end

  # Stamps index_no with the next value of the running counter.
  def touch_timestamp(index_no)
    @timestamps[index_no] = @counter_timestamp
    @counter_timestamp += 1
  end
end

# Builds the indexes for the files of a user dictionary.  The
# make_index_* methods used below are not defined in this class;
# presumably they come from PrimeSaryMakeIndex -- confirm there.
class PrimeUserdictMakeIndex < PrimeSaryMakeIndex
  # Only stores its arguments; the initializer of the superclass is not
  # invoked.
  def initialize(dictname, is_interactive = true)
    @is_interactive = is_interactive
    @dictname       = dictname
  end

  # Builds the learning, cooccurrence and POS indexes, in that order, and
  # returns the result of the last step.
  def make_indexes(is_interactive = @is_interactive)
    make_index_learning(is_interactive)
    make_index_cooccurrence(is_interactive)
    make_index_pos(is_interactive)
  end
end
