#!/usr/bin/ruby

########################################################################
# irc_analyze.rb - generate word distribution statistics from IRC logs #
#                                                                      #
# Usage:                                                               #
#   irc_analyze.rb <input> <output> <nicknames...>                     #
#                                                                      #
# Options:                                                             #
# - input     : input file (or "-" for stdin)                          #
# - output    : output file (or "-" for stdout)                        #
# - nicknames : nicknames the specified person uses                    #
#                                                                      #
# Examples:                                                            #
#   ./irc_analyze.rb - pabs.xml pabs < ~/log/e.log                     #
#                                                                      #
########################################################################

module IRCAnalyze
  # Usage statistics for a single normalized word: how often it was seen,
  # which spellings ("variants") it appeared as, which words came directly
  # before and after it, and how often it started or finished a line.
  class Word
    attr_accessor :sum, :pre, :post, :begins, :ends, :value, :variants

    # "begin"/"end" are kept as aliases of the begins/ends counters for
    # backward compatibility with callers that use the older names.
    alias_method :begin, :begins
    alias_method :end, :ends
    alias_method 'begin=', 'begins='
    alias_method 'end=', 'ends='

    # characters that must not appear verbatim inside an XML attribute value
    XML_ESCAPES = {
      '&' => '&amp;',
      '<' => '&lt;',
      '>' => '&gt;',
      '"' => '&quot;'
    }.freeze

    # word: the normalized form of the word these statistics describe
    def initialize(word)
      @sum = 0
      @value = word

      # the normalized spelling is always listed, even if never seen as-is
      @variants = { @value => 0 }

      # the words before and after this word, respectively
      @pre = {}
      @post = {}

      # count of the number of times this word begins and ends a line
      @begins = 0
      @ends = 0
    end

    # increment the count of this word, and the count of the non-normalized
    # variant "val" of it; a nil "pre" counts as the word beginning a phrase
    # and a nil "post" as the word ending one, otherwise the matching
    # previous/next word counter is incremented
    def inc(val, pre = nil, post = nil)
      if pre
        @pre[pre] = (@pre[pre] || 0) + 1
      else
        @begins += 1
      end

      if post
        @post[post] = (@post[post] || 0) + 1
      else
        @ends += 1
      end

      @sum += 1
      @variants[val] = (@variants[val] || 0) + 1
    end

    # escape text so it can be embedded in a double-quoted XML attribute
    def xml_escape(text)
      text.to_s.gsub(/[&<>"]/) { |ch| XML_ESCAPES[ch] }
    end

    # format a single <stat> element line
    def dump_stat(type, val, num)
      "  <stat type=\"#{type}\" value=\"#{xml_escape(val)}\" num=\"#{num}\" />\n"
    end
    private :dump_stat, :xml_escape

    # dump the stats of this word as XML, yielding one fragment at a time
    # to the given block
    def dump
      yield "<word value=\"#{xml_escape(@value)}\" num=\"#{@sum}\" "
      yield " begins=\"#{@begins}\" ends=\"#{@ends}\">\n"

      @variants.each { |text, count| yield dump_stat("variant", text, count) }
      @pre.each { |text, count| yield dump_stat("pre", text, count) }
      @post.each { |text, count| yield dump_stat("post", text, count) }

      yield "</word>\n"
    end

    # return the XML for this word as a single string
    def to_s
      str = String.new
      dump { |fragment| str << fragment }
      str
    end

    # print the XML for this word to the given io object
    def save(io)
      dump { |fragment| io.print fragment }
    end

  end

  # Word statistics for one person, who may speak under several nicknames.
  class Nick
    attr_accessor :debug, :nicks, :words, :associates, :new_words

    # nicks: every nickname this person is known to use
    def initialize(*nicks)
      @debug = false

      @nicks = nicks
      @words = {}
      @associates = {}
      @new_words = []
    end

    # analyze an IO object (anything that yields lines from #each) and
    # append the results to the known stats; only lines carrying one of
    # the nicknames wrapped in [], <> or () are processed
    def analyze(io)
      speaker = speaker_pattern
      lines = 0
      io.each do |raw|
        lines += 1
        # strip terminal colour escapes without modifying the caller's string
        line = raw.gsub(/\e.*?m/, '')
        process_line(line) if speaker && line =~ speaker

        # everything after this is debugging output
        if lines % 10000 == 0
          $stderr.print "IRCNick::analyze(): lines = #{lines}\n"
        end

        if @new_words.length >= 100
          $stderr.print "IRCNick::analyze(): sample = "
          @new_words.first(10).each { |w| $stderr.print w + ", " }
          $stderr.print "\n"

          @new_words.clear
        end
      end
    end

    # process the current line and add it to this user's stats
    #
    # Tokens containing one of the nicknames (compared case-insensitively,
    # like the line selection in #analyze) are dropped.  Every remaining
    # token is normalized to lower case with everything but a-z, 0-9, "-"
    # and "_" removed; tokens that normalize to nothing are skipped.  The
    # untouched token is recorded as the variant of its normalized word.
    def process_line(line)
      $stderr.print "P: " + line if @debug

      skip = nick_pattern
      tokens = line.split
      tokens = tokens.reject { |t| t =~ skip } if skip

      # keep each original spelling paired with its normalized form
      pairs = tokens.map { |t| [t, t.downcase.gsub(/[^a-z0-9\-_]/, '')] }
      pairs = pairs.reject { |_, word| word.empty? }

      pairs.each_with_index do |(orig, word), i|
        $stderr.puts word if @debug

        # nil neighbours mark the beginning/end of the phrase; index -1
        # must not wrap around to the last word
        pre = i > 0 ? pairs[i - 1][1] : nil
        post = pairs[i + 1] && pairs[i + 1][1]

        entry = @words[word]
        unless entry
          entry = @words[word] = Word.new(word)
          @new_words << word
        end
        entry.inc(orig, pre, post)
      end
    end

    # convert the output to xml and yield it to the given block
    def dump
      yield "<person>\n"
      @nicks.each { |name| yield "  <alias value=\"#{name}\" />\n" }
      @words.each { |_, word| yield word.to_s }
      yield "</person>\n"
    end

    # return this person's stats as a string
    def to_s
      str = String.new
      dump { |fragment| str << fragment }
      str
    end

    # print this person's stats to the given io object
    def save(io)
      dump { |fragment| io.print fragment }
    end

    # regex source matching any of the nicknames literally, or nil when no
    # nicknames are known
    def nick_alternation
      return nil if @nicks.empty?
      @nicks.map { |n| Regexp.escape(n.to_s) }.join('|')
    end

    # pattern for a nickname wrapped in [], <> or (), as used to tag the
    # speaker of a line
    def speaker_pattern
      alts = nick_alternation
      alts && /[\[<(](?:#{alts})[)>\]]/i
    end

    # pattern for a token that contains one of the nicknames
    def nick_pattern
      alts = nick_alternation
      alts && /(?:#{alts})/i
    end
    private :nick_alternation, :speaker_pattern, :nick_pattern

  end # Nick
end # IRCAnalyze

# test suite/command-line interface
if $0 == __FILE__
  include IRCAnalyze

  # print a one-line usage summary to stderr and exit with a failure status
  def print_usage
    $stderr.puts "#{$0} in out <nicknames>"
    exit(-1)
  end

  # two paths plus at least one nickname are required
  print_usage unless ARGV.length >= 3
  in_path, out_path = ARGV.slice!(0..1)
  # whatever is left in ARGV is the list of nicknames
  nick = Nick.new(*ARGV)

  # process input
  if in_path == "-"
    nick.analyze($stdin)
  else
    File.open(in_path) { |io| nick.analyze(io) }
  end

  # dump output; Nick#dump only yields to a block, so writing to an io
  # object goes through Nick#save in both branches
  if out_path == "-"
    nick.save($stdout)
  else
    File.open(out_path, "w") { |f| nick.save(f) }
  end
end

