#!/usr/bin/perl

=head1 NAME

seek-phrases-in-log - extract good-looking rules from a text-dump mc log

=cut

# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

# ---------------------------------------------------------------------------

sub usage {
  die "
usage: seek-phrases-in-log [--reqhitrate n] [--reqpatlength n] [ --maxreqpatlength n]
   [--rules] [--ruletype 'type'] [--ruleprefix FOO]
   [--maxtextread n] --ham hamlog --spam spamlog

--reqhitrate: percentage hit-rate against spam required (default: 0.5)
   (multiple values can be specified, separated by spaces)
--reqpatlength: required pattern length, in characters (default: 0)
--maxreqpatlength: maximum pattern length, in characters (default: 1024)
--maxtextread: bytes of message text examined (default: 32768)
--rules: generate SpamAssassin rule output (default: 0)
--ruleprefix: specify prefix string for rules (default: 'SEEK_')
--ruletype 'type': generate rules of type: 'header', 'body'
";
}

# ---------------------------------------------------------------------------

use warnings;
use strict;
use Getopt::Long qw(:config no_ignore_case);
use Data::Dumper;
use Digest::SHA qw(sha1_base64);

sub logmsg;

my %opt = ();
$opt{reqhitrate} = 0.5;
$opt{reqpatlength} = 0;
$opt{maxreqpatlength} = 1024;
$opt{maxtextread} = 32768;
$opt{rules} = 0;
$opt{ruleprefix} = 'SEEK_';
$opt{ruletype} = 'body';

my $fs;
my $fh;
my @files = ();
GetOptions(
        "rules" => \$opt{rules},
        "ruleprefix=s" => \$opt{ruleprefix},
        "reqhitrate=s" => \$opt{reqhitrate},
        "reqpatlength=s" => \$opt{reqpatlength},
        "maxreqpatlength=s" => \$opt{maxreqpatlength},
        "ruletype=s" => \$opt{ruletype},
        "maxtextread=s" => \$opt{maxtextread},
        "phase2=s" => \$opt{phase2},
        "ham=s" => \$fh,
        "spam=s" => \$fs,
        'help' => \&usage
) or usage();

usage() unless (($fs && $fh) || $opt{phase2});

my @hitratethresholds = ();
if ($opt{reqhitrate} =~ /\S\s+\S/) {        # multiple values
  @hitratethresholds = reverse sort {$a<=>$b} split (' ', $opt{reqhitrate});
  $opt{reqhitrate} = pop @hitratethresholds;  # the lowest
  # @hitratethresholds is now sorted, highest to lowest
}

# n-gram reading state
my %word2sym = ('' => '');
my %sym2word = ('' => '');
my $sym_acc = 'a';      # symbols are represented using IDs from this counter
my $msgcount = 0;

# these are shared with the assembly stage (via $asmstate)
my @text_string = ();
my %ngram_count = ();
my %msg_subset_hit = ();
my $asmstate;

if ($opt{phase2}) {
  $asmstate = load_state($opt{phase2});
}
else {
  logmsg "reading $fs...";
  open IN, "<$fs" or die "cannot open spam log $fs";
  while (<IN>) {
    /^text: (.*)$/ and proc_text_spam($1);
  }
  close IN;

  # only do this if we have read enough spam messages, otherwise we could
  # discard tokens for which reqhitrate has been achieved
  if ($msgcount > 2 * (100 / $opt{reqhitrate})) {
    discard_hapaxes();
  }

  logmsg "n-grams active: ".(scalar keys %ngram_count);

  logmsg "reading $fh...";
  open IN, "<$fh" or die "cannot open ham log $fh";
  while (<IN>) {
    /^text: (.*)$/ and proc_text_ham($1);
  }
  close IN;
  logmsg "n-grams active: ".(scalar keys %ngram_count);

  # move onto the next step; assembly.  free stuff we no longer need here
  undef %word2sym;     # free this, no longer needed
  # create the assembly state object
  $asmstate = { };

  filter_into_message_subsets();
  undef %sym2word;     # no longer needed after that
  write_state($asmstate, "assemble.state");
}

assemble_regexps();
exit;

# ---------------------------------------------------------------------------
# PHASE 1: PARSING, NGRAM PROBABILITIES

sub proc_text_spam {
  my ($text) = @_;

  # we only need to chop off the end of spam samples
  if (length($text) > $opt{maxtextread}) {
    $text = substr $text, 0, $opt{maxtextread};      # chop!
  }

  $text =~ s/  +/ /gs;                  # single spaces, please

  # we only need to save spam samples in memory, ignore ham samples
  push @text_string, $text;

  my $cp = pack "l", $msgcount;
  $msgcount++;

  (($msgcount % 1000) == 999) and discard_hapaxes();

  my %tokens = ();
  foreach my $line (split(/\[p\]/, $text)) {
    my $w1 = '';
    my $w2 = '';
    my $w3 = '';

    foreach my $w (split(' ', $line)) {
      # if (length $w > 20) { $w = "sk:".substr($w, 0, 5); }

      $w3 = $w2;
      $w2 = $w1;

      $w1 = $word2sym{$w};
      if (!$w1) {
        $word2sym{$w} = $w1 = $sym_acc;
        $sym2word{$sym_acc} = $w;
        $sym_acc++;
      }

      # simple bayesian N-grams to start
      $tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = 1;
    }

    # deal with leftovers
    if ($w2 && $w1) {
      $tokens{"$w2.$w1"} = 1;
    }
  }

  foreach my $tok (keys %tokens) {
    #warn "JMD adding $tok => ".decode_sym2words($tok);
    # incr the counter for this token
    $ngram_count{$tok}++;
    $msg_subset_hit{$tok} .= $cp;    # the message subset hit by this tok
  }
}

sub discard_hapaxes {
  my $before = (scalar keys %ngram_count);
  foreach my $tok (keys %ngram_count) {
    if ($ngram_count{$tok} <= 1) {
      delete $ngram_count{$tok};
      delete $msg_subset_hit{$tok};
    }
  }
  my $after = (scalar keys %ngram_count);
  my $killed = ($before - $after);
  logmsg "shrunk dbs: $killed hapaxes killed, kept $after entries";
}

sub proc_text_ham {
  my ($text) = @_;

  my %tokens = ();
  foreach my $line (split(/\[p\]/, $text)) {
    my $w1 = '';
    my $w2 = '';
    my $w3 = '';

    foreach my $w (split(' ', $line)) {
      $w3 = $w2;
      $w2 = $w1;

      $w1 = $word2sym{$w};
      if (!$w1) {
        # since we're deleting, there's no need to add new words
        # to the dictionary; just use the final $sym_acc to mean
        # "unknown ham word", and don't increment it
        $w1 = $sym_acc;
      }

      $tokens{"$w3.$w2.$w1"} = $tokens{"$w3.$w2"} = $tokens{".$w3"} = 1;
    }

    # deal with leftovers
    if ($w2 && $w1) {
      $tokens{"$w2.$w1"} = 1;
    }
  }

  foreach my $tok (keys %tokens) {
    # we're not tracking hits; we're killing false positives.
    # as soon as a single FP appears, kill all record of that token,
    # it cannot be used
    remove_fp_ing_token($tok);
  }
}

sub remove_fp_ing_token {
  my $tok = shift;
  #warn "JMD removing $tok => ".decode_sym2words($tok);
  delete $ngram_count{$tok};
  delete $msg_subset_hit{$tok};
  delete $ngram_count{".".$tok};
  delete $msg_subset_hit{".".$tok};
  delete $ngram_count{"..".$tok};
  delete $msg_subset_hit{"..".$tok};
}

sub filter_into_message_subsets {
  logmsg "filtering into message subsets...";

  $asmstate = { };

  # hash all msg_subset_hit lists; we don't need the full data, so this
  # saves space
  foreach my $id (keys %msg_subset_hit) {
    $msg_subset_hit{$id} = unpack("%32C*", $msg_subset_hit{$id});
  }

  # note: we don't care about stuff that hits *any* ham at all
  $asmstate->{msg_count_spam} = scalar @text_string;
  $asmstate->{msg_count_spam} ||= 0.000001;

  $asmstate->{all_patterns_for_set} = { };

  foreach my $id (keys %ngram_count) {
    my $count = $ngram_count{$id};
    my $bad;

    # must occur more than once!
    if (!defined $count || $count <= 1) {
      $bad++;
    }
    # require N% spam hits
    elsif (($count*100) / $asmstate->{msg_count_spam} < $opt{reqhitrate}) {
      $bad++;
    }

    if ($bad) {
      # we don't need to remember anything about this pattern after here
      remove_fp_ing_token($id);
      next;
    }

    my $set = $msg_subset_hit{$id};
    push @{$asmstate->{all_patterns_for_set}->{$set}}, decode_sym2words($id);
  }

  logmsg "message subsets found: ".(scalar
                                keys %{$asmstate->{all_patterns_for_set}});

  $asmstate->{ngram_count} = \%ngram_count;
  $asmstate->{msg_subset_hit} = \%msg_subset_hit;
  $asmstate->{text_string} = \@text_string;
}

sub decode_sym2words {
  my $ids = shift;
  my $r;
  if ($ids =~ /^([^.]*)\.([^.]*)\.([^.]*)$/) {
    $r = "$sym2word{$1} $sym2word{$2} $sym2word{$3}";
  }
  elsif ($ids =~ /^([^.]*)\.([^.]*)$/) {
    $r = "$sym2word{$1} $sym2word{$2}";
  }
  else {
    warn "bad decode_sym2words format: '$ids'";
  }
  $r =~ s/^\s+//;
  return $r;
}

sub write_state {
  my ($state, $statefile) = @_;
  my $dump = Data::Dumper->new ([ $state ]);
  $dump->Deepcopy(1);
  $dump->Purity(1);
  $dump->Indent(1);
  my $text = $dump->Dump.";1;";
  open (OUT, ">$statefile") or die "cannot write $statefile";
  print OUT $text;
  close OUT or die "cannot write $statefile";
}

# ---------------------------------------------------------------------------
# PHASE 2: REGEXP ASSEMBLY

sub load_state {
  my $file = shift;
  if (open(IN, "<".$file)) {
    my $str = join("", <IN>); close IN;
    my $VAR1;                 # Data::Dumper
    if (eval $str) {
      return $VAR1;        # Data::Dumper's naming
    }
  }
  die "failed to load state from $file";
}

sub assemble_regexps {
  my %done_set = ();
  my @done_pats = ();

  logmsg "deduping and assembling regexps...";
  if (!$opt{rules}) {
    printf ("%6s  %6s  %6s   %s\n", "RATIO", "SPAM%", "HAM%", "DATA");
  }

  $| = 1;
  my $count = 0;
  my $count_out = 0;
  foreach my $id (sort {
              $asmstate->{ngram_count}->{$b} <=> $asmstate->{ngram_count}->{$a}
          } keys %{$asmstate->{ngram_count}})
  {
    my $set = $asmstate->{msg_subset_hit}->{$id};
    next if $done_set{$set}; $done_set{$set}++;

    if ($count++ % 200 == 0) {
      logmsg "working on message subset $count ($count_out)...";
    }

    # we now have several patterns.  see if we can expand them sideways
    # to make the pattern bigger, and collapse into a smaller number of
    # pats at the same time
    my @pats = collapse_pats($asmstate->{all_patterns_for_set}->{$set});
    # my @pats = @{$asmstate->{all_patterns_for_set}->{$set}};

    @pats = quote_all_metas(@pats);
    @pats = subsume_with_dotstars(@pats);
    @pats = expand_with_dots(@pats);
    @pats = ensure_reqpatlength(@pats);
    # at this point, patterns are all valid regexps

    # now check to see if any of these pats have been subsumed in an
    # already-output pattern (one with more hits!)
    my @pats_new = ();
    foreach my $pat (@pats) {
      my $subsumed = 0;
      foreach my $done (@done_pats, @pats_new) {
        # pattern == existing pattern, or existing pattern is contained by
        # pattern, or pattern is contained in existing pattern
        if ($pat eq $done || $pat =~ /\Q${done}\E/ || $done =~ /\Q${pat}\E/) { 
          $subsumed=1; 
          last; 
        }
        # or one pattern contains the other (but interpreted as a regexp!)
        # this deals with /foo.{0,10} bar/ vs /foo ish bar/
        if ($pat =~ /$done/) { 
          $subsumed=1; 
          last; 
        }
        if ($done =~ /$pat/) { 
          $subsumed=1; 
          last; 
        }
      }
      next if $subsumed;

      # add to the current list
      push @pats_new, $pat;
    }

    @pats = @pats_new;
    push @done_pats, @pats_new;

    # if we have no non-subsumed pats at this stage, skip this set
    next unless @pats;

    if (($asmstate->{ngram_count}->{$id}*100) / $asmstate->{msg_count_spam} <
            $opt{reqhitrate})
    {
      # quit if we hit the final hitrate threshold
      return;
    }

    if (defined($hitratethresholds[0]) &&
        ($asmstate->{ngram_count}->{$id}*100) / $asmstate->{msg_count_spam} < $hitratethresholds[0])
    {
      print "# passed hit-rate threshold: $hitratethresholds[0]\n";
      shift @hitratethresholds;
    }

    if ($opt{rules}) {
      printf "# %6.3f  %6.3f  %6.3f\n",
        1.0, ($asmstate->{ngram_count}->{$id}*100) / $asmstate->{msg_count_spam}, 0;

      # sort, to ensure ordering always remains the same
      foreach my $pat (sort @pats) {
        my $name = generate_rule_name($pat);

        if ($opt{ruletype} eq 'header') {
          # deal with header-specific munging.
          # "\[\\n\]" is the result of "[\n]", at this stage
          $pat =~ s/\Q\[\\n\]\E/\\n/gs;
          $pat =~ s/\Q\[\\t\]\E/\\t/gs;
        }

        if ($opt{ruletype} eq 'body') {
          $pat =~ s/^\s/\\s/;
        }

        if ($opt{ruletype} eq 'header') {
          print "$opt{ruletype} $opt{ruleprefix}${name}  ALL =~ /$pat/\n";
        } else {
          print "$opt{ruletype} $opt{ruleprefix}${name}  /$pat/\n";
        }
        $count_out++;
      }

    } else {
      my $pats = '/'.join ('/, /', map { s/\//\\\//gs; $_; } @pats).'/';
      printf "%6.3f  %6.3f  %6.3f  %s\n",
        1.0, ($asmstate->{ngram_count}->{$id}*100) / $asmstate->{msg_count_spam}, 0, $pats;
      $count_out++;
    }
  }
}

sub collapse_pats {
  my $pataryref = $_[0];
  my @ret = ();

  while (1) {
    my $pat = shift(@{$pataryref});
    last unless defined($pat);

    #warn "JMD collapse [$pat]";
    $pat =~ s/^\s+//;

    # TODO: optimise, second-slowest line
    my @hits = grep /\Q$pat\E/, @{$asmstate->{text_string}};

    if (scalar @hits == 0) {
      warn "supposed pattern /$pat/ is 0-hitter";
      warn "JMD strings:\n  ".join("\n  ", @{$asmstate->{text_string}})."\n";
      push @ret, "[BROKEN: 0 hitter]$pat";
      next;
    }

    # we don't have all day!
    my $pat_maxlen = $opt{maxtextread};
    my $s = $hits[0];

    # Now, expand the pattern using a BLAST-style algorithm

    # expand towards start of string
    while (1) {
      my $l = length($pat);
      last if ($l > $pat_maxlen);     # too long

      my $found;
      if ($s =~ /(.)\Q$pat\E/s && $s !~ /\[p\]\Q$pat\E/s) { $found = $1; }

      if (!defined $found) {
        # start of string.  break
        last;
      }

      # give up if there are a differing number of hits for the new pat
      my $newpat = $found.$pat;

      # note: we can search just in @hits, instead of in
      # @{$asmstate->{text_string}}, because there's no way we can
      # *increase* the hitrate in the corpus; we can only reduce it.
      # this is double-checked after these 2 expansion loops anyway
      if (scalar (grep /\Q$newpat\E/, @hits) != scalar @hits) { last; }

      $pat = $newpat;     # and carry on
    }
    # warn "JMD $pat";

    # expand towards end of string
    while (1) {
      if (length($pat) > $pat_maxlen
         || $s =~ /\Q$pat\E\[p\]/s
         || $s !~ /\Q$pat\E(.)/s)
      {
        # end of string.  break
        last;
      }

      my $newpat = $pat.$1;
      if (scalar (grep /\Q$newpat\E/, @hits) != scalar @hits) { last; }

      $pat = $newpat;     # and carry on
    }

    # double-check to ensure we haven't somehow INCREASED our hitrate
    # beyond the initial pattern
    if (scalar (grep /\Q$pat\E/, @{$asmstate->{text_string}}) != scalar @hits) {
      die "oops! went too far";
    }

    # now remove subsumed patterns
    @{$pataryref} = grep { $pat !~ /\Q$_\E/s } @{$pataryref};

    # warn "JMD $pat";
    # skip recording this if it's already inside one of the results
    next if grep { $_ =~ /\Q$pat\E/s } @ret;

    # also, remove cases where this pattern contains previous results
    @ret = grep { $pat !~ /\Q$_\E/s } @ret;

    # warn "JMD $pat";
    push (@ret, $pat);
  }

  return @ret;
}

sub quote_all_metas {
  return map {
    s/([!-+\`\?\^\~\\\/\|\.\{\}\(\)\[\]\@])/\\$1/gs;
    $_;
  } @_;
}

sub subsume_with_dotstars {
  my @working = @_;

  # attempt to resolve . and .*
  my @ret = ();

  while (1) {
    my $p1 = shift(@working);
    last unless defined($p1);

    # TODO: optimise, 4th-slowest line
    my @hits = grep /$p1/, @{$asmstate->{text_string}};

    if (scalar @hits == 0) {
      warn "supposed pattern /$p1/ is 0-hitter";
      push @ret, "[BROKEN: 0 hitter]$p1";
      next;
    }

    foreach my $p2 (@working) {
      next if ($p1 eq $p2);

      DOTLOOP: for my $dotstar
            ( ".", ".?", ".{0,3}", ".{0,5}", ".{0,20}", ".{0,40}" )
      {
        my $newpatcapture = $p1."(".$dotstar.")".$p2;
        my $newpat        = $p1.$dotstar.$p2;

        # drop the attempt if any of the existing hits becomes a miss,
        # or the .* includes a paragraph boundary in any of the hits
        foreach my $t (@hits) {
          next DOTLOOP if ($t !~ /$newpatcapture/);
          next DOTLOOP if ($1 =~ /\[p\]/);
        }
        next if (scalar (grep /$newpat/, @{$asmstate->{text_string}}) != scalar @hits);

        # it works! yay

        # TODO: for /./, see if we can capture the options and
        # construct a /[oO0]/ char class instead
        $p1 = $newpat;
      }
    }

    # skip recording this if it's already inside one of the results
    next if grep { $_ =~ /\Q$p1\E/s } @ret;

    push (@ret, $p1);
  }
  return @ret;
}

sub expand_with_dots {
  my @working = @_;
  my @ret = ();

  my $pat_maxlen = $opt{maxtextread};
  while (1) {
    my $p1 = shift(@working);
    last unless defined($p1);

    # TODO: optimise, 5th-slowest line
    my @hits = grep /$p1/, @{$asmstate->{text_string}};

    if (scalar @hits == 0) {
      warn "supposed pattern /$p1/ is 0-hitter";
      push @ret, "[BROKEN: 0 hitter]$p1";
      next;
    }

    my $s = $hits[0];

    # expand towards end of string (with .)
    while (1) {
      last if (length($p1) >= $pat_maxlen);

      if ($s !~ /$p1\[p\]/s && $s =~ /$p1(.)/s)
      {
        my $extn = $1; quote_all_metas($extn);
        my $newpat = $p1.$extn;

        # TODO: optimise, 3rd-slowest line
        if (scalar (grep /$newpat/, @{$asmstate->{text_string}}) == scalar @hits) {
          $p1 = $newpat; next;
        }
      }

      if ($s !~ /$p1.?\[p\]/s && $s =~ /$p1[^\\](.)/s)
      {
        my $extn = $1; quote_all_metas($extn);
        my $newpat = $p1.".".$extn;
        if (scalar (grep /$newpat/, @{$asmstate->{text_string}}) == scalar @hits) {
          $p1 = $newpat; next;
        }
      }

      if ($s !~ /$p1.{0,2}\[p\]/s && $s =~ /$p1[^\\][^\\](.)/s)
      {
        my $extn = $1; quote_all_metas($extn);
        my $newpat = $p1."..".$extn;
        if (scalar (grep /$newpat/, @{$asmstate->{text_string}}) == scalar @hits) {
          $p1 = $newpat; next;
        }
      }

      last;
    }

    push (@ret, $p1);
  }

  @ret;
}

sub ensure_reqpatlength {
  my @ret = @_;
  if ($opt{reqpatlength}) {
    @ret = grep { (length($_) >= $opt{reqpatlength}) && (length($_) < $opt{maxreqpatlength}) } @ret;
    return () unless @ret;
  }
  return @ret;
}

sub generate_rule_name {
  my $str = shift;
  $str = sha1_base64($str);
  $str =~ s/^(.{6}).*$/$1/gs;
  $str =~ tr/a-z/A-Z/;
  $str =~ s/[^A-Z0-9]/_/gs;
  return $str;
}

# ---------------------------------------------------------------------------
# used by all phases

sub logmsg {
  warn "".(scalar localtime time).": $_[0]\n";
}
