#!/usr/bin/perl

# Recipient of the "early runners" report mail.
# Alternate recipient, kept for reference:
#my $email_to = 'pds@apache.org';
my $email_to = q{ruleqa@spamassassin.apache.org};

use strict;
use Getopt::Long;

# --dir=PATH selects the corpus directory that every step below works on.
our $corpusdir;
GetOptions('dir=s' => \$corpusdir);

use File::Path;
use File::Copy;
use Time::ParseDate;
use Cwd;
use POSIX qw(nice strftime);

# Raise our niceness by 15 so this housekeeping run yields to other work.
nice(15);

# Per-file and per-revision state shared by the subs below.
my %revision = ();         # log file name => SVN revision from its header
my %logs_by_rev = ();      # SVN revision  => array ref of log file names
my %is_net_revision = ();  # SVN revision  => 1 if any of its logs is a "-net-" log
my %dateline = ();         # log file name => raw value of the "# Date:" header
my %time = ();             # log file name => epoch of 09:00 UTC on the log's date
my @files;                 # revision-tagged log files found by read_files
my $time_start = time;     # reference "now" for the retention checks
my %revision_date = ();    # SVN revision  => earliest %time value among its logs
my %before_nine = ();      # original log name => epoch start time, for logs
                           # whose Date header is earlier than 09:00 UTC

# Retention periods in seconds: 9 days for revisions that have "-net-" logs,
# 3 days for all other revisions.
my $delete_weekly = 60*60*24*9;
my $delete_nightly = 60*60*24*3;

# Without --dir every path below would be built from an undefined prefix and
# the run would silently do nothing, so fail loudly instead.
die "usage: $0 --dir=<corpus directory>\n"
  unless defined $corpusdir && length $corpusdir;

# Plain calls: the "&name;" form would implicitly forward the current @_.
rename_corpus();
read_files();
cleanup_old();
email_beforenine();

# Tag freshly submitted corpus logs with their SVN revision.
#
# Looks in $corpusdir for plain files named "spam-*.log" / "ham-*.log"
# (optionally "-net-") that are less than 10 days old and do not yet end in
# ".rNNN.log".  For each one the leading "#" header lines are read:
#   - "# Date: YYYYMMDDThhmmssZ": if that time is earlier than 09:00 UTC on
#     the same day, the file is recorded in %before_nine (original file name
#     => epoch start time).
#   - "# SVN revision: REV": the file "foo.log" is renamed to "foo.rREV.log".
# Files without a revision header are left untouched.  Problems are reported
# with warn() and the affected file is skipped.
sub rename_corpus {
  my $dh;
  unless (opendir($dh, $corpusdir)) {
    warn "cannot open directory $corpusdir: $!";
    return;
  }
  my @rfiles = sort readdir($dh);
  closedir($dh);

  @rfiles = grep {
    /^(?:spam|ham)-(?:net-)?[-\w]+\.log$/ && !(/\.r[0-9]+\.log$/) && -f "$corpusdir/$_" && -M _ < 10
  } @rfiles;

  foreach my $file (@rfiles) {
    my $path = "$corpusdir/$file";

    my $fh;
    unless (open($fh, '<', $path)) {
      # Skip the file instead of reading from a handle that never opened.
      warn "cannot read $path: $!";
      next;
    }

    my $rev;
    while (my $line = <$fh>) {
      # Only the leading block of "#" header lines is of interest.
      last if $line !~ /^#/;

      if ($line =~ m/^# Date:\s*(\S+)/) {
        my $date_line = $1;

        # Ignore Date headers that are not in the expected compact UTC form
        # rather than handing undefined fields to parsedate().
        if (my ($yyyy, $mm, $dd, $h, $m, $s) =
              $date_line =~ /(\d\d\d\d)(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)Z/) {

          my $timet = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} ${h}:${m}:${s} GMT+0",
                    GMT => 1, PREFER_PAST => 1);

          my $timetgt = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} 09:00:00 GMT+0",
                    GMT => 1, PREFER_PAST => 1);

          if (defined $timet && defined $timetgt && $timet < $timetgt) {
            $before_nine{$file} = $timet;
          }
        }
      }
      if ($line =~ m/^# SVN revision:\s*(\S+)/) {
        $rev = $1;
      }
    }

    close($fh);

    if ($rev) {
      # NOTE(review): $rev comes straight from the log header and is not
      # restricted to digits here; later processing only matches ".r<digits>.log".
      my $newfile = $file;
      $newfile =~ s/\.log$/.r$rev.log/;
      rename($path, "$corpusdir/$newfile")
        or warn "cannot rename $path to $corpusdir/$newfile: $!";
    }

  }
}

# Index the revision-tagged corpus logs.
#
# Fills @files with the plain files in $corpusdir named
# "(spam|ham)-[net-]*.rNNN.log" that are less than 10 days old, then reads the
# leading "#" header lines of each and records:
#   %dateline        file => raw "# Date:" value
#   %time            file => epoch of 09:00 UTC on the day named by that date
#                    (the time of day in the header is deliberately not used)
#   %revision        file => value of the "# SVN revision:" header
#   %logs_by_rev     revision => list of files carrying it
#   %is_net_revision revision => 1 when one of its files has "-net-" in its name
#   %revision_date   revision => smallest %time value among its files
# Unreadable files are reported with warn() and skipped.
sub read_files {
  my $dh;
  unless (opendir($dh, $corpusdir)) {
    warn "cannot open directory $corpusdir: $!";
    return;
  }
  @files = sort readdir($dh);
  closedir($dh);

  @files = grep {
    /^(?:spam|ham)-(?:net-)?[-\w]+\.r[0-9]+\.log$/ && -f "$corpusdir/$_" && -M _ < 10
  } @files;

  foreach my $file (@files) {
    my $path = "$corpusdir/$file";

    my $fh;
    unless (open($fh, '<', $path)) {
      # Skip the file instead of reading from a handle that never opened.
      warn "cannot read $path: $!";
      next;
    }

    while (my $line = <$fh>) {
      # Only the leading block of "#" header lines is of interest.
      last if $line !~ /^#/;

      if ($line =~ m/^# Date:\s*(\S+)/) {
        $dateline{$file} = $1;

        # Only dates in the compact UTC form are usable; anything else leaves
        # %time unset so the file takes no part in the retention decision.
        if (my ($yyyy, $mm, $dd) =
              $dateline{$file} =~ /(\d\d\d\d)(\d\d)(\d\d)T\d\d\d\d\d\dZ/) {

          my $timetgt = Time::ParseDate::parsedate("${yyyy}/${mm}/${dd} 09:00:00 GMT+0",
                    GMT => 1, PREFER_PAST => 1);

          $time{$file} = $timetgt;
        }
      }
      if ($line =~ m/^# SVN revision:\s*(\S+)/) {
        my $rev = $1;
        $revision{$file} = $rev;

        push (@{$logs_by_rev{$rev}}, $file);

        if ($file =~ /-net-/) {
          $is_net_revision{$rev} = 1;
        }
      }
    }
    close($fh);

    if ($time{$file} && $revision{$file}) {
      my $rev = $revision{$file};

      # Keep the earliest date seen for any file of this revision.
      if (!defined $revision_date{$rev} || $time{$file} < $revision_date{$rev}) {
        $revision_date{$rev} = $time{$file};
      }
    }
  }
}

# Delete the logs of revisions that have outlived their retention period.
#
# A revision expires when its date in %revision_date is older than
# $time_start minus $delete_weekly (revisions flagged in %is_net_revision) or
# minus $delete_nightly (all others).  Every file listed for an expired
# revision in %logs_by_rev is unlinked from $corpusdir; a failed unlink is
# reported with warn() and does not stop the remaining deletions.
sub cleanup_old {
  my @expired = ();

  foreach my $rev (keys %revision_date) {
    # Retention depends on whether this revision has "-net-" logs.
    my $max_age = $is_net_revision{$rev} ? $delete_weekly : $delete_nightly;
    next unless $revision_date{$rev} < ($time_start - $max_age);

    # Build full paths directly; no second, shadowing list is needed.
    push(@expired, map { "$corpusdir/$_" } @{ $logs_by_rev{$rev} || [] });
  }

  foreach my $path (@expired) {
    unlink($path) or warn "cannot delete $path: $!";
  }
}

# Mail a report of the logs recorded in %before_nine (runs whose Date header
# is earlier than 09:00 UTC) to $email_to through /usr/sbin/sendmail.
#
# Does nothing when %before_nine is empty.  Entries are listed in file-name
# order.  Failure to start sendmail, or a failing close of the pipe, is
# reported with warn().
sub email_beforenine {
  return unless %before_nine;

  my $from = 'automc@sa-vm.apache.org';
  my $subject = '[corpus-cleanup] Early runners';
  my $message = "The following files were submitted by early runners:\n\n";
  foreach my $file (sort keys %before_nine) {
    # The broken-down time comes from gmtime(), so state the UTC offset
    # literally: strftime's %z reports the host's local zone, which would
    # mislabel the time on a machine that is not running in UTC.
    my $time = strftime("%Y-%m-%d %H:%M:%S +0000", gmtime($before_nine{$file}));
    $message .= "$file - Started at $time\n";
  }
  $message .= "\nPlease run automasscheck after 0900 UTC";

  # List-form pipe open: no shell involved, and the result is checked.
  my $mail;
  unless (open($mail, '|-', '/usr/sbin/sendmail', '-t')) {
    warn "cannot run /usr/sbin/sendmail: $!";
    return;
  }

  # Email Header
  print $mail "To: $email_to\n";
  print $mail "From: $from\n";
  print $mail "Subject: $subject\n";
  print $mail "MIME-Version: 1.0\n";
  print $mail "Content-Type: text/plain; charset=UTF-8\n";
  print $mail "Content-Transfer-Encoding: 8bit\n";
  print $mail "\n";
  # Email Body
  print $mail $message;

  # For a pipe, close() waits for sendmail and fails if it exited non-zero.
  close($mail) or warn "sendmail did not finish cleanly (status $?)";

}

