#!/bin/sh

# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

# Written by Theo Van Dinter <felicity@apache.org>
# Please feel free to mail with any questions. :)

# This is a small script used to interact with run-masses to do a full
# corpus mass-check run, including the rsync to the SA server.
#
# NOTE: you MUST set RSYNC_USER and RSYNC_PASSWORD outside of this script so
# that your results can be sent up for review.
#
# By default, it'll do a nightly (set0) run.  If you want to do a weekly
# (set1) run, add "--net" to the commandline.
#
# Defaults for mass-check parameters:
#       no commandline parameters
#		use DEF_AFTER for --after option
#		adds "-n"
#		use nightly-versions.txt for updates
#	--net
#		switches to NET_AFTER for --after option
#		adds "-j 4", "--net", "--reuse"
#		use weekly-versions.txt for updates
#
# i.e. (with the default DEF_AFTER/NET_AFTER values):
#     "run-corpora"       equates to: "mass-check -n --after '-120 days'"
#     "run-corpora --net" equates to: "mass-check -n --net --reuse -j 4 --after '-60 days'"

# Start from a fixed, minimal search path and append /sw/bin only on
# machines that actually have that directory.
PATH=/bin:/usr/bin:/usr/local/bin
[ ! -d /sw/bin ] || PATH="${PATH}:/sw/bin"
export PATH

# ---- Locations -------------------------------------------------------------

# CORPUS: the directory that houses your mail corpus and these scripts
# (run-masses is started from here).
CORPUS="${HOME}/SA/corpus"

# SA_VER: the tree that gets updated and is then used for the nightly/weekly
# run.
SA_VER="${HOME}/SA/spamassassin-corpora"

# OUTDIR: working directory for the temporary log files written during the
# run.  Leave it empty to fall back to "$SA_VER/masses".
OUTDIR=""

# FINALDIR: optional.  When non-empty, the log files are moved there once
# processing has completed.
FINALDIR=""

# ---- Programs --------------------------------------------------------------

# Bare names are looked up through PATH; use an absolute path here when a
# tool lives somewhere PATH does not cover.
SVN="svn"
SVNVERS="svnversion"
WGET="wget"
RSYNC="rsync"

# ---- --after windows -------------------------------------------------------

# DEF_AFTER is the --after value for the set0 run, NET_AFTER the one for the
# set1 run.  Both contain a space and start with "-", so they are kept out of
# the general option string and handed over separately as a single quoted
# argument.
DEF_AFTER='-120 days'
NET_AFTER='-60 days'



# The upload step needs rsync credentials, and they are never stored in this
# script: both RSYNC_USER and RSYNC_PASSWORD have to come from the
# environment.  Two separate tests joined with "||" are used instead of the
# obsolescent "-o" operator, whose parsing is not well defined once test(1)
# receives more than four operands.
if [ -z "$RSYNC_USER" ] || [ -z "$RSYNC_PASSWORD" ]; then
  echo "You need to specify RSYNC_USER and RSYNC_PASSWORD via the environment!" >&2
  exit 2
fi

# Fall back to the default working directory when OUTDIR was left empty.
if [ -z "$OUTDIR" ]; then
  OUTDIR="$SA_VER/masses"
fi

# Everything that follows works with relative paths (ham.log, spam.log, the
# downloaded versions file), so stop right here if the working directory
# cannot be entered instead of carrying on in whatever directory the script
# was started from.
if ! cd "$OUTDIR"; then
  echo "Couldn't change directory to $OUTDIR, aborting!" >&2
  exit 2
fi

# Logs from a previous run are still lying around: refuse to run rather than
# mix them up with this run's output.
if [ -f ham.log ] || [ -f spam.log ]; then
  echo "A previous run still has log files, exiting." >&2
  exit 2
fi

# Defaults; NET is flipped to 1 below when "--net" is on the command line.
NET=0
OPTS="-n"
VERS=nightly
FILENAME=$RSYNC_USER
# Mask only the "other" write bit, so files created from here on stay
# group-writable.
umask 002

# Walk over every command-line argument.  Only "--net" is recognised, all
# other arguments are ignored.  The loop is driven by the argument count
# rather than by "$1" being non-empty, so an empty argument no longer ends
# the scan early and hides a later "--net".
while [ "$#" -gt 0 ]; do
  case "$1" in
    --net) NET=1 ;;
  esac
  shift
done

# Settle the run flavour.  A --net run gets the weekly versions file, its own
# --after window, a "net-" file name prefix and extra options; any other run
# only picks up the default --after window.
if [ "$NET" -eq 1 ]; then
  VERS=weekly
  AFTER=$NET_AFTER
  FILENAME="net-${FILENAME}"
  # --net/--reuse, and more parallelization (-j 4) for this kind of run
  OPTS="${OPTS} --net --reuse -j 4"
else
  AFTER=$DEF_AFTER
fi

# Find out which version we are supposed to run with: download the versions
# file for this run type.  A failed download is retried after a minute; once
# the sixth attempt in a row has failed the run is abandoned.
echo "[Updating $SA_VER]"
COUNT=0
until $WGET -q -nd -m "http://rsync.spamassassin.org/$VERS-versions.txt"; do
  sleep 60
  COUNT=$((COUNT + 1))
  if [ "$COUNT" -gt 5 ]; then
    echo "Couldn't get the $VERS revision version, aborting!" >&2
    exit 2
  fi
done

# NREV: the revision we are expected to test, i.e. the second field of the
# last line of the versions file downloaded above.
NREV=$(tail -n 1 "$VERS-versions.txt" | awk '{print $2}')

# Fail closed.  Without a numeric revision none of the revision comparisons
# can be trusted: an empty or malformed value used to make "[" exit with an
# error, which silently skipped the check instead of stopping the run.
case "$NREV" in
  '' | *[!0-9]*)
    echo "Couldn't determine the $VERS revision from $VERS-versions.txt, aborting!" >&2
    exit 2
    ;;
esac

# svninfo.tmp carries the revision of the tree currently in SA_VER.  If that
# already equals NREV, the versions file has not moved on since the last
# update, which is treated as an error.  A missing file, or one without a
# "Revision:" line, simply does not match and lets the update proceed.
if [ -f "$SA_VER/masses/svninfo.tmp" ]; then
  CREV=$(grep '^Revision:' "$SA_VER/masses/svninfo.tmp" | awk '{print $2}')
  if [ "$CREV" = "$NREV" ]; then
    echo "Looks like a problem with the $VERS-versions.txt update, same rev ($CREV)" >&2
    exit 2
  fi
fi

# Bring SA_VER in line with the tagged build for this run type.  $RSYNC is
# left unquoted so it may carry extra words; the destination is quoted so a
# path containing whitespace survives intact.
if ! $RSYNC -uaqrC --exclude masses/spamassassin/ --delete "rsync.spamassassin.org::tagged_builds/${VERS}_mass_check/" "$SA_VER"; then
  echo "Couldn't rsync update the $VERS spamassassin corpora code" >&2
  exit 2
fi

# Make sure the tree we just synced really is the expected revision.  The
# values are compared as strings behind a non-empty guard, so a missing
# svninfo.tmp or a missing "Revision:" line is reported as a mismatch.  With
# the previous unquoted numeric test an empty value made "[" fail with a
# syntax error, and the run carried on unverified.
CREV=$(grep '^Revision:' "$SA_VER/masses/svninfo.tmp" | awk '{print $2}')
if [ -z "$CREV" ] || [ "$CREV" != "$NREV" ]; then
  echo "Looks like a problem with the rsync area, found rev $CREV but expected rev $NREV" >&2
  exit 2
fi

# Do the run.  OPTS is expanded unquoted on purpose so that it splits into
# individual options.  AFTER contains a space and starts with "-", so it is
# passed separately as one quoted word.  The two paths are quoted so that a
# HOME containing whitespace does not break the command line.  All output of
# run-masses on stdout is discarded.
echo "[Running mass-check '$OPTS' in $CORPUS]"
if [ -n "$AFTER" ]; then
  echo "[Using '$AFTER' for --after setting]"
  "$CORPUS/run-masses" "$SA_VER" $OPTS --after "$AFTER" > /dev/null
else
  "$CORPUS/run-masses" "$SA_VER" $OPTS > /dev/null
fi

# The exit status of run-masses is not consulted; instead both logs must
# exist and be non-empty before we go any further.
if [ ! -s ham.log ] || [ ! -s spam.log ]; then
  echo "There seems to be a problem with either ham.log or spam.log, aborting!" >&2
  exit 1
fi

# now we have our ham.log and spam.log files...
# Tag every log with FILENAME (derived from RSYNC_USER) so it can be told
# apart after the upload.  FILENAME and RSYNC_USER come from the environment,
# hence every expansion is quoted to keep each name a single word.
echo "[Uploading daily corpus logs]"
mv ham.log "ham-$FILENAME.log"
mv spam.log "spam-$FILENAME.log"
mv results.log "results-$FILENAME.log"

# Only the ham and spam logs are uploaded; the results file stays local.  A
# failed upload is reported but deliberately not fatal, so the remaining
# steps of the script still run.
if ! $RSYNC -qPcvzb "ham-$FILENAME.log" "spam-$FILENAME.log" "$RSYNC_USER@rsync.spamassassin.org::corpus/"; then
  echo "There was an error during rsync!" >&2
fi

# File the logs away (only when FINALDIR is configured) and print the
# results.  RESULTDIR tracks wherever results-$FILENAME.log ends up and stays
# "." when nothing is moved.  All path expansions are quoted so directories
# or names containing whitespace are handled as single arguments.
RESULTDIR=.
if [ -n "$FINALDIR" ]; then
  mv -f "ham-$FILENAME.log" "spam-$FILENAME.log" "$FINALDIR"

  # mv's exit status is not consulted; a log that is still sitting in the
  # working directory means the move did not succeed.
  if [ -f "ham-$FILENAME.log" ] || [ -f "spam-$FILENAME.log" ]; then
    echo "There was an error moving files around, aborting!" >&2
    exit 1
  fi

  # The results file goes into the "hf" subdirectory when FINALDIR has one,
  # otherwise straight into FINALDIR.
  RESULTDIR=$FINALDIR
  if [ -d "$FINALDIR/hf" ]; then
    RESULTDIR="$FINALDIR/hf"
  fi
  mv -f "results-$FILENAME.log" "$RESULTDIR"
fi

echo "[Our results]"
cat "$RESULTDIR/results-$FILENAME.log"
