#!/usr/local/bin/perl

# $Id: stripmail.pl,v 1.2 2005/11/16 10:04:59 rej Exp $
#
# Simple mail filter designed as a filter for printing mail messages.
# Removes all headers except for  To, Cc, Bcc, Subject, From and Date.
# If the body has type 
#  text/html		runs it through lynx to remove html'ery
#  application/msword	runs it through antiword
# Also deals with recursive mail (e.g. a mail message 
# contains a mail message that contains a word attachment)
#
# Richard Jones 2005
# 


use Mail::Box::Manager;
use FileHandle;
use Data::Dumper;
use strict;

my $msg = Mail::Message->read(\*STDIN);
print "To: " . $msg->get('to') . "\n";
print "Cc: " . $msg->get('cc') . "\n" if $msg->cc;
print "Bcc: " . $msg->get('bcc') . "\n" if $msg->bcc;
print "Subject: " . $msg->subject . "\n";
print "From: " . $msg->get('from') . "\n";
print "Date: " . $msg->get('date') . "\n";
print "----\n";

# Would be better to remove HTML'ery
my $multi = $msg->isMultipart;

# NB. could also use obj->foreachComponent(CODE) to execute the CODE 
# for each component of the message: the preamble, the epilogue, and 
# each of the parts.
if ($multi) {
  #warn "Multipart";
  my @parts = $msg->parts('RECURSE');
  for(my $i = 0; $i < @parts; $i++) {
    printf("%c", 12) unless $i == 0; # NP
    my $body = $parts[$i]->decoded;
    doPart(\$body);
  }
} else {
  #warn "Not Multipart";
  my $body = $msg->decoded;
  doPart(\$body);
}

sub doPart ($) {
  my $partRef = shift;
  my $type = lc($$partRef->mimeType);

  # if it's HTML, run it through lynx
  my $str = $$partRef->string;
  my $txt;
  if ($type eq 'text/html') {
    $txt = &html2text(\$str);
  }
  # if it's word, run it through antiword
  elsif ($type eq 'application/msword') {
    $txt = &word2text(\$str);
  }
  # guess that it's word
  elsif ($type eq 'application/octet-stream' and
         $$partRef->dispositionFilename() =~ /\.doc$/i) {
    warn $$partRef->dispositionFilename() . " looks like a Word document\n";
    warn "If it's not, you probably want to kill this job now.\n";
    $txt = &word2text(\$str);
  }
  # it's printable text
  elsif ($$partRef->isText) {
    $txt = '';
  }
  # Don't print it if it's binary
  else {
    warn "I'm skipping this component, MIME type: $type\n";
    return;
  }

  if (length($txt)) {
    print $txt;
  } else {
    $$partRef->print;
  }
}

# Convert html to text
sub html2text {
  my $rhtml = shift;  
  if (ref($rhtml) ne 'SCALAR') { return ''; }
  my $LYNX = 'lynx';

  my $body='';
  my $tmp_file = "/tmp/stripmail.$$.tmp";
  my $h_out=new FileHandle;
  my $h_in=new FileHandle;
  
  if (open($h_out, "| $LYNX -stdin -dump -nolog -noredir -localhost -nolist -width=76 1>$tmp_file 2>/dev/null")) {
    print $h_out $$rhtml;
    close $h_out;
    if (-r $tmp_file && open($h_in, "< $tmp_file")) {
      my @newtext_arr=<$h_in>;
      close($h_in);
      unlink $tmp_file;
      my $newtext=join('', @newtext_arr);
      if (length($newtext)) {
        $body = $newtext . "\n[HTML converted to text by lynx]\n";
      }
    }
  }
  return $body;
}

# Convert word to text
sub word2text {
  my $rhtml = shift;  
  if (ref($rhtml) ne 'SCALAR') { return ''; }
  my $ANTIWORD = 'antiword';
  
  my $body='';
  my $doc_file = "/tmp/stripmail.$$.doc";
  my $tmp_file = "/tmp/stripmail.$$.tmp";

  my $h_out=new FileHandle;
  my $h_in=new FileHandle;

  open ($h_out, ">$doc_file");
  print $h_out $$rhtml;
  close $h_out;
  system ("$ANTIWORD -i 0 -t $doc_file 1>$tmp_file 2>/dev/null");
  unlink $doc_file;
  if (-r $tmp_file && open($h_in, "< $tmp_file")) {
    my @newtext_arr=<$h_in>;
    close($h_in);
    unlink $tmp_file;
    my $newtext=join('', @newtext_arr);
    if (length($newtext)) {
      $body = $newtext . "\n[Word converted to text by antiword]\n";
    }
  }
  return $body;
}
