#!/usr/bin/perl # # transcode_email.pl # # Transcode e-mail from RFC822 to XML (see email.dtd). # Requires MailTools package from CPAN. # # Created: August 3, 2000 use strict; use Mail::Address; use Mail::Header; use Mail::Internet; use Mail::Util qw(read_mbox); # some initial error checking if ($#ARGV < 0) { print qq{Usage: $0 mboxfilename where mboxfilename is an RFC822-compliant mailbox. }; exit 1; } if (!(-e $ARGV[0])) { print "Error: File $ARGV[0] does not exist.\n"; exit 1; } # declare some variables my ($mail, $head); # Mail objects my (@messages, $message, @body); # local variables # parse mailbox file and print associated XML print q{ }; @messages = read_mbox($ARGV[0]); foreach $message (@messages) { print "\n"; $mail = new Mail::Internet $message; # clean up message body $mail->tidy_body; $mail->unescape_from; # parse and print headers $head = $mail->head; &transcode_headers($head->header_hashref); # parse and print body print "\n"; @body = &transcode_body(@{$mail->body}); print join("", @body); print "\n"; # parse and print MIME attachments ### [parse and print MIME attachments here] print "\n"; } # fini exit 0; ### functions # transcode_headers # pre: # $header is a hash reference returned by the Mail::Header # object. # post: # Prints transcoded headers as XML. sub transcode_headers { my ($header) = @_; my ($subject); print "\n"; &print_addresses('from', @{$header->{From}}); &print_addresses('to', @{$header->{To}}); &print_addresses('cc', @{$header->{Cc}}); ### [do date parsing here] $subject = join("", @{$header->{Subject}}); $subject =~ s/\n//g; $subject =~ s/\r//g; print " $subject\n"; ### [include other headers here] print "\n"; } # print_addresses # pre: # Takes the mail header type (i.e. from, to, cc) and a list of # address strings as parameters. # post: # Parses raw strings into name and email components, and prints # XML to stdout. sub print_addresses { my ($header, @addresses) = @_; my ($address, @parsed_addresses, $parsed_address); my ($name, $email); foreach $address (@addresses) { @parsed_addresses = Mail::Address->parse($address); foreach $parsed_address (@parsed_addresses) { print " <$header>\n"; $name = $parsed_address->name; $email = $parsed_address->address; if (defined $name) { print " $name\n"; } if (defined $email) { print " $email\n"; } print " \n"; } } } # transcode_body # pre: # Takes the body of an e-mail message as a parameter. The body # should be represented as an array of lines. # post: # Returns transcoded body as an array of lines. sub transcode_body { my (@body) = @_; my ($line, $prevline, @newbody); my $sid = 0; my $S_newparagraph = 0; foreach $line (@body) { # replace reserved XML characters w/ entities $line =~ s/\&/\&\;/g; $line =~ s//\>\;/g; # transcode paragraphs if ($line eq "\n") { if (!$S_newparagraph) { $prevline = pop @newbody; $prevline =~ s/$/<\/p>/; push @newbody, $prevline; $sid++; } $S_newparagraph = 1; } elsif ($S_newparagraph) { $line =~ s/^/

/; $S_newparagraph = 0; } push @newbody, $line; } # add paragraph tags at the beginning and end of the body $newbody[0] =~ s/^/

/; $newbody[$#newbody] =~ s/$/<\/p>/; # return transcoded body return @newbody; }