# ================================================================== # Gossamer Threads Module Library - http://gossamer-threads.com/ # # GT::Mail::Parse # Author : Scott Beck # $Id: Parse.pm,v 1.69 2003/07/29 21:51:17 jagerman Exp $ # # Copyright (c) 2000 Gossamer Threads Inc. All Rights Reserved. # ================================================================== package GT::Mail::Parse; # =================================================================== # If MIME::Base64 is installed use it - must eval before hand or 5.004_04 # wipes our ISA. my $have_b64 = eval { local $SIG{__DIE__}; require MIME::Base64; import MIME::Base64; if ($] < 5.005) { local $^W; decode_base64('brok'); } 1; }; $have_b64 or *decode_base64 = \>_old_decode_base64; my $use_decode_qp; if ($have_b64 and $MIME::Base64::VERSION >= 2.16 and # Prior versions had decoding bugs defined &MIME::QuotedPrint::decode_qp and ( not defined &MIME::QuotedPrint::old_decode_qp or \&MIME::QuotedPrint::decode_qp != \&MIME::QuotedPrint::old_decode_qp ) ) { $use_decode_qp = 1; } # Pragmas use strict; use vars qw($VERSION $DEBUG $ERRORS $CRLF $CR_LN @ISA); # System modules use Fcntl; # Internal modules use GT::Mail::Parts; use GT::Base; # Inherent from GT::Base for errors and debug @ISA = qw(GT::Base); # Debugging mode $DEBUG = 0; # The package version, both in 1.23 style *and* usable by MakeMaker: $VERSION = substr q$Revision: 1.69 $, 10; # The CRLF sequence: $CRLF = "\n"; # The length of a crlf $CR_LN = 1; # Error messages $ERRORS = { PARSE => "An error occured while parsing: %s", DECODE => "An error occured while decoding: %s", NOPARTS => "Email has no parts!", DEEPPARTS => "Deep recursion dected, email appears to have more than 50 parts!", MALFORMED => "Found (%s) before finding the start of the boundary. Message malformed" }; my %DecoderFor = ( # Standard... '7bit' => 'NBit', '8bit' => 'NBit', 'base64' => 'Base64', 'binary' => 'Binary', 'none' => 'Binary', 'quoted-printable' => 'QuotedPrint', # Non-standard... 'x-uu' => 'UU', 'x-uuencode' => 'UU', ); sub new { # -------------------------------------------------------------------------- # CLASS->new ( # naming => \&naming, # in_file => '/path/to/file/to/parse', # handle => \*FH # ); # ---------------------------------------------- # Class method to get a new object. Calles init # if there are any additional argument. To set # the arguments that are passed to naming call naming # directly. # my $this = shift; my $class = ref $this || $this; my $self = bless { file_handle => undef, parts => [], head_part => undef, }, $class; $self->init(@_) if (@_ > 0); $self->debug("Created new object ($self).") if $self->{_debug} > 1; return $self; } sub init { # -------------------------------------------------------------------------- # $obj->init (%opts); # ------------------- # Sets the options for the current object. # my $self = shift; my $opt = {}; if (@_ == 1 and ref $_[0] eq 'HASH') { $opt = shift } elsif (defined $_[0] and not @_ % 2) { $opt = {@_} } else { return $self->error("BADARGS", "FATAL", "init") } $self->{_debug} = exists($opt->{debug}) ? $opt->{debug} : $DEBUG; for my $m (qw(crlf in_file in_handle in_string)) { $self->$m($opt->{$m}) if defined $opt->{$m}; } } sub crlf { $CRLF = pop || return $CRLF; $CR_LN = length($CRLF); } sub parse { # -------------------------------------------------------------------------- # my $top = $obj->parse; # ---------------------- # Parses the email set in new or init. Also calles init # if there are any arguments passed in. # Returns the top level part object. # my ($self) = @_; # Any additional arguments goto init $self->init(@_) if (@_ > 1); ($self->{string} and ref($self->{string}) eq 'SCALAR') or return $self->error('BADARGS', 'FATAL', "No input was given to parse before parse() was called"); # Recursive function to parse $self->_parse_part(undef, $self->{string}); # parse! # Return top part return $self->{head_part}; } sub parse_head { # -------------------------------------------------------------------------- # my $head = $obj->parse_head; # ---------------------------- # Passes any additional arguments to init. Parses only the top level # header. This saves some overhead if for example all you need to do it # find out who an email is to on a POP3 server. # my ($self, $in) = @_; $in ||= $self->{string}; $in || return $self->error("BADARGS", "FATAL", "No string to parse set!"); # Parse the head return $self->_parse_head($in); } #-------------------------------------------- # Access #-------------------------------------------- sub in_handle { # -------------------------------------------------------------------------- # $obj->in_handle (\*FH); # -------------------- # Pass in a file handle to parse from when parse is called. # my ($self, $value) = @_; if (@_ > 1 and ref $value and defined fileno $value) { read $value, ${$self->{string}}, -s $value; } return $self->{string}; } sub in_file { # -------------------------------------------------------------------------- # $obj->in_file ('/path/to/file'); # -------------------------------- # Pass in the path to a file to parse when parse is called # my $self = shift; my $file = shift; my $io = \do { local *FH; *FH }; open $io, "<$file" or return $self->error("READOPEN", "FATAL", $file, $!); return $self->in_handle($io); } sub in_string { # -------------------------------------------------------------------------- my ($self, $string) = @_; return $self->{string} unless (@_ > 1); if (ref($string) eq 'SCALAR') { $self->{string} = $string; } else { $self->{string} = \$string; } return $self->{string}; } sub size { # -------------------------------------------------------------------------- # my $email_size = $obj->size; # ---------------------------- # Returns the total size of an email. Call this method after the email has # been parsed. # my $self = shift; (@{$self->{parts}} > 0) or return $self->error("NOPARTS", "WARN"); my $size = 0; foreach (@{$self->{parts}}) { $size += $_->size; } return $size; } sub all_parts { # -------------------------------------------------------------------------- # my @parts = $obj->all_parts; # ---------------------------- # Returns a list of all the part object for the current parsed email. # If the email is not multipart this will be just the header part. # return @{shift()->{parts}} } sub top_part { # -------------------------------------------------------------------------- return ${shift()->{parts}}[0]; } #--------------------------------------------- # Internal Methods #--------------------------------------------- sub _parse_head { # -------------------------------------------------------------------------- # Internal Method # --------------- # Parse just the head. Returns the part object. # my ($self, $in) = @_; # Get a new part object my $part = new GT::Mail::Parts; if (ref $in eq 'ARRAY') { $part->extract($in) or return $self->error("PARSE", "WARN", "Couldn't parse head!"); return $part; } $part->extract([map { $_ . $CRLF } split($CRLF => $$in)]) or return $self->error($GT::Mail::Parts::error, 'WARN'); return $part; } sub _parse_part { # -------------------------------------------------------------------------- # Internal Method # --------------- # Parses all the parts of an email and stores them in there parts # object. This function is recursive. # my ($self, $outer_bound, $in, $part) = @_; my $state = 'OK'; # First part is going to be the top level part if (!$part) { $part = new GT::Mail::Parts; $self->{head_part} = $part; } push @{$self->{parts}}, $part; # Get the header for this part my $indx; if (($indx = index($$in, $CRLF)) == 0) { substr($$in, 0, $CR_LN) = ''; } else { $indx = index($$in, ($CRLF . $CRLF)); if ($indx == -1) { $self->debug('Message has no body.') if $self->{_debug}; $indx = length($$in); } $part->extract([map { $_ . $CRLF } split($CRLF => substr($$in, 0, $indx))]) or return $self->error($GT::Mail::Parts::error, 'WARN'); substr($$in, 0, $indx + ($CR_LN * 2)) = ''; } # Get the mime type my ($type, $subtype) = split('/', $part->mime_type); $type ||= 'text'; $subtype ||= 'plain'; if ($self->{_debug}) { my $name = $part->recommended_filename || '[unnamed]'; $self->debug("Type is '$type/$subtype' ($name)"); } # Deal with the multipart type with some recursion if ($type eq 'multipart') { my $retype = (($subtype eq 'digest') ? 'message/rfc822' : ''); # Find the multipart boundary my $inner_bound = $part->multipart_boundary; $self->debug("Boundary is $inner_bound") if $self->{_debug} > 1; defined $inner_bound or return $self->error("PARSE", "WARN", "No multipart boundary in multipart message."); index($inner_bound, $CRLF) == -1 or return $self->error("PARSE", "WARN", "CR or LF in multipart boundary."); # Parse the Preamble $self->debug("Parsing preamble.") if $self->{_debug} > 1; $state = $self->_parse_preamble($inner_bound, $in, $part) or return; chomp($part->preamble->[-1]) if @{$part->preamble}; # Get all the parts of the multipart message my $partno = 0; my $parts; while (1) { ++$partno < 200 or return $self->error('DEEPPARTS', 'WARN'); $self->debug("Parsing part $partno.") if $self->{_debug}; ($parts, $state) = $self->_parse_part($inner_bound, $in, new GT::Mail::Parts) or return; ($state eq 'EOF') and return $self->error('PARSE', 'WARN', 'Unexpected EOF before close.'); $parts->mime_type($retype) if $retype; push(@{$part->{parts}}, $parts); last if $state eq 'CLOSE'; } # Parse the epilogue $self->debug("Parsing epilogue.") if $self->{_debug} > 1; $state = $self->_parse_epilogue($outer_bound, $in, $part) or return; chomp($part->epilogue->[-1]) if @{$part->epilogue} and $state ne 'EOF'; } # We are on a single part else { $self->debug("Decoding single part.") if $self->{_debug} > 1; # Find the encoding for the body of the part my $encoding = $part->mime_encoding || 'binary'; if (!exists($DecoderFor{lc($encoding)})) { $self->debug("Unsupported encoding '$encoding': using 'binary'... \n" . "The entity will have an effective MIME type of \n" . "application/octet-stream, as per RFC-2045.") if $self->{_debug}; $part->effective_type('application/octet-stream'); $encoding = 'binary'; } my $reparse = (("$type/$subtype" eq "message/rfc822")); my $encoded = ""; # If we have boundaries we parse the body to the boundary if (defined $outer_bound) { $self->debug("Parsing to boundary.") if $self->{_debug} > 1; $state = $self->_parse_to_bound($outer_bound, $in, \$encoded) or return; } # Else we would parse the rest of the input stream as the rest of the message else { $self->debug("No Boundries.") if $self->{_debug} > 1; $encoded = $$in; $state = 'EOF'; } # Normal part so we get the body and decode it. if (!$reparse) { $self->debug("Not reparsing.") if $self->{_debug} > 1; $part->{body_in} = 'MEMORY'; my $decoder = $DecoderFor{lc($encoding)}; $self->debug("Decoding part using: " . lc($encoding)) if $self->{_debug}; $part->{data} = ''; my $ref = ''; my $res = $self->$decoder(\$encoded, \$ref); undef $encoded; $res or return; $part->{data} = $ref; } else { # If have an embeded email we reparse it. $self->debug("Reparsing enclosed message.") if $self->{_debug}; my $out = ''; my $decoder = $DecoderFor{lc($encoding)}; $self->debug("Decoding " . lc($encoding)) if $self->{_debug}; my $res = $self->$decoder(\$encoded, \$out); undef $encoded; $res or return; my $p = new GT::Mail::Parts; push @{$part->{parts}}, $p; $self->_parse_part(undef, \$out, $p) or return; } } return ($part, $state); } sub _parse_to_bound { # -------------------------------------------------------------------------- # This method takes a boundary ($bound), an input string ref ($in), and an # output string ref ($out). It will place into $$out the data contained # by $bound, and remove the entire region (including boundary) from $$in. # my ($self, $bound, $in, $out) = @_; # Set up strings for faster checking: my ($delim, $close) = ("--$bound", "--$bound--"); $self->debug("Parsing bounds. Skip until\n\tdelim ($delim)\n\tclose ($close)") if $self->{_debug} > 1; my ($pos, $ret); # Place our part in $$out. $$out = undef; if (defined($pos = index($$in, "$CRLF$delim$CRLF")) and $pos != -1) { $$out = substr($$in, 0, $pos); substr($$in, 0, $pos + length("$CRLF$delim$CRLF")) = ""; $ret = 'DELIM'; } elsif (index($$in, "$delim$CRLF") == 0) { substr($$in, 0, length("$delim$CRLF")) = ""; $$out = ""; $ret = 'DELIM'; } elsif (defined($pos = index($$in, "$CRLF$close$CRLF")) and $pos != -1) { $$out = $$in; substr($$out, -(length($$out) - $pos)) = ''; my $len = (length($$in) - (length("$CRLF$close$CRLF") + $pos)) * -1; if ($len == 0) { $$in = ''; } else { $$in = substr($$in, $len); } $ret = 'CLOSE'; } elsif (index($$in, "$CRLF$close") == (length($$in) - length("$CRLF$close"))) { $$out = substr($$in, 0, length($$in) - length("$CRLF$close")); $$in = ""; $ret = 'CLOSE'; } elsif (index($$in, "$close$CRLF") == 0) { $$out = ""; substr($$in, 0, length("$close$CRLF")) = ""; $ret = 'CLOSE'; } elsif (index($$in, $close) == 0 and (length($$in) == length($close))) { $$out = ""; $$in = ""; $ret = 'CLOSE'; } if (defined $$out) { return $ret; } else { # Broken Email, retype to text/plain $self->{parts}->[$#{$self->{parts}}]->set('content-type' => 'text/plain'); $$out = $$in; return 'CLOSE'; } } sub _parse_preamble { # -------------------------------------------------------------------------- # Internal Method # --------------- # Parses preamble and sets it in part. # my ($self, $inner_bound, $in, $part) = @_; my $loc; my ($delim, $close) = ("--$inner_bound", "--$inner_bound--"); $self->debug("Parsing preamble. Skip until\n\tdelim ($delim)\n\tclose ($close)") if $self->{_debug} > 1; my @saved; $part->preamble(\@saved); my ($data, $pos, $len); if (index($$in, "$delim$CRLF") == 0) { $data = ''; substr($$in, 0, length("$delim$CRLF")) = ''; } else { $pos = index($$in, "$CRLF$delim$CRLF"); if ($pos >= 0) { $data = substr($$in, 0, $pos); substr($$in, 0, $pos + length("$CRLF$delim$CRLF")) = ''; } elsif ($pos == -1) { return $self->error('PARSE', 'WARN', "Unabel to find opening boundary: " . "$delim\n" . "Message is probably corrupt."); } } push @saved, split $CRLF => $data; undef $data; return 'DELIM'; } sub _parse_epilogue { # -------------------------------------------------------------------------- # Internal Method # --------------- # Parses epilogue and sets it in part. # my ($self, $outer_bound, $in, $part) = @_; my $loc; my ($delim, $close) = ("--$outer_bound", "--$outer_bound--") if defined $outer_bound; $self->debug("Parsing epilogue. Skip until\n\tdelim (" . ($delim || '') . ")\n\tclose (" . ($close || '') . ")") if $self->{_debug} > 1; my @saved; $part->epilogue(\@saved); if (defined $outer_bound) { if ($$in =~ s/(.*?)(?:\A|$CRLF)\Q$delim\E$CRLF//s) { push(@saved, split($CRLF => $1)); $self->debug("Found delim($delim)") if $self->{_debug}; return 'DELIM' } elsif ($$in =~ s/(.*?)(?:\A|$CRLF)\Q$close\E(?:\Z|$CRLF)//s) { push(@saved, split($CRLF => $1)); $self->debug("Found close($close)") if $self->{_debug}; return 'CLOSE' } } push(@saved, split($CRLF => $$in)); $$in = ''; $self->debug("EOF: epilogue is " . length(join '', @saved) . " bytes") if $self->{_debug}; return 'EOF'; } sub Base64 { # -------------------------------------------------------------------------- my ($self, $in, $out) = @_; # Remove any non base64 characters. $$in =~ tr{A-Za-z0-9+/}{}cd; # Must pass multiple of 4 to decode_base64. Store any remainder in $rem_str and # pad it with trailing equal signs. my $rem = length($$in) % 4; my ($rem_str); if ($rem) { my $pad = '=' x (4 - $rem); $rem_str = substr($$in, length($$in) - $rem); $rem_str .= $pad; substr($$in, $rem * -1) = ''; } $$out = decode_base64($$in); if ($rem) { $$out .= decode_base64($rem_str); } return 1; } sub Binary { # -------------------------------------------------------------------------- my ($self, $in, $out) = @_; $$out = $$in; return 1; } sub NBit { # -------------------------------------------------------------------------- my ($self, $in, $out) = @_; $$out = $$in; return 1; } sub QuotedPrint { # -------------------------------------------------------------------------- my ($self, $in, $out) = @_; if ($use_decode_qp) { $$out = MIME::QuotedPrint::decode_qp($$in); } else { $$out = $$in; $$out =~ s/\r\n/\n/g; # normalize newlines $$out =~ s/[ \t]+\n/\n/g; # rule #3 (trailing whitespace must be deleted) $$out =~ s/=\n//g; # rule #5 (soft line breaks) $$out =~ s/=([\da-fA-F]{2})/chr hex $1/ge; } return 1; } sub UU { # -------------------------------------------------------------------------- my ($self, $in, $out) = @_; my ($mode, $file); # Find beginning... while ($$in =~ s/^(.+$CRLF)//o) { local $_ = $1; last if ($mode, $file) = /^begin\s*(\d*)\s*(\S*)/; } return $self->error("uu decoding: no begin found", 'WARN') if (!defined($_)); # Decode: while ($$in =~ s/^(.+$CRLF)//o) { local $_ = $1; last if /^end/; next if /[a-z]/; next unless int((((ord() - 32) & 077) + 2) / 3) == int(length($_) / 4); $$out .= unpack('u', $_); } return 1; } sub gt_old_decode_base64 { # -------------------------------------------------------------------------- my $str = shift; my $res = ""; $str =~ tr|A-Za-z0-9+=/||cd; $str =~ s/=+$//; $str =~ tr|A-Za-z0-9+/| -_|; return "" unless length $str; my $uustr = ''; my ($i, $l); $l = length($str) - 60; for ($i = 0; $i <= $l; $i += 60) { $uustr .= "M" . substr($str, $i, 60); } $str = substr($str, $i); # and any leftover chars if ($str ne "") { $uustr .= chr(32 + length($str)*3/4) . $str; } return unpack("u", $uustr); } 1; __END__ =head1 NAME GT::Mail::Parse - MIME Parse =head1 SYNOPSIS use GT::Mail::Parse my $parser = new GT::Mail::Parse ( naming => \&name_files, in_file => '/path/to/file.eml', debug => 1 ); my $top = $parser->parse or die $GT::Mail::Parse::error; - or - my $parser = new GT::Mail::Parse; open FH, '/path/to/file.eml' or die $!; my $top = $parser->parse ( naming => \&name_files, handle => \*FH, debug => 1 ) or die $GT::Mail::Parse::error; close FH; - or - my $parser = new GT::Mail::Parse; my $top_head = $parser->parse_head ( naming => \&name_files, in_file => '/path/to/file.eml', debug => 1 ) or die $GT::Mail::Parse::error; =head1 DESCRIPTION GT::Mail::Parse is a 100% rfc822 email MIME parser that supports unlimited nested levels of MIME. Emails are parsed into L objects. Each part knows where it's body is and each part contains it's sub parts. See L for details on parts methods. =head2 new - Constructor method This is the constructor method to get a GT::Mail::Parse object, which you need to access all the methods (there are no Class methods). new() takes a hash or hash ref as it's arguments. Each key has an accessor method by the same name except debug, which can only be set by passing debug to new(), parse() or parse_head(). =over 4 =item debug Sets the debug level for this insance of the class. =item naming Specify a code reference to use as a naming convention for each part of the email being parsed. This is useful to keep file IO down when you want the emails seperated into each part as a file. If this is not specified GT::Mail::Parse uses a default naming, which is to start at one and incriment that number for each attachment. The attachments would go in the current working directory. =item in_file Specify the path to the file that contains the email to be parsed. One of in_file and handle must be specified. =item handle Specify the file handle or IO stream that contains the email to be parsed. =back =head2 parse - Parse an email Instance method. Parses the email specified by either in_file or handle. Returns the top level L object. Any additional parameters passed in are treated the same as if they were passed to the constuctor. =head2 parse_head - Parse just the header of the email Instance method. This method is exactly the same as parse except only the top level header is parsed and it's part object returned. This is useful to keep overhead down if you only need to know about the header of the email. =head2 size - Get the size Instance method. Returns the total size in bytes of the parsed unencoded email. This method will return undef if no email has been parsed. =head2 all_parts - Get all parts Instance method. Returns all the parts in the parsed email. This is a flatened list of the objects. Somewhat similar to what MIME::Tools does. All the parts still contain there sub parts. =head1 COPYRIGHT Copyright (c) 2000 Gossamer Threads Inc. All Rights Reserved. http://www.gossamer-threads.com/ =head1 VERSION Revision: $Id: Parse.pm,v 1.69 2003/07/29 21:51:17 jagerman Exp $