
alexmv at bestpractical
Apr 30, 2012, 6:03 PM
Views: 103
Permalink
|
|
rt branch, 4.0/encoded-word-whitespace, created. rt-4.0.5-136-g4843514
|
|
The branch, 4.0/encoded-word-whitespace has been created at 484351407de29b9417c4646f362ee10ab1138487 (commit) - Log ----------------------------------------------------------------- commit 5a5f0af5299f5b17f74889751801eb1e88a259a5 Author: Alex Vandiver <alexmv [at] bestpractical> Date: Mon Apr 30 20:52:36 2012 -0400 Remove all whitespace separating adjacent encoded-word chunks RFC 2047 says: When displaying a particular header field that contains multiple 'encoded-word's, any 'linear-white-space' that separates a pair of adjacent 'encoded-word's is ignored. Implement this by explicitly removing all such whitespace. diff --git a/lib/RT/I18N.pm b/lib/RT/I18N.pm index cadf7cc..a277e56 100644 --- a/lib/RT/I18N.pm +++ b/lib/RT/I18N.pm @@ -293,18 +293,24 @@ sub DecodeMIMEWordsToEncoding { $str = MIME::Field::ParamVal->parse($str)->stringify; } + # Pre-parse by removing all whitespace between encoded words + my $encoded_word = qr/ + =\? # =? + ([^?]+?) # charset + (?:\*[^?]+)? # optional '*language' + \? # ? + ([QqBb]) # encoding + \? # ? + ([^?]+) # encoded string + \?= # ?= + /x; + 1 while $str =~ s/($encoded_word)\s+($encoded_word)/$1$5/; + # XXX TODO: use decode('MIME-Header', ...) and Encode::Alias to replace our # custom MIME word decoding and charset canonicalization. We can't do this # until we parse before decode, instead of the other way around. my @list = $str =~ m/(.*?) # prefix - =\? # =? - ([^?]+?) # charset - (?:\*[^?]+)? # optional '*language' - \? # ? - ([QqBb]) # encoding - \? # ? - ([^?]+) # encoded string - \?= # ?= + $encoded_word ([^=]*) # trailing /xgcs; diff --git a/t/mail/mime_decoding.t b/t/mail/mime_decoding.t index 7515e2c..7fc1c93 100644 --- a/t/mail/mime_decoding.t +++ b/t/mail/mime_decoding.t @@ -1,7 +1,7 @@ #!/usr/bin/perl use strict; use warnings; -use RT::Test nodb => 1, tests => 9; +use RT::Test nodb => 1, tests => 11; use_ok('RT::I18N'); @@ -97,3 +97,20 @@ diag q{canonicalize mime word encodings like gb2312}; ); } + +diag q{Whitespace between encoded words should be removed}; +{ + my $str = "=?utf-8?Q?=E3=82=AD?= =?utf-8?Q?=E3=83=A3?="; + is( + RT::I18N::DecodeMIMEWordsToUTF8($str), + "キャ", + "whitespace between encoded words is removed", + ); + + $str = "=?utf-8?Q?=E3=82=AD?= \n =?utf-8?Q?=E3=83=A3?="; + is( + RT::I18N::DecodeMIMEWordsToUTF8($str), + "キャ", + "newlines between encoded words also removed", + ); +} commit 484351407de29b9417c4646f362ee10ab1138487 Author: Alex Vandiver <alexmv [at] bestpractical> Date: Mon Apr 30 20:54:02 2012 -0400 Join all adjacent QP encoded-word chunks, in case octets were split Though not allowed by RFC 2047, some mail clients split multiple octets of a multibyte encoding across different encoded-word chunks. Reassemble adjacent encoded-word chunks into one, as long as they match in encoding, language, and transfer encoding. Only quoted-printable chunks are joined, as joining base64 hunks is much more complex. diff --git a/lib/RT/I18N.pm b/lib/RT/I18N.pm index a277e56..1b0bf6c 100644 --- a/lib/RT/I18N.pm +++ b/lib/RT/I18N.pm @@ -306,6 +306,12 @@ sub DecodeMIMEWordsToEncoding { /x; 1 while $str =~ s/($encoded_word)\s+($encoded_word)/$1$5/; + # Also merge quoted-printable sections together, in case multiple + # octets of a single encoded character were split between chunks. + # Though not valid according to RFC 2047, this has been seen in the + # wild. + 1 while $str =~ s/(=\?[^?]+\?[Qq]\?)([^?]+)\?=\1([^?]+)\?=/$1$2$3?=/i; + # XXX TODO: use decode('MIME-Header', ...) and Encode::Alias to replace our # custom MIME word decoding and charset canonicalization. We can't do this # until we parse before decode, instead of the other way around. diff --git a/t/mail/mime_decoding.t b/t/mail/mime_decoding.t index 7fc1c93..7bd2c86 100644 --- a/t/mail/mime_decoding.t +++ b/t/mail/mime_decoding.t @@ -1,7 +1,7 @@ #!/usr/bin/perl use strict; use warnings; -use RT::Test nodb => 1, tests => 11; +use RT::Test nodb => 1, tests => 13; use_ok('RT::I18N'); @@ -114,3 +114,28 @@ diag q{Whitespace between encoded words should be removed}; "newlines between encoded words also removed", ); } + +diag q{Multiple octets split across QP hunks are correctly reassembled}; +{ + # This passes even without explicit code to handle it because utf8 + # is perl's internal string encoding. + my $str = "=?utf-8?Q?=E3?= =?utf-8?Q?=82?= =?utf-8?Q?=AD?="; + is( + RT::I18N::DecodeMIMEWordsToUTF8($str), + "キ", + "UTF8 character split in three is successfully reassembled", + ); + + # Non-utf8 encodings thus also must be checked + $str = <<EOT; chomp $str; +=?gb2312?q?Chinese(gb2312)=20=20=C3=C0=B9=FA=C7=B0=CB=BE=B7=A8=B2=BF=B3?= + =?gb2312?q?=A4=C3=E6=BC=FB=C8=F8=B4=EF=C4=B7=BA=F3=B3=C6=C6=E4=D7=B4=CC=AC?= + =?gb2312?q?=BA=DC=BA=C3=20=20Chinese=20(gb2312)?= +EOT + is( + RT::I18N::DecodeMIMEWordsToUTF8($str), + "Chinese(gb2312) 美国前司法部长面见萨达姆后称其状态很好 Chinese (gb2312)", + "gb2312 character is successfully reassembled", + ); + +} -----------------------------------------------------------------------
|