File: /volume1/@appstore/MailPlus-Server/lib/MIMEDefang/MailPlusServer/ContentScanner.pm
package MailPlusServer::ContentScanner;
use strict;
use warnings;
use Encode;
use MIME::Base64;
use HTML::TokeParser;
use HTML::TokeParser::Simple;
use HTML::Parser;
use MailPlusServer::Log;
use MailPlusServer::Util;
use MailPlusServer::PhishingLinkChecker;
$MailPlusServer::ContentScanner::YES = 'yes';
$MailPlusServer::ContentScanner::NO = 'no';
$MailPlusServer::ContentScanner::DISARM = 'disarm';
sub new {
my $class = shift;
my $config = shift;
my $self = {};
$self->{enable_dangerous_content_scan} = $config->{enable_dangerous_content_scan};
$self->{reject_partial_message} = $config->{reject_partial_message};
$self->{reject_external_message_bodies} = $config->{reject_external_message_bodies};
$self->{convert_html_to_text} = $config->{convert_html_to_text};
$self->{find_phishing_fraud} = $config->{find_phishing_fraud};
$self->{allow_iframe_tags} = $config->{allow_iframe_tags};
$self->{allow_form_tags} = $config->{allow_form_tags};
$self->{allow_script_tags} = $config->{allow_script_tags};
$self->{allow_webbugs} = $config->{allow_webbugs};
$self->{allow_object_codebase_tags} = $config->{allow_object_codebase_tags};
$self->{need_to_scan_html} = 0;
if ($self->{find_phishing_fraud} ||
$MailPlusServer::ContentScanner::YES ne $self->{allow_iframe_tags} ||
$MailPlusServer::ContentScanner::YES ne $self->{allow_form_tags} ||
$MailPlusServer::ContentScanner::YES ne $self->{allow_script_tags} ||
$MailPlusServer::ContentScanner::YES ne $self->{allow_object_codebase_tags} ||
$MailPlusServer::ContentScanner::YES ne $self->{allow_webbugs}) {
$self->{need_to_scan_html} = 1;
}
# Read phishing blacklist and whitelist in the beginning to reduce efforts
MailPlusServer::PhishingLinkChecker::ReadPhishingBlacklistWhitelist();
bless ($self, $class);
}
sub is_partial_message {
my ($self, $entity) = @_;
my $type = $entity->head->mime_attr('content-type');
return ($type && lc($type) eq 'message/partial') ? 1 : 0;
}
sub is_external_message {
my ($self, $entity) = @_;
my $type = $entity->head->mime_attr('content-type');
return ($type && lc($type) eq 'message/external-body') ? 1 : 0;
}
sub is_html_message {
my ($self, $entity) = @_;
my $content_type = $entity->head->mime_attr('content-type');
my $content_disposition = $entity->head->mime_attr('content-disposition');
if ((defined($content_type) && $content_type =~ /text\/html/i) &&
(!defined($content_disposition) || $content_disposition !~ /attachment/i)) {
return 1;
}
return 0;
}
sub scan_mimetype {
my ($self, $entity) = @_;
return ($MailPlusServer::Util::ACCEPT, '') unless $self->{enable_dangerous_content_scan};
if ($self->{reject_partial_message} and $self->is_partial_message($entity)) {
return ($MailPlusServer::Util::REJECT, 'MIME type message/partial not accepted here');
}
if ($self->{reject_external_message_bodies} and $self->is_external_message($entity)) {
return ($MailPlusServer::Util::REJECT, 'MIME type message/external-body not accepted here');
}
return ($MailPlusServer::Util::ACCEPT, '');
}
sub scan {
my ($self, $entity) = @_;
return $MailPlusServer::Util::ACCEPT unless $self->{enable_dangerous_content_scan};
if ($self->{convert_html_to_text}) {
if ($self->strip_html($entity) > 0) {
return $MailPlusServer::Util::ACCEPT_REBUILD;
}
return $MailPlusServer::Util::ACCEPT;
}
# Scan html to disarm tags and do phishing detection
if ($self->{need_to_scan_html}) {
my ($action, $report) = $self->scan_html($entity);
return ($action, $report);
}
return $MailPlusServer::Util::ACCEPT;
}
sub scan_html_entity {
my ($self, $entity) = @_;
my($htmlname_old, $htmlname_new, $fh);
$htmlname_old = $entity->bodyhandle->path();
$htmlname_new = $htmlname_old;
$htmlname_new =~ s/\..?html?$//i;
$htmlname_new .= '_disarm.html';
my %disarm_log_hash = (
'iframe' => 'iframe html tag is not allowed',
'form' => 'form html tag is not allowed',
'script' => 'script html tag is not allowed',
'object' => 'object html tag is not allowed',
'web_bug' => 'web bug is not allowed'
);
my %disarm_item_met_hash = ();
$fh = new FileHandle;
unless ($fh->open(">$htmlname_new")) {
MailPlusServer::Log::ErrorLog('Could not create file %s', $htmlname_new);
return;
}
my $parser = HTML::TokeParser::Simple->new($htmlname_old);
my ($action, $report) = ($MailPlusServer::Util::ACCEPT, '');
# for phishing link check
my $link_checking = 0;
my $link_text = '';
my $link_url = '';
my $link_base_url = '';
# FIXME: need to change warning text ?
my $link_warning_start = '<font color="red"><b>MailPlus Server has detected a possible fraud attempt from';
my $link_warning_end = 'claiming to be</b></font>';
my $object_disarming = 0;
while (my $token = $parser->get_token()) {
if ($token->is_tag('iframe')) {
if ($self->{allow_iframe_tags} eq $MailPlusServer::ContentScanner::NO) {
($action, $report) = ($MailPlusServer::Util::REJECT, $disarm_log_hash{'iframe'});
last;
}
if ($self->{allow_iframe_tags} eq $MailPlusServer::ContentScanner::DISARM) {
print $fh '<iframe_disarmed>' if $token->is_start_tag('iframe');
print $fh '</iframe_disarmed>' if $token->is_end_tag('iframe');
$action = $MailPlusServer::Util::ACCEPT_REBUILD;
$disarm_item_met_hash{'iframe'} = 1;
next;
}
}
if ($token->is_tag('form')) {
if ($self->{allow_form_tags} eq $MailPlusServer::ContentScanner::NO) {
($action, $report) = ($MailPlusServer::Util::REJECT, $disarm_log_hash{'form'});
last;
}
if ($self->{allow_form_tags} eq $MailPlusServer::ContentScanner::DISARM) {
print $fh '<form_disarmed>' if $token->is_start_tag('form');
print $fh '</form_disarmed>' if $token->is_end_tag('form');
$action = $MailPlusServer::Util::ACCEPT_REBUILD;
$disarm_item_met_hash{'form'} = 1;
next;
}
}
if ($token->is_tag('script')) {
if ($self->{allow_script_tags} eq $MailPlusServer::ContentScanner::NO) {
($action, $report) = ($MailPlusServer::Util::REJECT, $disarm_log_hash{'script'});
last;
}
if ($self->{allow_script_tags} eq $MailPlusServer::ContentScanner::DISARM) {
print $fh '<script_disarmed>' if $token->is_start_tag('script');
print $fh '</script_disarmed>' if $token->is_end_tag('script');
$action = $MailPlusServer::Util::ACCEPT_REBUILD;
$disarm_item_met_hash{'script'} = 1;
next;
}
}
if ($token->is_start_tag('object')) {
my $attr_href = $token->get_attr();
my $is_object_codebase = exists $attr_href->{'codebase'} || exists $attr_href->{'data'};
if (!$is_object_codebase) {
print $fh $token->as_is();
next;
}
if ($self->{allow_object_codebase_tags} eq $MailPlusServer::ContentScanner::NO) {
($action, $report) = ($MailPlusServer::Util::REJECT, $disarm_log_hash{'object'});
last;
}
if ($self->{allow_object_codebase_tags} eq $MailPlusServer::ContentScanner::DISARM) {
print $fh '<object_disarmed>' if $token->is_start_tag('object');
$action = $MailPlusServer::Util::ACCEPT_REBUILD;
$object_disarming = 1;
$disarm_item_met_hash{'object'} = 1;
next;
}
} elsif ($token->is_end_tag('object') && $object_disarming) {
print $fh '</object_disarmed>';
$object_disarming = 0;
next;
}
if ($token->is_start_tag('img')) {
my $attr_href = $token->get_attr();
my $is_webbug = exists $attr_href->{'width'} && $attr_href->{'width'} <= 2 &&
exists $attr_href->{'height'} && $attr_href->{'height'} <= 2 &&
exists $attr_href->{'src'} && $attr_href->{'src'} !~ /^cid:/i;
if (!$is_webbug) {
print $fh $token->as_is();
next;
}
if ($self->{allow_webbugs} eq $MailPlusServer::ContentScanner::NO) {
($action, $report) = ($MailPlusServer::Util::REJECT, $disarm_log_hash{'web_bug'});
last;
}
if ($self->{allow_webbugs} eq $MailPlusServer::ContentScanner::DISARM) {
my $spacer_img_base64 = "R0lGODlhAQABAJEAAP///wAAAAAAAAAAACH5BAkAAAAALAAAAAABAAEAAAgEAAEEBAA7";
print $fh "<img src='data:image/gif;base64, $spacer_img_base64' " .
'width="' . $attr_href->{'width'} . '" '.
'height="' . $attr_href->{'height'} . '" '.
'alt="Web Bug from ' . $attr_href->{'src'} . '" />';
$action = $MailPlusServer::Util::ACCEPT_REBUILD;
$disarm_item_met_hash{'web_bug'} = 1;
next;
}
}
if ($self->{'find_phishing_fraud'} && $token->is_start_tag('base')) {
my $attr_href = $token->get_attr();
if (exists($attr_href->{'href'})) {
$link_base_url = $attr_href->{'href'};
}
print $fh $token->as_is();
next;
}
if ($self->{'find_phishing_fraud'} && $token->is_start_tag('a')) {
my $attr_href = $token->get_attr();
print $fh $token->as_is();
if (!exists($attr_href->{'href'})) {
next;
}
$link_checking = 1;
$link_text = '';
$link_url = $attr_href->{'href'};
next;
}
if ($self->{'find_phishing_fraud'} && $token->is_text() && $link_checking) {
$link_text .= $token->as_is();
next;
}
if ($self->{'find_phishing_fraud'} && $token->is_end_tag('a') && $link_checking) {
my $phishing_link = MailPlusServer::PhishingLinkChecker::FetchPhishingLink($link_text, $link_url, $link_base_url, $link_warning_start);
if ($phishing_link ne '') {
print $fh "$link_warning_start \"$phishing_link\" $link_warning_end $link_text";
$action = $MailPlusServer::Util::ACCEPT_REBUILD;
} else {
print $fh "$link_text";
}
print $fh $token->as_is();
$link_checking = 0;
$link_text = '';
$link_url = '';
next;
}
print $fh $token->as_is();
}
foreach my $key (keys %disarm_item_met_hash)
{
$report .= ', ' if $report ne '';
$report .= $disarm_log_hash{$key};
}
$fh->close();
if ($action == $MailPlusServer::Util::ACCEPT_REBUILD) {
$entity->bodyhandle->path($htmlname_new);
}
return ($action, $report);
}
# FIXME: what to do if too many recursive parts ?
sub scan_html {
my ($self, $entity) = @_;
my ($action, $report) = ($MailPlusServer::Util::ACCEPT, '');
return $MailPlusServer::Util::ACCEPT unless $entity && defined($entity->head);
if ($self->is_html_message($entity)) {
($action, $report) = $self->scan_html_entity($entity);
if ($action == $MailPlusServer::Util::REJECT) {
return ($action, $report);
}
}
my(@parts, $part);
@parts = $entity->parts;
foreach $part (@parts) {
my ($part_action, $part_report) = $self->scan_html($part);
if ($part_action == $MailPlusServer::Util::REJECT) {
return ($part_action, $part_report);
}
if ($part_action > $action) {
$action = $part_action;
$report = $part_report;
} elsif ($part_action > $action && $part_report ne '') {
if ($report eq '') {
$report = $part_report;
} else {
$report .= ', ' . $part_report;
}
}
}
return ($action, $report);
}
# Rewritten from MailScanner HTMLEntityToText()
#
# Convert 1 MIME entity from html to text using HTML::Parser.
sub convert_html_entity_to_text {
my($this, $entity) = @_;
my($htmlname, $textname, $textfh, $htmlparser);
# Replace the MIME Content-Type
$entity->head->mime_attr('Content-type' => 'text/plain');
# Replace the filename with a new one
$htmlname = $entity->bodyhandle->path();
$textname = $htmlname;
$textname =~ s/\..?html?$//i; # Remove .htm .html .shtml
$textname .= '.txt'; # This should always pass the filename checks
$entity->bodyhandle->path($textname);
# Create the new file with the plain text in it
$textfh = new FileHandle;
unless ($textfh->open(">$textname")) {
MailPlusServer::Log::ErrorLog('Could not create plain text file %s', $textname);
return;
}
$htmlparser = HTML::TokeParser::MailScanner->new($htmlname);
# Turn links into text containing the URL
$htmlparser->{textify}{a} = 'href';
$htmlparser->{textify}{img} = 'src';
my $inscript = 0;
my $instyle = 0;
while (my $token = $htmlparser->get_token()) {
next if $token->[0] eq 'C';
# Don't output the contents of style or script sections
if ($token->[1] =~ /style/i) {
$instyle = 1 if $token->[0] eq 'S';
$instyle = 0 if $token->[0] eq 'E';
next if $instyle;
}
if ($token->[1] =~ /script/i) {
$inscript = 1 if $token->[0] eq 'S';
$inscript = 0 if $token->[0] eq 'E';
next if $inscript;
}
my $text = $htmlparser->get_trimmed_text();
print $textfh $text . "\n" if $text;
}
$textfh->close();
}
# Rewritten from mimedefang remove_redundant_html_parts()
sub remove_redundant_html_parts {
my ($self, $entity) = @_;
my @parts = $entity->parts;
my $type = lc($entity->mime_type);
# Don't recurse into multipart/signed or multipart/encrypted
return 0 if ($type eq "multipart/signed" or $type eq "multipart/encrypted");
my (@keep, $part);
my $didsomething = 0;
my $have_text_plain = 0;
if ($type eq "multipart/alternative" && $#parts >= 0) {
# First look for a text/plain part
$have_text_plain = 0;
foreach $part (@parts) {
$type = lc($part->mime_type);
if ($type eq "text/plain") {
$have_text_plain = 1;
last;
}
}
# If we have a text/plain part, delete any text/html part
if ($have_text_plain) {
foreach $part (@parts) {
$type = lc($part->mime_type);
if ($type ne "text/html") {
push(@keep, $part);
} else {
$didsomething = 1;
}
}
if ($didsomething) {
$entity->parts(\@keep);
@parts = @keep;
}
}
}
if ($#parts >= 0) {
foreach $part (@parts) {
$didsomething = 1 if ($self->remove_redundant_html_parts($part));
}
}
return $didsomething;
}
# Rewritten from MailScanner HTMLToText()
sub strip_html_helper {
my ($self, $entity) = @_;
my $counter = 0;
return 0 unless $entity && defined($entity->head);
if ($self->is_html_message($entity)) {
$self->convert_html_entity_to_text($entity);
$counter++;
}
my(@parts, $part);
@parts = $entity->parts;
foreach $part (@parts) {
$counter += $self->strip_html_helper($part);
}
return $counter;
}
sub strip_html {
my ($self, $entity) = @_;
my $counter = 0;
$counter += $self->remove_redundant_html_parts($entity);
$counter += $self->strip_html_helper($entity);
return $counter;
}
# Copied from MailScanner
#
# This is an improvement to the default HTML-Parser routine for getting
# the text out of an HTML message. The only difference to their one is
# that I join the array of items together with spaces rather than "".
#
package HTML::TokeParser::MailScanner;
use HTML::Entities qw(decode_entities);
use vars qw(@ISA);
@ISA = qw(HTML::TokeParser);
sub get_text
{
my $self = shift;
my $endat = shift;
my @text;
while (my $token = $self->get_token) {
my $type = $token->[0];
if ($type eq "T") {
my $text = $token->[1];
decode_entities($text) unless $token->[2];
push(@text, $text);
} elsif ($type =~ /^[SE]$/) {
my $tag = $token->[1];
if ($type eq "S") {
if (exists $self->{textify}{$tag}) {
my $alt = $self->{textify}{$tag};
my $text;
if (ref($alt)) {
$text = &$alt(@$token);
} else {
$text = $token->[2]{$alt || "alt"};
$text = "[\U$tag]" unless defined $text;
}
push(@text, $text);
next;
}
} else {
$tag = "/$tag";
}
if (!defined($endat) || $endat eq $tag) {
$self->unget_token($token);
last;
}
}
}
# JKF join("", @text);
join(" ", @text);
}
1;