package ONO::Lib::Web::BotDetect;
################################################################################
# COPYRIGHT / LICENSE #
################################################################################
#
# This file is part of the ONO Software Project.
#
# Copyright (C) 2000-2025 Jos KIRPS [ www.kirps.com | jos_AT_kirps_DOT_com ]
# and The Joopita Project [ www.joopita.org | contact_AT_joopita_DOT_com ]
#
# This file, as well as other parts of the ONO Software Project or related
# elements, are FREE SOFTWARE available under the ARTISTIC LICENSE 2.0.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# For the full license, see /ono/osr/license/LICENSE.txt, or write to
# jos_AT_kirps_DOT_com or contact_AT_joopita_DOT_com.
#
################################################################################
# END OF COPYRIGHT / LICENSE, HERE COMES THE CODE ... #
################################################################################
use strict;
#: This module helps with the detection of (either good or bad) bots.
###################################################################
# evil URL strings
###################################################################
# Each entry is a precompiled pattern (qr//) that evil_urls() matches,
# unanchored and case-sensitively, against its input.
#
# The entries must NOT be written as double-quoted strings: inside "..."
# the sequence "\." collapses to a bare "." before the regex engine ever
# sees it (so "/../../" would match "/ab/cd/"), and "\?" collapses to "?",
# which turns the preceding "/" into an optional character instead of
# matching a literal question mark (so any "=PHP" would match).
my @evils = (
    qr{/\.\./\.\./},       # literal "/../../" (directory traversal)
    qr{/\?=PHP},           # literal "/?=PHP"
    qr{/phpunit/},         # literal "/phpunit/"
    qr{login\.jsp},        # literal "login.jsp"
    qr{wp-login\.php},     # literal "wp-login.php"
);
###################################################################
# tags
###################################################################
sub detect {
    #: Detect the most common bots, by checking both HTTP_USER_AGENT
    #: and manual input, so that this can be used for both live
    #: tracking and post analysis.
    #:
    #: Arguments: $_[0] is ignored; $_[1] is an optional string that is
    #:   inspected in addition to $ENV{'HTTP_USER_AGENT'}.
    #: Returns: 1 if either string contains one of the bot keywords
    #:   (case-insensitive substring match), 0 otherwise.

    # Either source may be undefined; map undef to '' so the
    # interpolation below stays quiet when warnings are enabled.
    my $agent  = defined $ENV{'HTTP_USER_AGENT'} ? $ENV{'HTTP_USER_AGENT'} : '';
    my $manual = defined $_[1] ? $_[1] : '';

    # The "/" separator keeps a keyword from matching across the
    # boundary between the two strings.
    my $haystack = "$agent/$manual";

    # Plain boolean match: no /g (it only adds pos() bookkeeping to a
    # yes/no test) and no capture group, since nothing reads $1.
    return $haystack =~ /(?:bot|crawl|slurp|spider|mediapartners)/i ? 1 : 0;
}
sub evil_bot {
    #: Detect evil bots that should be blocked, by checking both
    #: HTTP_USER_AGENT and manual input, so that this can be used
    #: for both live tracking and post analysis.
    #:
    #: Arguments: $_[0] is ignored; $_[1] is an optional string that is
    #:   inspected in addition to $ENV{'HTTP_USER_AGENT'}.
    #: Returns: 1 if either string contains one of the listed bot names
    #:   (case-insensitive substring match), 0 otherwise.

    # Either source may be undefined; map undef to '' so the
    # interpolation below stays quiet when warnings are enabled.
    my $agent  = defined $ENV{'HTTP_USER_AGENT'} ? $ENV{'HTTP_USER_AGENT'} : '';
    my $manual = defined $_[1] ? $_[1] : '';
    my $haystack = "$agent/$manual";

    # Cheap pre-filter: every name in the full list below contains at
    # least one of these short fragments, so a miss here is a miss there.
    return 0 unless $haystack =~ /(?:bot|spi|sca|ah|ai|ba|da|im|mj|nb|se|ti|ur|zg)/i;

    # Full list. The dot in "daum.net" is escaped so that it only matches
    # a literal dot rather than any character.
    return $haystack =~ /(?:adsbot|adscan|ahrefs|aiohttp|barkro|blexbot|bytespider|datafor|daum\.net|imagesift|mj12|nbertau|seekport|semrush|seocom|seokick|seoscan|seostar|serpstat|tineye|urllib|zgrab)/i ? 1 : 0;
}
sub crawler_bot {
    #: Detect the most common crawlers, by checking both HTTP_USER_AGENT
    #: and manual input, so that this can be used for both live
    #: tracking and post analysis.
    #:
    #: Arguments: $_[0] is ignored; $_[1] is an optional string that is
    #:   inspected in addition to $ENV{'HTTP_USER_AGENT'}.
    #: Returns: 1 if either string contains one of the crawler keywords
    #:   (case-insensitive substring match), 0 otherwise.

    # Either source may be undefined; map undef to '' so the
    # interpolation below stays quiet when warnings are enabled.
    my $agent  = defined $ENV{'HTTP_USER_AGENT'} ? $ENV{'HTTP_USER_AGENT'} : '';
    my $manual = defined $_[1] ? $_[1] : '';

    # The "/" separator keeps a keyword from matching across the
    # boundary between the two strings.
    my $haystack = "$agent/$manual";

    # Deliberately no /g: a scalar-context m//g directly on
    # $ENV{'HTTP_USER_AGENT'} records pos() on that value, so a repeated
    # call would resume after the previous hit and could report 0 for
    # the very same user agent.
    return $haystack =~ /(?:crawl|robot|backlink|track)/i ? 1 : 0;
}
sub archive_bot {
    #: Detect archive.org bots, by checking both HTTP_USER_AGENT
    #: and manual input, so that this can be used for both live
    #: tracking and post analysis.
    #:
    #: Arguments: $_[0] is ignored; $_[1] is an optional string that is
    #:   inspected in addition to $ENV{'HTTP_USER_AGENT'}.
    #: Returns: 1 if the combined string contains "archive.org",
    #:   "wayback" or "archive-<anything>.org" (case-insensitive),
    #:   0 otherwise.

    # Either source may be undefined; map undef to '' so the
    # interpolation below stays quiet when warnings are enabled.
    my $agent  = defined $ENV{'HTTP_USER_AGENT'} ? $ENV{'HTTP_USER_AGENT'} : '';
    my $manual = defined $_[1] ? $_[1] : '';

    # Build the combined string once and test all three alternatives in a
    # single pass; no /g and no captures, since only a yes/no is needed.
    my $haystack = "$agent/$manual";
    return $haystack =~ /(?:archive\.org|wayback|archive-.*?\.org)/i ? 1 : 0;
}
sub evil_urls {
    #: Count how many of the file-level @evils patterns match the
    #: given input.
    #:
    #: Arguments: $_[0] is ignored; $_[1] is the string to check.
    #: Returns: the number of patterns that matched, or undef (not 0)
    #:   when none of them did.
    my $input = $_[1];

    # grep in scalar context yields the number of matching patterns.
    my $hits = grep { $input =~ $_ } @evils;

    # Callers get undef rather than 0 when nothing matched.
    return $hits ? $hits : undef;
}
###############################################################################
# end of script
###############################################################################
1;
__END__