# ONO::Lib::Web::BotDetect

package ONO::Lib::Web::BotDetect;
################################################################################
# COPYRIGHT / LICENSE #
################################################################################
#
# This file is part of the ONO Software Project.
#
# Copyright (C) 2000-2025 Jos KIRPS [ www.kirps.com | jos_AT_kirps_DOT_com ]
# and The Joopita Project [ www.joopita.org | contact_AT_joopita_DOT_com ]
#
# This file, as well as other parts of the ONO Software Project or related
# elements, are FREE SOFTWARE available under the ARTISTIC LICENSE 2.0.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# For the full license, see /ono/osr/license/LICENSE.txt, or write to
# jos_AT_kirps_DOT_com or contact_AT_joopita_DOT_com.
#
################################################################################
# END OF COPYRIGHT / LICENSE, HERE COMES THE CODE ... #
################################################################################

use strict;

#: This module helps with the detection of (either good or bad) bots.

###################################################################
# evil URL strings
###################################################################

# Patterns that mark a request as hostile. evil_urls() matches each entry,
# unanchored and case-sensitively, against its input.
#
# The entries are compiled with qr{} on purpose: inside a double-quoted
# string Perl silently drops the backslash of "\." and "\?", which would
# turn the intended literal dots and question mark into regex
# metacharacters (e.g. "/\.\./\.\./" would also match "/ab/cd/").
my @evils = (
qr{/\.\./\.\./},    # directory traversal: "/../../"
qr{/\?=PHP},        # literal "/?=PHP" query string
qr{/phpunit/},      # requests into a phpunit directory
qr{login\.jsp},     # literal "login.jsp"
qr{wp-login\.php},  # literal "wp-login.php"
);

###################################################################
# tags
###################################################################

sub detect {

#: Detect the most common bots, by checking both HTTP_USER_AGENT
#: and manual input, so that this can be used for both live
#: tracking and post analysis.
#:
#: The manual input is read from the second argument ($_[1]); the
#: first argument is ignored (presumably the class or object when
#: called as a method). Returns 1 on a match, 0 otherwise.

# A missing user agent or missing manual input counts as empty.
my $agent = defined $ENV{'HTTP_USER_AGENT'} ? $ENV{'HTTP_USER_AGENT'} : '';
my $input = defined $_[1] ? $_[1] : '';

# Plain boolean match: no /g (which would make the match depend on
# pos() state) and no capture group, since nothing is extracted.
return "$agent/$input" =~ /(?:bot|crawl|slurp|spider|mediapartners)/i ? 1 : 0;

}

sub evil_bot {

#: Detect evil bots that should be blocked, by checking both
#: HTTP_USER_AGENT and manual input, so that this can be used
#: for both live tracking and post analysis.
#:
#: The manual input is read from the second argument ($_[1]); the
#: first argument is ignored (presumably the class or object when
#: called as a method). Returns 1 on a match, 0 otherwise.

# A missing user agent or missing manual input counts as empty.
my $agent = defined $ENV{'HTTP_USER_AGENT'} ? $ENV{'HTTP_USER_AGENT'} : '';
my $input = defined $_[1] ? $_[1] : '';

# One case-insensitive alternation. The former coarse pre-filter
# (bot|spi|sca|ah|...) is gone: every name below already contained one
# of its fragments, so it never changed the result and only had to be
# kept in sync by hand. The dot in "daum.net" is escaped so that it
# matches a literal dot only, and there is no /g because this is a
# plain boolean test.
return "$agent/$input" =~ /(?:
adsbot | adscan | ahrefs | aiohttp | barkro | blexbot | bytespider
| datafor | daum\.net | imagesift | mj12 | nbertau | seekport
| semrush | seocom | seokick | seoscan | seostar | serpstat
| tineye | urllib | zgrab
)/xi ? 1 : 0;

}

sub crawler_bot {

#: Detect the most common crawlers, by checking both HTTP_USER_AGENT
#: and manual input, so that this can be used for both live
#: tracking and post analysis.
#:
#: The manual input is read from the second argument ($_[1]); the
#: first argument is ignored (presumably the class or object when
#: called as a method). Returns 1 on a match, 0 otherwise.

# A missing user agent or missing manual input counts as empty.
my $agent = defined $ENV{'HTTP_USER_AGENT'} ? $ENV{'HTTP_USER_AGENT'} : '';
my $input = defined $_[1] ? $_[1] : '';

# Deliberately no /g: a scalar-context /g match applied directly to
# $ENV{'HTTP_USER_AGENT'} leaves pos() set on that value, so the next
# call would resume after the previous hit and report 0 for the very
# same user agent. The manual input is included as well, as documented
# above.
return "$agent/$input" =~ /(?:crawl|robot|backlink|track)/i ? 1 : 0;

}

sub archive_bot {

#: Detect archive.org bots, by checking both HTTP_USER_AGENT
#: and manual input, so that this can be used for both live
#: tracking and post analysis.
#:
#: The manual input is read from the second argument ($_[1]); the
#: first argument is ignored (presumably the class or object when
#: called as a method). Returns 1 on a match, 0 otherwise.

# A missing user agent or missing manual input counts as empty.
my $agent = defined $ENV{'HTTP_USER_AGENT'} ? $ENV{'HTTP_USER_AGENT'} : '';
my $input = defined $_[1] ? $_[1] : '';

# Three spellings are accepted, all case-insensitively: "archive.org",
# "wayback", and "archive-<anything>.org" (e.g. "archive-it.org").
# Plain boolean match, so no /g and no capture groups.
return "$agent/$input" =~ /(?: archive\.org | wayback | archive-.*?\.org )/xi ? 1 : 0;

}

sub evil_urls {

#: Check if the current input matches a list of bad strings.
#:
#: The input is read from the second argument ($_[1]); the first
#: argument is ignored (presumably the class or object when called
#: as a method). Returns the number of entries of @evils that match
#: the input, which is 0 (false) when none does.

# A missing input counts as empty.
my $input = defined $_[1] ? $_[1] : '';

# grep in scalar context yields the number of matching patterns, so
# the "no match" case is a real 0 rather than undef, in line with the
# 0/1 results of the other checks in this module.
return scalar grep { $input =~ $_ } @evils;

}

###############################################################################
# end of script
###############################################################################

1;

__END__