#!/usr/bin/perl # 010577001 Internet Programming # Home assignment 2: Urlsearch.pl (CGI) # 0073829 Susanna Winter # Urlsearch.pl is an add-on to Juha's urllist.cgi # [URL: http://untamo.lnet.lut.fi/scripts/#urllist ]. # Whereas urllist.cgi merely lists the catched URLs (and currently only from # the last seven days), with urlsearch.pl it is possible to search all the # past and current URLs according to different criteria. # This script is located at . # Copyright 2001-2003 by Susanna Winter. # As of early 2005, urllist.cgi has undergone changes from textfile-based to SQL-based. # Therefore, I'm handing over to Juha the task of transforming urlsearch.pl to match the changes, # and ceasing any further updates on the script. # Version history: # v0.1 - Summer 2001: Initial version as an Internet Programming home assignment # v0.2 - Some bug fixes in the dates on the results page sometime in 2002 # v0.3 - March 2003: New layout, added version history # v0.3.1. - May 2003: Minor CSS changes # v0.4 - May 2003: Getting fed up with having to edit numerous scripts every time I # change something in the site layout I've now changed the SSI files to # be read into the scripts rather than have static html code # -> flexible for me but, overall, not very portable. use strict; use File::stat; use Time::localtime; # variable definitions my $filename = "urlsearch.pl"; my (@urlarray, $url_channel, $url_dayofweek, $url_month, $url_day, $url_time, $url_year, $url_nick, $url_address, @query, $buffer, @wanted_daysofweek, $limit); # hash containing all variable - value pairs submitted from the form my %FORM; my @channel_list = ("#example"); my $path_to_files = "\/path\/to\/files\/"; # addresses from where it's allowed to see addresses sent via msg my @msgallowedhosts = (""); # sub's # check if msg option is allowed sub check_host { foreach my $host (@msgallowedhosts) { if ($ENV{'REMOTE_HOST'} eq $host) { return 1; } } } #end of check_host # check selected days of week sub check_dayofweek { # if all days of the week are unselected, accept any if ( scalar(@wanted_daysofweek) == 0 ) { return 1; } foreach my $wanted_dayofweek (@wanted_daysofweek) { if ($url_dayofweek =~ /$wanted_dayofweek/i) { return 1; } } } #end of check_dayofweek # check wanted (or not wanted) nick sub check_nick { # no nickoptions -> accept any nick if ($FORM{'nickoptions'} eq "") { return 1; } # match exact nick (case insensitive) elsif ($FORM{'nickoptions'} eq "match") { if ($url_nick =~ /<$FORM{'nick'}>/i) { return 1; } } # check if nick contains the requested string elsif ($FORM{'nickoptions'} eq "contain") { if ($url_nick =~ /$FORM{'nick'}/i) { return 1; } } # check that nick does not match the string elsif ($FORM{'nickoptions'} eq "exc") { if ($url_nick !~ /<$FORM{'nick'}>/i) { return 1; } } } #end of check_nick # check requested time sub check_time { # if hour fields (or at least on of them) are empty, accept any if ( ($FORM{'time_from'} eq "") || ($FORM{'time_to'} eq "") ) { return 1; } # if the time of the url is between the requested hours my $hour = substr($url_time, 0, 2); if ( ($hour >= $FORM{'time_from'}) && ($hour <= $FORM{'time_to'}) ) { return 1; } } #end of check_time sub check_day { # if no dayoptions, accept any if ($FORM{'dayoptions'} eq "") { return 1; } # check that the day of the url is equal to the one requested if ($FORM{'dayoptions'} eq "equal") { if ($url_day == $FORM{'day'}) { return 1; } } # check that day is greater than the one requested if ($FORM{'dayoptions'} eq "greater") { if ($url_day > $FORM{'day'}) { return 1; } } # less if ($FORM{'dayoptions'} eq "less") { if ($url_day < $FORM{'day'}) { return 1; } } # check that the day is not the one requested if ($FORM{'dayoptions'} eq "exc") { if ( $url_day != $FORM{'day'}) { return 1; } } } #end of check_day sub check_month { # if no monthoptions, accept any if ($FORM{'monthoptions'} eq "") { return 1; } # convert month names to numbers my $fixed_month = $url_month; $fixed_month =~ s/Jan/01/; $fixed_month =~ s/Feb/02/; $fixed_month =~ s/Mar/03/; $fixed_month =~ s/Apr/04/; $fixed_month =~ s/May/05/; $fixed_month =~ s/Jun/06/; $fixed_month =~ s/Jul/07/; $fixed_month =~ s/Aug/08/; $fixed_month =~ s/Sep/09/; $fixed_month =~ s/Oct/10/; $fixed_month =~ s/Nov/11/; $fixed_month =~ s/Dec/12/; # check that the month of the url is equal to the one requested if ($FORM{'monthoptions'} eq "equal") { if ($fixed_month == $FORM{'month'}) { return 1; } } # greater if ($FORM{'monthoptions'} eq "greater") { if ($fixed_month > $FORM{'month'}) { return 1; } } # less if ($FORM{'monthoptions'} eq "less") { if ($fixed_month < $FORM{'month'}) { return 1; } } # check that the month is not the one requested if ($FORM{'monthoptions'} eq "exc") { if ( $fixed_month != $FORM{'month'}) { return 1; } } } #end of check_month sub check_year { # as for check_month if ($FORM{'yearoptions'} eq "") { return 1; } if ($FORM{'yearoptions'} eq "equal") { if ($url_year == $FORM{'year'}) { return 1; } } if ($FORM{'yearoptions'} eq "greater") { if ($url_year > $FORM{'year'}) { return 1; } } if ($FORM{'yearoptions'} eq "less") { if ($url_year < $FORM{'year'}) { return 1; } } if ($FORM{'yearoptions'} eq "exc") { if ( $url_year != $FORM{'year'}) { return 1; } } } #end of check_year # check the search words sub check_query { # accept empty query only if at least one of # the significant form fields is _not_ empty if ( ($FORM{'query'} eq "") && ( !($FORM{'channel'} eq "all") || !($FORM{'nickoptions'} eq "") || !($FORM{'time_from'} eq "") || !($FORM{'time_to'} eq "") || !($FORM{'dayoptions'} eq "") || !($FORM{'monthoptions'} eq "") || !($FORM{'yearoptions'} eq "") ) ) { return 1; } # if the conditions above are not satisfied, do not accept empty search query if ($FORM{'query'} eq "") { return 0; } my ($first_characters, $matched_words) = 0; my $query_word; foreach $query_word (@query) { # searching for search words beginning with "+" or "-" if ($query_word =~ /^(\+|\-)/ ) { $first_characters++; } } # AND-query, all search words must match if ($first_characters == $#query+1) { foreach $query_word (@query) { my $fixed_word = $query_word; # fix wildcards and special characters $fixed_word =~ s/\\/\\\\/g; $fixed_word =~ s/\?/\\\?/g; $fixed_word =~ s/\*/\\\*/g; $fixed_word =~ s/\./\\\./g; $fixed_word =~ s/\$/\\\$/g; $fixed_word =~ s/\^/\\\^/g; $fixed_word =~ s/\|/\\\|/g; $fixed_word =~ s/\(/\\\(/g; $fixed_word =~ s/\)/\\\)/g; $fixed_word =~ s/\[/\\\[/g; $fixed_word =~ s/\]/\\\]/g; # if word starts with "+", it must be matched if ($fixed_word =~ /^\+/) { # if the word is the "+" character by itself if ($fixed_word eq "\+") { if ($url_address =~ /\+/) { return 1; } } $fixed_word =~ s/^\+//; if ($url_address =~ /$fixed_word/i) { $matched_words++; } } # if word starts with "-", it should not be matched elsif ($fixed_word =~ /^\-/) { # but if the word is the "-" character by itself, it must be matched if ($fixed_word eq "\-") { if ($url_address =~ /\-/) { return 1; } if ($url_address !~ /\-/) { return 0; } } $fixed_word =~ s/^\-//; if ($url_address !~ /$fixed_word/i) { $matched_words++; } } #end if } #end of foreach # if all words match if ($matched_words == $#query+1) { return 1; } } #end if # OR-query, at least one word must match else { foreach $query_word (@query) { my $fixed_word = $query_word; $fixed_word =~ s/\\/\\\\/g; $fixed_word =~ s/\?/\\\?/g; $fixed_word =~ s/\*/\\\*/g; $fixed_word =~ s/\./\\\./g; $fixed_word =~ s/\$/\\\$/g; $fixed_word =~ s/\^/\\\^/g; $fixed_word =~ s/\|/\\\|/g; $fixed_word =~ s/\(/\\\(/g; $fixed_word =~ s/\)/\\\)/g; $fixed_word =~ s/\[/\\\[/g; $fixed_word =~ s/\]/\\\]/g; if ($fixed_word =~ /^\+/) { # if the word is the "+" character by itself if ($fixed_word eq "\+") { if ($url_address =~ /\+/) { return 1; } } $fixed_word =~ s/^\+//; if ($url_address =~ /$fixed_word/i) { return 1; } } elsif ($fixed_word =~ /^\-/) { # but if the word is the "-" character by itself, it must be matched if ($fixed_word eq "\-") { if ($url_address =~ /\-/) { return 1; } if ($url_address !~ /\-/) { return 0; } } $fixed_word =~ s/^\-//; if ($url_address !~ /$fixed_word/i) { return 1; } } elsif ($url_address =~ /$fixed_word/i) { return 1; } } #end of foreach } #end if } #end of check_query # sub's declared # printing the beginning of the html file print "Content-type:text/html\n\n"; if ($ENV{'REQUEST_METHOD'} eq 'GET') { $buffer = $ENV{'QUERY_STRING'}; # if query string was empty, print the search form if ($buffer eq "") { print "\tUrlsearch.pl\n\t"; print "\n"; print "\t

Urlsearch.pl

\n\n"; print "
\n

Basic search

\n\n"; print "\n\n\t\n\n
\n\t\n\t\t"; print "\n\t\n\t\n\t\t"; print "\n\t\n\t
Search for on channel(s)\n\t\t
\n\t\t
\n\n"; print "

Advanced search options

\n\n\n\n\t\n\n\n
\n\t\n\t\t\n\t\n\t\n\t\t"; print "\n\t\n\t
Nick  (case insensitive)
\n\t\t\n\t\t
\n\n\t"; print "\n\t\n\t\t\n\t\n\t"; print "\n\t\t\n\t\t\n\t\t\n\t\t\n\t\t\n\t\t\n\t\t\n\t\n\t"; print "\n\t\t\n\t\t\n\t\t\n\t\t\n\t\t\n\t\t\n\t\t\n\t\n\t
Day of the week  (Leave all unselected for any.)
MonTueWedThuFriSatSun
\n\n\t"; print "\n\t\n\t\t\n\t\n\t"; print "\n\t\t\n\t\n\t
Time  (24 hour clock)
-
\n\n\t"; print "\n\t\n\t\t\n\t\t\n\t\t\n\t\n\t"; print "\n\t\t\n\n\t\t"; print "\n\n\t\t"; print "\n\t\n\t
DayMonthYear
\n\t\t\n\t\t\n\t\t
\n\n\t"; print "\n\t\n\t\t\n\t\n\t
\n\t\t
\n\n
\n\n"; } # if query string was submitted, search for matching urls else { # splitting and decoding query string my @pairs = split(/&/, $buffer); foreach my $pair (@pairs) { (my $name, my $value) = split(/=/, $pair); $value =~ tr/+/ /; $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; $FORM{$name} = $value; } # splitting search words @query = split(/ /, $FORM{'query'}); # put selected days of week in an array if ($FORM{'monday'}) { push(@wanted_daysofweek, "mon"); } if ($FORM{'tuesday'}) { push(@wanted_daysofweek, "tue"); } if ($FORM{'wednesday'}) { push(@wanted_daysofweek, "wed"); } if ($FORM{'thursday'}) { push(@wanted_daysofweek, "thu"); } if ($FORM{'friday'}) { push(@wanted_daysofweek, "fri"); } if ($FORM{'saturday'}) { push(@wanted_daysofweek, "sat"); } if ($FORM{'sunday'}) { push(@wanted_daysofweek, "sun"); } # print title and heading with the search query my $fixed_query = $FORM{'query'}; $fixed_query =~ s/&/&/g; $fixed_query =~ s/\@/@/g; $fixed_query =~ s//\>/g; print "\tUrlsearch.pl - Search results for \"$fixed_query\"\n\t"; print "\n"; print "\t

Urlsearch.pl - Search results for \"$fixed_query\"

\n\n"; # read source files to an array # if channel selection is "all" or "msgs", start by reading the msg files if ( ($FORM{'channel'} eq "all") || ($FORM{'channel'} eq "msgs") ) { # read msg files if it is allowed if (check_host()) { # *.old files hold the urls that are older than one week if (-e $path_to_files . "msg/msg.old") { open(INF, $path_to_files . "msg/msg.old") or print "

Could not open file $path_to_files" . "msg/msg.old.

\n"; while (my $fileline = ) { push(@urlarray, $fileline); } close(INF); } # i == number of days in the week, one file per each day for (my $i=7; $i>0; $i--) { # check that file exists and open it if (-e $path_to_files . "msg/msg." . $i) { open(INF, $path_to_files . "msg/msg." . $i) or print "

Could not open file $path_to_files" . "msg/msg.$i.

\n"; while (my $fileline = ) { push(@urlarray, $fileline); } close(INF); } } #end of for } # msg files read if ($FORM{'channel'} eq "all") { # read all channel files foreach my $channel (@channel_list) { if (-e $path_to_files . $channel . "/" . $channel . ".old") { open(INF, $path_to_files . $channel . "/" . $channel . ".old") or print "

Could not open file $path_to_files" . $channel . "/" . $channel . ".old.

\n"; while (my $fileline = ) { push(@urlarray, $fileline); } close(INF); } for (my $i=7; $i>0; $i--) { if (-e $path_to_files . $channel . "/" . $channel . "." . $i) { open(INF, $path_to_files . $channel . "/" . $channel . "." . $i) or print "

Could not open file path_to_files" . $channel . "/" . $channel . "." . $i . ".

\n"; while (my $fileline = ) { push(@urlarray, $fileline); } close(INF); } } #end of for } #end of foreach - channel files read } #end if } #end if - files read for "all"-option # read files for the specified channel else { if (-e $path_to_files . $FORM{'channel'} . "/" . $FORM{'channel'} . ".old") { open(INF, $path_to_files . $FORM{'channel'} . "/" . $FORM{'channel'} . ".old") or print "

Could not open file $path_to_files" . $FORM{'channel'} . "/" . $FORM{'channel'} . ".old.

\n"; while (my $fileline = ) { push(@urlarray, $fileline); } close(INF); } for (my $i=7; $i>0; $i--) { if (-e $path_to_files . $FORM{'channel'} . "/" . $FORM{'channel'} . "." . $i) { open(INF, $path_to_files . $FORM{'channel'} . "/" . $FORM{'channel'} . "." . $i) or print "

Could not open file path_to_files" . $FORM{'channel'} . "/" . $FORM{'channel'} . "." . $i . ".

\n"; while (my $fileline = ) { push(@urlarray, $fileline); } close(INF); } } #end of for } # channel files read my ($matches, $previous_matches, $total_matches, $check) = 0; my ($date, $dateold, $channelname, $channelnameold); # set the limit of how many matches per channel are shown if ($FORM{'channel'} eq "all") { $limit = 100; } else { $limit = 200; } # print the matching urls in reversed order to get the latest urls first my $urls_left = $#urlarray; do { ($url_channel, $url_dayofweek, $url_month, $url_day, $url_time, $url_year, $url_nick, $url_address) = split(/\s+/, $urlarray[$urls_left]); $url_dayofweek =~ s/\[//g; $url_year =~ s/\]//g; chomp($url_address); $date = $url_dayofweek . " " . $url_month . " " . $url_day . ", " . $url_year; $channelname = lc($url_channel); # if channel has changed, reset the counter, get the amount of matches # from the previous channel and add it to total matches if ( !($channelname eq $channelnameold) ) { if ( ($matches != 0) && ($matches != $previous_matches) ) { $previous_matches = $matches; $total_matches += $previous_matches; } $matches = 0; # print the matches from the previous channel if ( ($previous_matches != $limit) && !($channelnameold eq "") && ($check == 0) ) { print "

Found $previous_matches "; if ($previous_matches == 1) { print "match "; } else { print "matches "; } print "from channel $channelnameold.

\n"; # "check" makes sure that the results are printed only once $check = 1; } } else { $check = 0; } # check that all the criteria are satisfied if ( ($matches < $limit ) && check_query() && check_dayofweek() && check_nick() && check_time() && check_day() && check_month() && check_year() ) { $url_address =~ s/&/&/g; $url_address =~ s/\@/@/g; $url_nick =~ s//\>/g; $url_time = substr($url_time, 0, 5); # print channelname if the channel has changed if ( !($channelname eq $channelnameold) ) { if ( !($channelnameold eq "") ) { print "
\n\n"; } print "

$channelname

\n\n"; } # print date if date or channel has changed if ( !($date eq $dateold) || !($channelname eq $channelnameold) ) { if ( !($dateold eq "") && ($channelname eq $channelnameold) ) { print "
\n\n"; } print "  $date
\n"; } # print time and nick print "\[$url_time\] $url_nick "; # if url is a Usenet news message-id if ($url_address =~ /^<.*>$/) { # substitute characters for html $url_address =~ s//\>/g; print "Message-ID: $url_address"; # subsitute characters for Google $url_address =~ s/\</\%3A/g; $url_address =~ s/\>/\%3E/g; print " (Google)
\n"; } else { print "$url_address
\n"; } $matches++; $dateold = $date; $channelnameold = $channelname; if ($matches == $limit) { print "

Displaying first $limit matches from channel $channelname. Please refine your search.

\n"; } } #end if $urls_left--; } while ($urls_left >= 0); # add matches from the last channel to total matches $total_matches += $matches; # print the amount of matches if ($FORM{'channel'} eq "all") { # no matches at all if ($total_matches == 0) { print "

No matches. Try another search.

\n\n"; } # print the amount of matches from the last channel of the array elsif ( ($matches != 0) && ($matches != $limit) ) { print "

Found $matches "; if ($matches == 1) { print "match "; } else { print "matches "; } print "from channel $channelname.

\n\n"; } } # for just one channel else { if ($matches == 0) { print "

No matches. Try another search.

\n\n"; } elsif ($matches != $limit) { print "

Found $matches "; if ($matches == 1) { print "match "; } else { print "matches "; } print "from channel $channelname.

\n\n"; } }#end if }#endif } # if request method was other than "get" else { print "\tUrlsearch.pl - Invalid method\n\t"; print "\n"; print "

Invalid method.

\n\n"; } #print the rest of the page my $last_modified = (ctime(stat($filename)->mtime)); my ($day_of_week, $month, $day, $time, $year) = split(" ", $last_modified, 5); print "$filename last modified $day_of_week $month $day, $year. © 2001-$year by Susanna Winter.
\n"; print "Questions, comments, corrections? Feel free to e-mail me: susanna.winter\@lut.fi
\n\n\n\n";