LinuxSA Mailing list archives
Index:
[thread]
[date]
[subject]
[author]
From: Andrew Hill <list@fornax.net>
To : linuxsa <linuxsa@linuxsa.org.au>
Date: Wed, 23 Feb 2000 10:40:05 +1030
Re: Searching the Mailing List archives
michael wrote:
> This comes up reasonable often (searching the archives) Can't we have a
> session at one of the meetings on ht//dig or some search engine that could
> be applied to the archives? Apart from the odd flame war, there's a wealth
> of information in there...
There is also a way to do this with Perl and Berkely's DB thingy.
I've attached some example code if anyone is interested. It should only
need a few changes to work with the list/your web site....
(Hopefully I cut out the surronding HTML gunk that was there without
killing it to badly :-)
--
Andrew Hill
#!/usr/bin/perl
#Copyright (C) 1999 Andrew Hill
#Email: andrew@fornax.net
#This program is free software; you can redistribute it and/or
#modify it under the terms of the GNU General Public License
#as published by the Free Software Foundation; either version 2
#of the License, or (at your option) any later version.
#See http://www.gnu.org/copyleft/gpl.html for details
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
require 'error.pl';
sub parse_form_data {
local (*FORM_DATA) = @_;
# POST or GET?
if ($ENV{'REQUEST_METHOD'} eq "GET") {
$query_string = $ENV{'QUERY_STRING'};
}
elsif ($ENV{'REQUEST_METHOD'} eq "POST") {
read (STDIN, $query_string, $ENV{'CONTENT_LENGTH'});
}
else {
&return_error(500, "Server Error", "Unable to decode form, server uses unsupported method");
exit(1);
}
@value_pairs = split(/&/, $query_string);
foreach $key_value (@value_pairs) {
($key, $value) = split(/=/, $key_value);
$key =~ tr/+/ /;
$key =~ s/%([\dA-Fa-f][\dA-Fa-f])/pack("C", hex($1))/eg;
$value =~ tr/+/ /;
$value =~ s/%([\dA-Fa-f][\dA-Fa-f])/pack("C", hex($1))/eg;
if (defined($FORM_DATA{$key})) {
$FORM_DATA{$key} = join("\0", $FORM_DATA{$key}, $value);
}
else {
$FORM_DATA{$key} = $value;
}
}
}
1;
#!/usr/bin/perl
#Copyright (C) 1999 Andrew Hill
#Email: andrew@fornax.net
#This program is free software; you can redistribute it and/or
#modify it under the terms of the GNU General Public License
#as published by the Free Software Foundation; either version 2
#of the License, or (at your option) any later version.
#See http://www.gnu.org/copyleft/gpl.html for details
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
sub return_error {
local($status, $keyword, $message) = @_;
print "Content-type: text/html", "\n\n";
print <<End_Of_Error;
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/loose.dtd">
<HTML>
<HEAD>
<TITLE>Error</TITLE>
</HEAD>
<BODY BGCOLOR="white">
<TABLE WIDTH="100%" BORDER="0" CELLPADDING="10" CELLSPACING="0">
<TR>
<TD>
<FONT FACE="arial">
<BR>
<BR>
<P ALIGN="justify">
<FONT SIZE="+1">
Error: $status - $keyword
</FONT>
</P>
<HR>
<P ALIGN="justify">
$message
</P>
</FONT>
</TD>
</TR>
</TABLE>
</BODY>
</HTML>
End_Of_Error
}
1;
#!/usr/bin/perl
#Copyright (C) 1999 Andrew Hill
#Email: andrew@fornax.net
#This program is free software; you can redistribute it and/or
#modify it under the terms of the GNU General Public License
#as published by the Free Software Foundation; either version 2
#of the License, or (at your option) any later version.
#See http://www.gnu.org/copyleft/gpl.html for details
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
require 5;
use DB_File; # Access DB databases
use Fcntl; # Needed for above...
use File::Find; # Directory searching
undef $/; # Don't obey line boundaries
$currentKey = 0;
chdir("public_html/cgi-bin")
|| die("$!: Could not chdir to public_html/cgi-bin");
#######################################################
# Words that should not be indexed:
$wordfile = "donotindexwordlist.txt";
open(F, $wordfile) || die("$0: couldn't open $wordfile: $!\n");
while(<F>) {
push(@doNotIndex, split);
}
close(F);
#######################################################
# Single database version:
# Stores file entries in index.db as <NULL><binary file number>
# The leading NULL prevents any word entries from colliding.
# Delete old index.db and attach %indexdb to database
unlink("index.db");
tie(%indexdb, 'DB_File', "index.db", O_RDWR | O_CREAT, 0644, $DB_File::DB_BTREE);
# Index all subdirecories - excludes the root "index.html" page.
find(\&IndexFile, "../subdir1/");
find(\&IndexFile, "../subdir2/");
# Close database
untie(%indexdb);
#######################################################
sub IndexFile {
if(!-f) { return; } #If not a plain file, return
if(/\.html?$/) { # Handle HTML files
# Print out the current file so we can see which
# file is being processed
print "$File::Find::name\n";
open(HTML_FILE,$_);
my($text) = <HTML_FILE>; # Read entire file into private,
# lexically scoped variable
my($title) = $text; # Read entire file into private,
# lexically scoped variable
# Strip out the HTML Menu from $text
$text =~ s/<!-- BEGIN MENU -->(.*?)<!-- END MENU -->//is;
# Strip out the <HEAD> element from $text
$text =~ s/<HEAD>.*<\/HEAD>//is;
# Strip out all HTML tags from $text
$text =~ s/<[^>]*>//g;
# Get the "title" of the web page - this may or may not be
# the actual HTML title - do some Perl to get it :-)
# $title =~ s/Insert REGEXP here/$1/s;
# Index all the words under the current key
my($wordsIndexed) = &IndexWords($text, $currentKey);
# Map key to this filename
$indexdb{pack"xn",$currentKey} = "$title\ $File::Find::name";
$currentKey++;
}
}
#######################################################
sub IndexWords {
my($words, $fileKey) = @_;
# Split text into Array of words
my(@words) = split(/[^a-zA-Z0-9\xc0-\xff\+\/\_]+/, lc $words);
@words = grep { s/\// /; $_ } # Remove "/"s
@words;
@words = grep { s/^[^a-zA-Z0-9\xc0-\xff]+//; $_ } # Strip leading punct
grep { length > 1 } # Must be longer than one character
grep { /[a-zA-Z0-9\xc0-\xff]/ } # Must have an alphanumeric
grep { !/\b[0-9]+\b/ } # Must not be just a number
grep { !/\b[0-9]+(am|pm)\b/i } # Must not be a time
grep { !/\b[0-9]+(mm|cm|m|km)\b/i } # Must not be a distance
grep { !/\b[0-9]+(st|nd|rd|th)\b/i } # Must not be a date
@words;
#######################################################
# Words that should not be indexed:
foreach $removeWord (@doNotIndex) {
@words = grep { !/\b$removeWord\b/ } @words;
}
#######################################################
# Print the words out - currently only for testing, so that the list
# of indexed words can be seen, and new non-indexed words and synonyms
# added
print "@words\n\n";
# For each word, add key to word database
foreach (sort @words) {
my($a) = $indexdb{$_};
$a .= pack "n",$fileKey;
$indexdb{$_} = $a;
}
# Return count of words indexed
return scalar(@words);
}
#######################################################
#!/usr/bin/perl
#Copyright (C) 1999 Andrew Hill
#Email: andrew@fornax.net
#This program is free software; you can redistribute it and/or
#modify it under the terms of the GNU General Public License
#as published by the Free Software Foundation; either version 2
#of the License, or (at your option) any later version.
#See http://www.gnu.org/copyleft/gpl.html for details
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
require 5;
use DB_File;
use Fcntl;
require 'parse.pl';
require 'error.pl';
$website = "Your Website In Here";
$contact = "Your Email In Here";
#####################################################
# Words that were not indexed by the
# indexer.pl script:
$wordfile = "public_html/cgi-bin/donotindexwordlist.txt";
if (open(F, $wordfile)) {
while(<F>) {
push(@doNotIndex, split);
}
close(F);
} else {
&return_error(500,"File Open Error","Unable to open the internal word list database.");
exit(0);
}
#######################################################
# Parse the search form data
&parse_form_data(*FORM_DATA);
if ($FORM_DATA{'search'} =~ /[\|\:\;\(\)\*\&\^\%\$\#\@\!\?\>\<\.\,\'\`\~]+/) {
&return_error(500,"Input Error","The input you gave this search is not permitted.<br><br>(Try to avoid using punctuation characters in forms.)");
exit(0);
}
#######################################################
# Print out the search result HTML Page - Part 1
# (static content)
print "content-type: text/html\n\n";
print <<End_Search_Results_Part_1;
<HTML>
<HEAD>
<TITLE>Search Result</TITLE>
</HEAD>
<BODY BGCOLOR="white">
<TABLE WIDTH="100%" BORDER="0" CELLPADDING="10" CELLSPACING="0">
<TR>
<TD>
<FONT FACE="arial">
End_Search_Results_Part_1
#######################################################
# Print out the search result HTML Page
# (dynamic content)
#######################################################
# Open the binary tree databases
tie(%index, DB_File, 'index.db', O_RDONLY, 0, $DB_BTREE);
#######################################################
# Put the search words into @words
@words = split /\s+/, $FORM_DATA{'search'};
#######################################################
# Find any words that _must_ be matched, as indicated
# by a "+", put them in @mustFind, and remove them
# from @words
foreach $word (@words) {
# If the word start with a "+"...
if ($word =~ /\+(.*)/) {
# ... then remove the "+"...
$word =~ s/^\+//;
# ... and store the word in @mustFind
push (@mustFind, $word);
}
}
foreach $word (@mustFind) {
@words = grep { !/\b$word\b/ } @words;
}
#######################################################
# Find any words that _must not_ be matched, as
# indicated by a "-", put them in @mustNotFind,
# and remove them from @words
foreach $word (@words) {
# If the word start with a "-"...
if ($word =~ /\-(.*)/) {
# ... then remove the "-"...
$word =~ s/^\-//;
# ... and store the word in @mustNotFind
push (@mustNotFind, $word);
}
}
foreach $word (@mustNotFind) {
@words = grep { !/\b$word\b/ } @words;
}
#######################################################
# Lookup the files that match words to be found
# If there are words that _must_ be found...
if (scalar(@mustFind) > 0) {
# ... look up each word that _must_ be found (in
# @mustFind) that's in the binary tree database
foreach $word (@mustFind) {
# Find the keys that have $word in them
$keys = $index{lc $word};
foreach $key (unpack("n*",$keys))
{ $matches{$key}++; }
# Now also find each normal word (in @words)
# that's in the binary tree database, but
# ignore any keys that are not already in
# %matches
foreach $normalWord (@words) {
$normalKeys = $index{lc $normalWord};
foreach $key (unpack("n*",$normalKeys)) {
if (exists $matches{$key}) {
$matches{$key}++;
}
}
}
}
}
# If there are no words that _must_ be found...
else {
# ... just look up the each normal word to be found
# (in @words) that's in the binary tree database
foreach $word (@words) {
# ... find the keys that have $word in them
$keys = $index{lc $word};
foreach $key (unpack("n*",$keys)) {
$matches{$key}++;
}
}
}
#######################################################
# Now remove all matches that have words in them that
# _must not_ be found
foreach $word (@mustNotFind) {
# ... find the keys that have $word in them
$keys = $index{lc $word};
foreach $key (unpack("n*",$keys)) {
if (exists $matches{$key}) {
delete $matches{$key};
}
}
}
#######################################################
# Rank results in order by the total number of matches
@matches = sort { $matches{$b} <=> $matches{$a} || $a <=> $b } (keys %matches);
#######################################################
# If no matches, display that result...
if (scalar(@matches) == 0) {
print " <H2>\n Sorry!\n <\/H2>\n";
print " <P ALIGN=\"justify\">\n No documents were found that matched your search for \"";
print "$FORM_DATA{'search'}";
print "\".\n";
# Add each word that was ignored into the @ignored array
foreach $removedWord (@doNotIndex) {
push(@ignored, grep { /\b$removedWord\b/ } @words);
}
# Display ignored word(s), if any
if (scalar(@ignored) == 1) {
print " (The word \"";
print "@ignored";
print "\" was ignored).\n";
}
if (scalar(@ignored) > 1) {
print " (The words \"";
print "@ignored";
print "\" were ignored).\n";
}
print " <P>\n Perhaps you should try searching the Internet with Google?\n <\/P>\n";
print "\n";
print "<\!-- Search Google -->\n";
print "<FORM method=\"GET\" action=\"http:\/\/www.google.com\/search\" NAME=\"google\" TARGET=\"_blank\">\n";
print " <TABLE bgcolor=\"\#FFFFFF\"><TR><TD>\n";
print " <A HREF=\"http:\/\/www.google.com\/\">\n";
print " <IMG SRC=\"http:\/\/www.google.com\/logos\/Logo_40wht.gif\" BORDER=\"0\" WIDTH=\"128\" HEIGHT=\"53\" ALT=\"Google\" align=\"middle\"><\/A>\n";
print " <INPUT TYPE=\"text\" name=\"q\" size=\"31\" maxlength=\"255\" value=\"\">\n";
print " \ \;\ \;\ \;<INPUT type=\"submit\" name=\"sa\" VALUE=\"Search\">\n";
print " <\/TD><\/TR><\/TABLE>\n";
print "<\/FORM>\n";
print "<SCRIPT LANGUAGE=\"javascript\">\n";
print " <\!--\n";
print " function setGoogleSearch() \{\n";
print " document.google.q.value=\'";
print "$FORM_DATA{'search'}";
print "\'\;\n return true\;\n";
print " \}\n";
print " setGoogleSearch()\n";
print " \/\/ -->\n";
print "<\/SCRIPT>\n";
print "<\!-- Search Google -->\n";
}
#######################################################
# ... otherwise display the results of the search
else {
print " <P ALIGN=\"justify\"><FONT SIZE=\"\-1\">\n";
print " <I>You searched for \"";
print "$FORM_DATA{'search'}";
print "\"";
# Add each word that was ignored into the @ignored array
foreach $removedWord (@doNotIndex) {
push(@ignored, grep { /\b$removedWord\b/ } @words);
}
# Display ignored word(s), if any
if (scalar(@ignored) == 1) {
print " (the word \"";
print "@ignored";
print "\" was ignored)";
}
if (scalar(@ignored) > 1) {
print " (the words \"";
print "@ignored";
print "\" were ignored)";
}
print ".<\/I>\n <\/FONT><\/P>\n";
print " <P ALIGN=\"justify\">\n These documents might be what you're looking for.\n";
print " Documents with a higher confidence are probably better matches.\n <\/P>\n";
print " <TABLE BORDER=\"0\">\n";
print " <TR>\n";
print " <TD WIDTH=\"30\%\" ALIGN=\"center\"><FONT FACE=\"arial\">\n";
print " <B>Confidence<\/B>\n";
print " <\/TD>\n";
print " <TD WIDTH=\"10\%\" ALIGN=\"center\"><FONT FACE=\"arial\">\n";
print " <BR>\n";
print " <\/TD>\n";
print " <TD WIDTH=\"60\%\" ALIGN=\"left\"><FONT FACE=\"arial\">\n";
print " <B>Document<\/B>\n";
print " <\/TD>\n";
print " <\/TR>\n";
# Look up keys in the binary tree databse and...
foreach $key (@matches) {
$name = $index{pack("xn",$key)};
# ... split the result into it's components
if (($number, $title, $url) = ("$matches{$key}: $name" =~ /(\d*):(?:\s*?)(.*)(?:\ \.\.)(.*)/s)) {
print " <TR>\n";
print " <TD ALIGN=\"center\"><FONT FACE=\"arial\">\n";
print " $number\n";
# Print the number of hits
print " <\/TD>\n";
print " <TD ALIGN=\"center\"><FONT FACE=\"arial\">\n";
print " <BR>\n";
print " <\/TD>\n";
print " <TD ALIGN=\"left\"><FONT FACE=\"arial\">\n";
# Open the HREF link
print " <A HREF=\"http:\/\/$website\/$url\">";
# Print the title
print "$title";
# Close the link
print "<\/A>\n";
print " <\/TD>\n";
print " <\/TR>\n";
}
}
print " <\/TABLE>\n";
}
#######################################################
# Close the binary tree databases
untie(%index);
#######################################################
# Print out the search result HTML Page - Part 2
# (static content)
print <<End_Search_Results_Part_2;
<BR>
<P ALIGN="justify"><FONT SIZE="-1">
If you can't find something, try searching for a synonym! You can also prefix a word
with a + (plus) or a - (minus) if you want to specify a word that must, or must not
be, found. If you experience problems, or if you find that a word you <I>really</I>
want to search for is being ignored, please email
<A HREF="mailto:$contact">$contact</A>. Thanks.
</FONT></P>
</FONT>
</TD>
</TR>
</TABLE>
</BODY>
</HTML>
End_Search_Results_Part_2
exit (0);
--
LinuxSA WWW: http://www.linuxsa.org.au/ IRC: #linuxsa on irc.linux.org.au
To unsubscribe from the LinuxSA list:
mail linuxsa-request@linuxsa.org.au with "unsubscribe" as the subject
Index:
[thread]
[date]
[subject]
[author]
Return to the LinuxSA Mailing List Information Page