LinuxSA Mailing list archives

Index: [thread] [date] [subject] [author]
  From: Andrew Hill <list@fornax.net>
  To  : linuxsa <linuxsa@linuxsa.org.au>
  Date: Wed, 23 Feb 2000 10:40:05 +1030

Re: Searching the Mailing List archives

michael wrote:
> This comes up reasonable often (searching the archives) Can't we have a
> session at one of the meetings on ht//dig or some search engine that could
> be applied to the archives? Apart from the odd flame war, there's a wealth
> of information in there...

There is also a way to do this with Perl and Berkely's DB thingy.

I've attached some example code if anyone is interested. It should only
need a few changes to work with the list/your web site....

(Hopefully I cut out the surronding HTML gunk that was there without
killing it to badly :-)

-- 

Andrew Hill
#!/usr/bin/perl

#Copyright (C) 1999 Andrew Hill
#Email: andrew@fornax.net

#This program is free software; you can redistribute it and/or
#modify it under the terms of the GNU General Public License
#as published by the Free Software Foundation; either version 2
#of the License, or (at your option) any later version.
#See http://www.gnu.org/copyleft/gpl.html for details

#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#GNU General Public License for more details.

require 'error.pl';

sub parse_form_data {

    local (*FORM_DATA) = @_;

    # POST or GET?
    if ($ENV{'REQUEST_METHOD'} eq "GET") {
	$query_string = $ENV{'QUERY_STRING'};
    }
    elsif ($ENV{'REQUEST_METHOD'} eq "POST") {
	read (STDIN, $query_string, $ENV{'CONTENT_LENGTH'});
    }
    else {
	&return_error(500, "Server Error", "Unable to decode form, server uses unsupported method");
	exit(1);
    }

    @value_pairs = split(/&/, $query_string);
    foreach $key_value (@value_pairs) {
	($key, $value) = split(/=/, $key_value);
	$key =~ tr/+/ /;
	$key =~ s/%([\dA-Fa-f][\dA-Fa-f])/pack("C", hex($1))/eg;
	$value =~ tr/+/ /;
	$value =~ s/%([\dA-Fa-f][\dA-Fa-f])/pack("C", hex($1))/eg;

	if (defined($FORM_DATA{$key})) {
	    $FORM_DATA{$key} = join("\0", $FORM_DATA{$key}, $value);
	}
	else {
	    $FORM_DATA{$key} = $value;
	}
    }
}


1;

#!/usr/bin/perl

#Copyright (C) 1999 Andrew Hill
#Email: andrew@fornax.net

#This program is free software; you can redistribute it and/or
#modify it under the terms of the GNU General Public License
#as published by the Free Software Foundation; either version 2
#of the License, or (at your option) any later version.
#See http://www.gnu.org/copyleft/gpl.html for details

#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#GNU General Public License for more details.

sub return_error {
    local($status, $keyword, $message) = @_;

    print "Content-type: text/html", "\n\n";

    print <<End_Of_Error;

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
  "http://www.w3.org/TR/REC-html40/loose.dtd">
<HTML>
<HEAD>
  <TITLE>Error</TITLE>
</HEAD>
<BODY BGCOLOR="white">

      <TABLE WIDTH="100%" BORDER="0" CELLPADDING="10" CELLSPACING="0">
        <TR>
          <TD>
            <FONT FACE="arial">

            <BR>
            <BR>
            <P ALIGN="justify">
              <FONT SIZE="+1">
                Error: $status - $keyword
              </FONT>
            </P>
            <HR>
            <P ALIGN="justify">
              $message
            </P>

            </FONT>
          </TD>
        </TR>
      </TABLE>

</BODY>
</HTML>

End_Of_Error
}

1;

#!/usr/bin/perl

#Copyright (C) 1999 Andrew Hill
#Email: andrew@fornax.net

#This program is free software; you can redistribute it and/or
#modify it under the terms of the GNU General Public License
#as published by the Free Software Foundation; either version 2
#of the License, or (at your option) any later version.
#See http://www.gnu.org/copyleft/gpl.html for details

#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#GNU General Public License for more details.

require 5;
use DB_File;     # Access DB databases
use Fcntl;       # Needed for above...
use File::Find;  # Directory searching
undef $/;        # Don't obey line boundaries
$currentKey = 0;

chdir("public_html/cgi-bin")
  || die("$!: Could not chdir to public_html/cgi-bin");

  #######################################################
  # Words that should not be indexed:

  $wordfile = "donotindexwordlist.txt";
  open(F, $wordfile) || die("$0: couldn't open $wordfile: $!\n");
  while(<F>) {
    push(@doNotIndex, split);
  }
  close(F);

#######################################################
#  Single database version:
#  Stores file entries in index.db as <NULL><binary file number>
#  The leading NULL prevents any word entries from colliding.

# Delete old index.db and attach %indexdb to database
unlink("index.db");
tie(%indexdb, 'DB_File', "index.db", O_RDWR | O_CREAT, 0644, $DB_File::DB_BTREE);

# Index all subdirecories - excludes the root "index.html" page.
find(\&IndexFile, "../subdir1/");
find(\&IndexFile, "../subdir2/");

# Close database
untie(%indexdb);

#######################################################

sub IndexFile {
    if(!-f) { return; } #If not a plain file, return
    if(/\.html?$/) {    # Handle HTML files
        # Print out the current file so we can see which 
        # file is being processed
	print "$File::Find::name\n";
	open(HTML_FILE,$_);
	my($text) = <HTML_FILE>;  # Read entire file into private, 
                                  # lexically scoped variable
        my($title) = $text;       # Read entire file into private, 
                                  # lexically scoped variable
        # Strip out the HTML Menu from $text
        $text =~ s/<!-- BEGIN MENU -->(.*?)<!-- END MENU -->//is;
        # Strip out the <HEAD> element from $text
        $text =~ s/<HEAD>.*<\/HEAD>//is;
        # Strip out all HTML tags from $text
	$text =~ s/<[^>]*>//g;

        # Get the "title" of the web page - this may or may not be
        # the actual HTML title - do some Perl to get it :-)
        # $title =~ s/Insert REGEXP here/$1/s;

	# Index all the words under the current key
	my($wordsIndexed) = &IndexWords($text, $currentKey);
	# Map key to this filename
	$indexdb{pack"xn",$currentKey} = "$title\ $File::Find::name";
	$currentKey++;
    }
}

#######################################################

sub IndexWords {
    my($words, $fileKey) = @_;
    # Split text into Array of words
    my(@words) = split(/[^a-zA-Z0-9\xc0-\xff\+\/\_]+/, lc $words);
    @words = grep { s/\// /; $_ } # Remove "/"s
             @words;
    @words = grep { s/^[^a-zA-Z0-9\xc0-\xff]+//; $_ } # Strip leading punct
             grep { length > 1 } # Must be longer than one character
             grep { /[a-zA-Z0-9\xc0-\xff]/ } # Must have an alphanumeric
             grep { !/\b[0-9]+\b/ } # Must not be just a number
             grep { !/\b[0-9]+(am|pm)\b/i } # Must not be a time
             grep { !/\b[0-9]+(mm|cm|m|km)\b/i } # Must not be a distance
             grep { !/\b[0-9]+(st|nd|rd|th)\b/i } # Must not be a date
             @words;

          #######################################################
          # Words that should not be indexed:

          foreach $removeWord (@doNotIndex) {
            @words = grep { !/\b$removeWord\b/ } @words;
          }

          #######################################################

    # Print the words out - currently only for testing, so that the list
    # of indexed words can be seen, and new non-indexed words and synonyms
    # added
    print "@words\n\n";

    # For each word, add key to word database
    foreach (sort @words) {
      my($a) = $indexdb{$_};
      $a .= pack "n",$fileKey;
      $indexdb{$_} = $a;
    }
    # Return count of words indexed
    return scalar(@words);
}

#######################################################

#!/usr/bin/perl

#Copyright (C) 1999 Andrew Hill
#Email: andrew@fornax.net

#This program is free software; you can redistribute it and/or
#modify it under the terms of the GNU General Public License
#as published by the Free Software Foundation; either version 2
#of the License, or (at your option) any later version.
#See http://www.gnu.org/copyleft/gpl.html for details

#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#GNU General Public License for more details.

require 5;
use DB_File;
use Fcntl;
require 'parse.pl';
require 'error.pl';

$website = "Your Website In Here";
$contact = "Your Email In Here";

  #####################################################
  # Words that were not indexed by the 
  # indexer.pl script:

  $wordfile = "public_html/cgi-bin/donotindexwordlist.txt";
  if (open(F, $wordfile)) {
    while(<F>) {
      push(@doNotIndex, split);
    }
    close(F);
  } else {
    &return_error(500,"File Open Error","Unable to open the internal word list database.");
    exit(0);
  }

#######################################################
# Parse the search form data

&parse_form_data(*FORM_DATA);

if ($FORM_DATA{'search'} =~ /[\|\:\;\(\)\*\&\^\%\$\#\@\!\?\>\<\.\,\'\`\~]+/) {
  &return_error(500,"Input Error","The input you gave this search is not permitted.<br><br>(Try to avoid using punctuation characters in forms.)");
  exit(0);
}

#######################################################
# Print out the search result HTML Page - Part 1
# (static content)

print "content-type: text/html\n\n";

print <<End_Search_Results_Part_1;

<HTML>
<HEAD>
  <TITLE>Search Result</TITLE>
</HEAD>
<BODY BGCOLOR="white">

      <TABLE WIDTH="100%" BORDER="0" CELLPADDING="10" CELLSPACING="0">
        <TR>
          <TD>
            <FONT FACE="arial">

End_Search_Results_Part_1

#######################################################
# Print out the search result HTML Page
# (dynamic content)

#######################################################
# Open the binary tree databases
tie(%index, DB_File, 'index.db', O_RDONLY, 0, $DB_BTREE);

#######################################################
# Put the search words into @words
@words = split /\s+/, $FORM_DATA{'search'};

#######################################################
# Find any words that _must_ be matched, as indicated
# by a "+", put them in @mustFind, and remove them 
# from @words
foreach $word (@words) {
  # If the word start with a "+"...
  if ($word =~ /\+(.*)/) {
    # ... then remove the "+"...
    $word =~ s/^\+//;
    # ... and store the word in @mustFind
    push (@mustFind, $word);
  }
}
foreach $word (@mustFind) {
  @words = grep { !/\b$word\b/ } @words;
}

#######################################################
# Find any words that _must not_ be matched, as 
# indicated by a "-", put them in @mustNotFind,
# and remove them from @words
foreach $word (@words) {
  # If the word start with a "-"...
  if ($word =~ /\-(.*)/) {
    # ... then remove the "-"...
    $word =~ s/^\-//;
    # ... and store the word in @mustNotFind
    push (@mustNotFind, $word);
  }
}
foreach $word (@mustNotFind) {
  @words = grep { !/\b$word\b/ } @words;
}

#######################################################
# Lookup the files that match words to be found
# If there are words that _must_ be found...
if (scalar(@mustFind) > 0) {
  # ... look up each word that _must_ be found (in
  # @mustFind) that's in the binary tree database
  foreach $word (@mustFind) {
    # Find the keys that have $word in them
    $keys = $index{lc $word};
    foreach $key (unpack("n*",$keys))
      { $matches{$key}++; }
    # Now also find each normal word (in @words) 
    # that's in the binary tree database, but 
    # ignore any keys that are not already in
    # %matches
    foreach $normalWord (@words) {
      $normalKeys = $index{lc $normalWord};
      foreach $key (unpack("n*",$normalKeys)) {
        if (exists $matches{$key}) {
          $matches{$key}++;
        }
      }
    }
  }
}
# If there are no words that _must_ be found...
else {
  # ... just look up the each normal word to be found
  # (in @words) that's in the binary tree database
  foreach $word (@words) {
    # ... find the keys that have $word in them
    $keys = $index{lc $word};
    foreach $key (unpack("n*",$keys)) {
      $matches{$key}++;
    }
  }
}

#######################################################
# Now remove all matches that have words in them that
# _must not_ be found
foreach $word (@mustNotFind) {
  # ... find the keys that have $word in them
  $keys = $index{lc $word};
  foreach $key (unpack("n*",$keys)) {
    if (exists $matches{$key}) {
      delete $matches{$key};
    }
  }
}

#######################################################
# Rank results in order by the total number of matches
@matches = sort { $matches{$b} <=> $matches{$a} || $a <=> $b } (keys %matches);

#######################################################
# If no matches, display that result...
if (scalar(@matches) == 0) {
  print "  <H2>\n    Sorry!\n  <\/H2>\n";
  print "  <P ALIGN=\"justify\">\n    No documents were found that matched your search for \"";
  print "$FORM_DATA{'search'}";
  print "\".\n";
  # Add each word that was ignored into the @ignored array
  foreach $removedWord (@doNotIndex) {
    push(@ignored, grep { /\b$removedWord\b/ } @words);
  }
  # Display ignored word(s), if any
  if (scalar(@ignored) == 1) {
    print "    (The word \"";
    print "@ignored";
    print "\" was ignored).\n";
  }
  if (scalar(@ignored) > 1) {
    print "    (The words \"";
    print "@ignored";
    print "\" were ignored).\n";
  }
  print "  <P>\n    Perhaps you should try searching the Internet with Google?\n  <\/P>\n";
  print "\n";
  print "<\!-- Search Google -->\n";
  print "<FORM method=\"GET\" action=\"http:\/\/www.google.com\/search\" NAME=\"google\" TARGET=\"_blank\">\n";
  print "  <TABLE bgcolor=\"\#FFFFFF\"><TR><TD>\n";
  print "    <A HREF=\"http:\/\/www.google.com\/\">\n";
  print "    <IMG SRC=\"http:\/\/www.google.com\/logos\/Logo_40wht.gif\" BORDER=\"0\" WIDTH=\"128\" HEIGHT=\"53\" ALT=\"Google\" align=\"middle\"><\/A>\n";
  print "    <INPUT TYPE=\"text\" name=\"q\" size=\"31\" maxlength=\"255\" value=\"\">\n";
  print "    \ \;\ \;\ \;<INPUT type=\"submit\" name=\"sa\" VALUE=\"Search\">\n";
  print "  <\/TD><\/TR><\/TABLE>\n";
  print "<\/FORM>\n";
  print "<SCRIPT LANGUAGE=\"javascript\">\n";
  print "  <\!--\n";
  print "    function setGoogleSearch() \{\n";
  print "      document.google.q.value=\'";
  print "$FORM_DATA{'search'}";
  print "\'\;\n      return true\;\n";
  print "    \}\n";
  print "    setGoogleSearch()\n";
  print "  \/\/ -->\n";
  print "<\/SCRIPT>\n";
  print "<\!-- Search Google -->\n";
}

#######################################################
# ... otherwise display the results of the search
else {
  print "  <P ALIGN=\"justify\"><FONT SIZE=\"\-1\">\n";
  print "    <I>You searched for \"";
  print "$FORM_DATA{'search'}";
  print "\"";
  # Add each word that was ignored into the @ignored array
  foreach $removedWord (@doNotIndex) {
    push(@ignored, grep { /\b$removedWord\b/ } @words);
  }
  # Display ignored word(s), if any
  if (scalar(@ignored) == 1) {
    print " (the word \"";
    print "@ignored";
    print "\" was ignored)";
  }
  if (scalar(@ignored) > 1) {
    print " (the words \"";
    print "@ignored";
    print "\" were ignored)";
  }
  print ".<\/I>\n  <\/FONT><\/P>\n";
  print "  <P ALIGN=\"justify\">\n    These documents might be what you're looking for.\n";
  print "    Documents with a higher confidence are probably better matches.\n  <\/P>\n";
  print "    <TABLE BORDER=\"0\">\n";
  print "      <TR>\n";
  print "        <TD WIDTH=\"30\%\" ALIGN=\"center\"><FONT FACE=\"arial\">\n";
  print "          <B>Confidence<\/B>\n";
  print "        <\/TD>\n";
  print "        <TD WIDTH=\"10\%\" ALIGN=\"center\"><FONT FACE=\"arial\">\n";
  print "          <BR>\n";
  print "        <\/TD>\n";
  print "        <TD WIDTH=\"60\%\" ALIGN=\"left\"><FONT FACE=\"arial\">\n";
  print "          <B>Document<\/B>\n";
  print "        <\/TD>\n";
  print "      <\/TR>\n";
  # Look up keys in the binary tree databse and...
  foreach $key (@matches) {
    $name = $index{pack("xn",$key)};
    # ... split the result into it's components
    if (($number, $title, $url) = ("$matches{$key}: $name" =~ /(\d*):(?:\s*?)(.*)(?:\ \.\.)(.*)/s)) {
      print "      <TR>\n";
      print "        <TD ALIGN=\"center\"><FONT FACE=\"arial\">\n";
      print "          $number\n";
      # Print the number of hits
      print "        <\/TD>\n";
      print "        <TD ALIGN=\"center\"><FONT FACE=\"arial\">\n";
      print "          <BR>\n";
      print "        <\/TD>\n";
      print "        <TD ALIGN=\"left\"><FONT FACE=\"arial\">\n";
      # Open the HREF link
      print "          <A HREF=\"http:\/\/$website\/$url\">";
      # Print the title
      print "$title";
      # Close the link
      print "<\/A>\n";
      print "        <\/TD>\n";
      print "      <\/TR>\n";
    }
  }
  print "    <\/TABLE>\n";
}

#######################################################
# Close the binary tree databases
untie(%index);

#######################################################
# Print out the search result HTML Page - Part 2
# (static content)

print <<End_Search_Results_Part_2;

            <BR>
            <P ALIGN="justify"><FONT SIZE="-1">
              If you can't find something, try searching for a synonym! You can also prefix a word 
              with a + (plus) or a - (minus) if you want to specify a word that must, or must not 
              be, found. If you experience problems, or if you find that a word you <I>really</I> 
              want to search for is being ignored, please email
              <A HREF="mailto:$contact">$contact</A>. Thanks.

            </FONT></P>


            </FONT>
          </TD>
        </TR>
      </TABLE>

</BODY>
</HTML>

End_Search_Results_Part_2

exit (0);

-- 
LinuxSA WWW: http://www.linuxsa.org.au/  IRC: #linuxsa on irc.linux.org.au
To unsubscribe from the LinuxSA list:
  mail linuxsa-request@linuxsa.org.au with "unsubscribe" as the subject


Index: [thread] [date] [subject] [author]
Return to the LinuxSA Mailing List Information Page