PERL 42
Search.pl Guest on 16th July 2020 08:46:46 AM
  1. #!/usr/local/bin/perl -w
  2. #$rcs = ' $Id: search.pl,v 1.101 2007/03/09 20:50:35 gzervas Exp $ ' ;
  3.  
  4. # Perlfect Search
  5. #
  6. # Copyright (C) 1999-2003 Giorgos Zervas <[email protected]> and
  7. #  Daniel Naber <[email protected]>
  8. #
  9. # This program is free software; you can redistribute it and/or modify
  10. # it under the terms of the GNU General Public License as published by
  11. # the Free Software Foundation; either version 2 of the License, or (at
  12. # your option) any later version.
  13. #
  14. # This program is distributed in the hope that it will be useful, but
  15. # WITHOUT ANY WARRANTY; without even the implied warranty of
  16. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17. # General Public License for more details.
  18. #
  19. # You should have received a copy of the GNU General Public License
  20. # along with this program; if not, write to the Free Software
  21. # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  22. # USA
  23.  
  24. # Comment in the next two lines to log and show how long searches take:
  25. #use Time::HiRes qw ();
  26. #my $start_time = [Time::HiRes::gettimeofday];
  27.  
  28. use CGI;
  29. # only comment this in for development:
  30. #use CGI::Carp qw(fatalsToBrowser);
  31. use Fcntl;
  32. use POSIX qw(strftime);
  33.  
  34. # added program path to @INC because it fails to find ./conf.pl if started from
  35. # other directory
  36. {
  37.   # block is for $1 not mantaining its value
  38.   $0 =~ /(.*)(\\|\/)/;
  39.   push @INC, $1 if $1;
  40. }
  41. require Perlfect::Template;
  42. # require Perlfect::Sidebar;
  43.  
  44. my $db_package = "";
  45. # To use tainting, comment in the next 2 lines and comment out the next 8 lines.
  46. # Note that you also have to add "./" to the filenames in the require commands.
  47. #use DB_File;
  48. #$db_package = 'DB_File';
  49. package AnyDBM_File;
  50. @ISA = qw(DB_File);
  51. # You may try to comment in the next line if you don't have DB_File. Still
  52. # this is not recommended.
  53. #@ISA = qw(DB_File GDBM_File SDBM_File ODBM_File NDBM_File);
  54. foreach my $isa (@ISA) {
  55.   if( eval("require $isa") ) {
  56.     $db_package = $isa;
  57.     last;
  58.   }
  59. }
  60.  
  61. package main;
  62. require 'conf.pl';
  63. require 'tools.pl';
  64. init_config();
  65.  
  66. # See indexer.pl for a description of the data structures:
  67. my %inv_index_db;
  68. my %docs_db;
  69. my %urls_db;
  70. my %sizes_db;
  71. my %desc_db;
  72. my %content_db;
  73. my %titles_db;
  74. my %dates_db;
  75. my %terms_db;
  76.  
  77. tie %inv_index_db, $db_package, $INV_INDEX_DB_FILE, O_RDONLY, 0755 or die "Cannot open $INV_INDEX_DB_FILE: $!";  
  78. tie %docs_db,      $db_package, $DOCS_DB_FILE, O_RDONLY, 0755 or die "Cannot open $DOCS_DB_FILE: $!";  
  79. tie %urls_db,      $db_package, $URLS_DB_FILE, O_RDONLY, 0755 or die "Cannot open $URLS_DB_FILE: $!";  
  80. tie %sizes_db,     $db_package, $SIZES_DB_FILE, O_RDONLY, 0755 or die "Cannot open $SIZES_DB_FILE: $!";  
  81. tie %desc_db,      $db_package, $DESC_DB_FILE, O_RDONLY, 0755 or die "Cannot open $DESC_DB_FILE: $!";  
  82. tie %content_db,   $db_package, $CONTENT_DB_FILE, O_RDONLY, 0755 or die "Cannot open $CONTENT_DB_FILE: $!";
  83. tie %titles_db,    $db_package, $TITLES_DB_FILE, O_RDONLY, 0755    or die "Cannot open $TITLES_DB_FILE: $!";  
  84. tie %dates_db,     $db_package, $DATES_DB_FILE, O_RDONLY, 0755 or die "Cannot open $DATES_DB_FILE: $!";  
  85. tie %terms_db,     $db_package, $TERMS_DB_FILE, O_RDONLY, 0755 or die "Cannot open $TERMS_DB_FILE: $!";  
  86.  
  87. my (@force, @not, @other);
  88. my (@docs, @valid_docs);
  89. my %answer;
  90.  
  91. build_char_string();
  92. my %stopwords_hash = load_stopwords();
  93. my @stopwords = keys(%stopwords_hash);
  94. my @stopwords_ignored;    # stopwords that are in the user's query
  95. my $punct = ',.!?:"\'/%()-';
  96. my $query;
  97. if( !$ENV{'REQUEST_METHOD'} ) {
  98.   # we are called on the command line
  99.   my $pseudo_lang = 'text';
  100.   $pseudo_lang = 'qa' if( $ARGV[1] && $ARGV[1] eq 'qa' );
  101.   $query = new CGI({'q' => $ARGV[0], 'lang' => $pseudo_lang });    # TODO: add more options
  102. } else {
  103.   # we are called as a CGI
  104.   $query = new CGI;
  105. }
  106. main();
  107.  
  108. sub main {
  109.   if( $query->param('showurl') && $HIGHLIGHT_MATCHES ) {
  110.     my $highlight_doc = showdocument();
  111.     print $highlight_doc;
  112.   } else {
  113.     # initialize everything with empty values (because we might run under mod_perl)
  114.     @force = ();
  115.     @not = ();
  116.     @other = ();
  117.     @docs = ();
  118.     @valid_docs = ();
  119.     %answer = ();
  120.     if (create_query()) { #if some valid documents exist
  121.       apply_booleans();
  122.       answer_query();
  123.     }  
  124.     my $html = cast_template();
  125.     if( $ENV{'REQUEST_METHOD'} ) {
  126.       print "Content-Type: text/html\n\n";
  127.     }
  128.     print $html;
  129.     log_query();
  130.   }
  131. }
  132.  
  133. # Highlight the term(s) in a result document:
  134. sub showdocument {
  135.   print "Content-Type: text/html\n\n";
  136.   my $content = "";
  137.   # security: check if this file was indexed. If not, don't load
  138.   # it, as this would allow loading attacks on any file (or cross site scripting
  139.   # if using $HTTP_START_URL)
  140.   my $url = $query->param('showurl');
  141.   my $file = $url;
  142.   $file =~ s/^$BASE_URL//;
  143.   if( $HTTP_START_URL && isHTML($file) && $urls_db{$url} ) {
  144.     # require = check at run time, so people who don't use $HTTP_START_URL
  145.     # don't need this module
  146.     require LWP::UserAgent;
  147.     my $http_user_agent = LWP::UserAgent->new;
  148.     my $dummy;
  149.     ($dummy,$dummy,$dummy,$content) = get_url($http_user_agent, $url, 1);
  150.     if( ! defined($content) ) {
  151.       $content = "Error: could not retrieve '".cleanup($url)."'\n";
  152.     }
  153.   } elsif( ! $HTTP_START_URL && isHTML($file) && $urls_db{$file} ) {
  154.     $file = $DOCUMENT_ROOT.'/'.$file;    # TODO: make_path() function
  155.     $file =~ s#/{2,}#/#g;
  156.     open(INP, $file) or (return "Error: could not open '".cleanup($url)."': $!\n");
  157.     undef $/;
  158.     $content = (<INP>);
  159.     close(INP);
  160.   } else {
  161.     $content = "Error: getting the file '".cleanup($url)."' is not allowed\n";
  162.   }
  163.  
  164.   my $query_str = cleanup($query->param('q'));
  165.   $query_str =~ s/[+-]//g;
  166.   my @terms = split(" ", $query_str);
  167.   my $ct = 0;
  168.   foreach my $term (@terms) {
  169.     # TODO: add some text at top of <body> (Google style)?
  170.     # fixme: umlaut highlighting!
  171.     $term = normalize_special_chars($term);
  172.     if( is_ignored(remove_accents($term)) ) {
  173.       next;
  174.     }
  175.     $content = highlighthtml($term, $content, $HIGHLIGHT_COLORS[$ct]);
  176.     $ct++;
  177.     if( $ct >= scalar(@HIGHLIGHT_COLORS) ) {
  178.       $ct = 0;
  179.     }
  180.   }
  181.   # Remove old <base> tag:
  182.   $content =~ s/<base.*?>//igs;
  183.   # Insert our own <base> tag:
  184.   $url = cleanup($url);
  185.   if( ! $HTTP_START_URL ) {
  186.     $url = $BASE_URL.$url;
  187.   }
  188.   my ($count_repl) = ($content =~ s/<head>/<head>\n<base href="$url">\n/is);
  189.   if( ! $count_repl ) {
  190.     # maybe the HTML is broken and has no <head>:
  191.     $content = "<base href=\"$url\">\n".$content;
  192.   }
  193.   # don't "forget" line breaks in text files:
  194.   if( $url =~ m/\.txt/i ) {  # TODO: find a better solution (using mime-types)
  195.     $content =~  s/[\r\n]/<br>\n/gs;
  196.   }
  197.   return $content;
  198. }
  199.  
  200. # Make sure to replace the term only in the content of the
  201. # file, i.e. in that part that's typically visible to the
  202. # user (requires <style> and <script> content to be commented out!!)
  203. sub highlighthtml {
  204.   my $term = shift;
  205.   my $content = shift;
  206.   my $color = shift;
  207.   my $content_new = "";
  208.   my @comments = split(/(<!--.*?-->)/is, $content);
  209.   my $in_ignore = 0;
  210.   foreach my $c (@comments) {
  211.     my @tags = split(/(<.*?>)/is, $c);
  212.     foreach my $part (@tags) {
  213.       if( $part !~ m/^</ && ! $in_ignore ) {
  214.         $part = normalize_special_chars($part);
  215.         $part =~ s/\b($term)\b/<highlight>$1<\/highlight>/igs;
  216.         # repair possibly damaged entities:
  217.         $part =~ s/(&\w*)<b>$term<\/b>(\w*;)/$1$term$2/igs;
  218.         # now really highlight:
  219.         $part =~ s/<highlight>($term)<\/highlight>/<span style="color:black;background:$color">$1<\/span>/igs;
  220.       }
  221.       if( $part =~ /<title/i ) {
  222.         $in_ignore = 1;
  223.       } else {
  224.         $in_ignore = 0;
  225.       }
  226.       $content_new .= $part;
  227.     }
  228.   }
  229.   return $content_new;
  230. }
  231.  
  232. sub is_ignored {
  233.   my $buffer = shift;
  234.   my $save = shift;
  235.   if( ! $INDEX_NUMBERS && $buffer =~ m/^\d+$/ ) {
  236.     add_ignored($buffer, $save);
  237.     return 1;
  238.   }
  239.   if( grep(/^\Q$buffer\E$/, @stopwords) || length($buffer) < $MINLENGTH ) {
  240.     add_ignored($buffer, $save);
  241.     return 1;
  242.   } else {
  243.     return 0;
  244.   }
  245. }
  246.  
  247. sub add_ignored {
  248.   my $term = shift;
  249.   my $save = shift;
  250.   if( $save && ! grep(/^\Q$term\E$/, @stopwords_ignored) ) {
  251.     # don't show words twice:
  252.     push(@stopwords_ignored, $term);
  253.   }
  254. }
  255.  
  256. sub create_query {
  257.   my $query_str = cleanup($query->param('q'));
  258.   my $mode = cleanup($query->param('mode'));
  259.   my @terms = split(/\s+/, $query_str);
  260.   my $buffer;
  261.   my ($sterm, $nterm);
  262.  
  263.   # Use an extra loop because the loop below will stop
  264.   # on the first term that's not found if there's an AND search:
  265.   foreach my $term (@terms) {
  266.     is_ignored($term, 1);
  267.   }
  268.  
  269.   foreach my $term (@terms) {
  270.     if( is_ignored($term, 0) ) {
  271.       next;
  272.     }
  273.     $buffer = normalize($term);
  274.     foreach my $nterm (split " ",$buffer) {
  275.       $sterm = stem($nterm);
  276.       # For "Match all words" just add a "+" to every term that has no operator:
  277.       if ( $mode eq 'all' && $term !~ m/^(\+|\-)/ ) {
  278.         $term = '+'.$term;
  279.       }
  280.       if ($term =~ /^\+/) {
  281.         if ($terms_db{$sterm}) {
  282.           push @force, $terms_db{$sterm};
  283.         } else {
  284.           return 0;    # this term was not found, we can stop already
  285.         }
  286.       } elsif ($term =~ /^\-/) {
  287.         push @not, $terms_db{$sterm} if $terms_db{$sterm};
  288.       } else {
  289.         push @other, $terms_db{$sterm} if $terms_db{$sterm};
  290.       }
  291.     }
  292.   }
  293.  
  294.   return 1;
  295. }
  296.  
  297. sub apply_booleans {
  298.   #locate the valid documents by applying the booleans
  299.   my ($term_id, $doc_id, $first_doc_id);
  300.   my %v = ();
  301.   my @ary = ();
  302.   my @not_docs = ();
  303.  
  304.   my %not_docs = ();
  305.   map { $not_docs{$_} = 1 } @not_docs;
  306.  
  307.   foreach $term_id (@not) {
  308.     %v = unpack("S*", $inv_index_db{$term_id});
  309.     foreach $doc_id (keys %v) {
  310.       push @not_docs, $doc_id unless exists $not_docs{$doc_id};
  311.     }
  312.   }
  313.  
  314.   if (@force) {
  315.     $first_doc_id = pop @force;
  316.     %v  = unpack("S*", $inv_index_db{$first_doc_id});
  317.     @valid_docs = keys %v;
  318.     foreach $term_id (@force) {
  319.       %v = unpack("S*", $inv_index_db{$term_id});
  320.       @ary = keys %v;
  321.       @valid_docs = intersection(\@valid_docs, \@ary);
  322.     }
  323.     push @force, $first_doc_id;
  324.   } else {
  325.     @valid_docs = keys %docs_db;
  326.   }
  327.  
  328.   @valid_docs = minus(\@valid_docs, \@not_docs);
  329. }
  330.  
  331. sub answer_query {
  332.   my @term_ids = (@force, @other);
  333.  
  334.   my %valid_docs = ();
  335.   map { $valid_docs{$_} = 1 } @valid_docs;
  336.  
  337.   foreach my $term_id (@term_ids) {
  338.     my %v = unpack('S*', $inv_index_db{$term_id});
  339.     foreach my $doc_id (keys %v) {
  340.       # optionally include only certain files:
  341.       my $include_exp = $query->param('include');
  342.       # TODO: escaping $include_exp/$exclude_exp would disable use of RegExp
  343.       next if( $include_exp && $docs_db{$doc_id} !~ m/$include_exp/i );
  344.       # optionally exclude certain files:
  345.       my $exclude_exp = $query->param('exclude');
  346.       next if( $exclude_exp && $docs_db{$doc_id} =~ m/$exclude_exp/i );
  347.       if( exists $valid_docs{$doc_id} ) {
  348.         my $boost = $answer{$doc_id};
  349.         $answer{$doc_id} += $v{$doc_id};
  350.         $answer{$doc_id} *= $MULTIPLE_MATCH_BOOST if( $MULTIPLE_MATCH_BOOST && $boost );
  351.         if( $query->param('penalty') && $query->param('penalty') != 0 && $dates_db{$doc_id} != -1 ) {
  352.           # increase the rank of new documents by giving old ones a penalty:
  353.           my $age_in_days = (time() - $dates_db{$doc_id})/60/60/24;
  354.           my $penalty = $age_in_days * $query->param('penalty');
  355.           $penalty = 100 if( $penalty > 100 );
  356.           $answer{$doc_id} = $answer{$doc_id} - (($answer{$doc_id}/100) * $penalty);
  357.         }
  358.       }
  359.     }
  360.   }
  361. }
  362.  
  363. # Populate the template with search results. All external data has to be
  364. # accessed via cleanup(), to avoid cross site scripting attacks.
  365. sub cast_template {
  366.   my %h = ();
  367.   my $rank = 0;
  368.  
  369.   my $p = cleanup($query->param('p'));
  370.   my $lang = cleanup($query->param('lang'));
  371.   if( ! ($lang && $SEARCH_TEMPLATE{$lang} && $NO_MATCH_TEMPLATE{$lang}) ) {
  372.     $lang = $DEFAULT_LANG;
  373.   }
  374.   my $include = cleanup($query->param('include'));
  375.   my $exclude = cleanup($query->param('exclude'));
  376.   my $penalty = cleanup($query->param('penalty'));
  377.   my $mode = cleanup($query->param('mode'));
  378.   my $sort = cleanup($query->param('sort'));
  379.   my $q = cleanup($query->param('q'));
  380.  
  381.   my $file;
  382.   if( keys(%answer) == 0 ) {
  383.     # No match found
  384.     $file = $NO_MATCH_TEMPLATE{$lang};
  385.   } else {
  386.     $file = $SEARCH_TEMPLATE{$lang};
  387.   }
  388.   my $template = new Perlfect::Template($file);
  389.   # my $sidebar = new Perlfect::Sidebar();
  390.  
  391.   # %h carries values that will show up in the result page at <!--cgi: key-->:
  392.   $h{'script_name'} = "Perlfect Search $VERSION";
  393.   if( -e $UPDATE_FILE ) {
  394.     $h{'index_update'} = POSIX::strftime($INDEX_DATE_FORMAT,
  395.       localtime((stat($UPDATE_FILE))[9]));
  396.   } else {
  397.     # cannot not happen, an error stops the script anyway if there's
  398.     # no index:
  399.     $h{'index_update'} = "No index built yet";
  400.   }
  401.   $h{'query_str'}   = $q;
  402.   $h{'query_str_escaped'} = my_uri_escape($q);    # can be used to link to other search engines
  403.   $h{'docs_total'} = keys %docs_db;
  404.   $h{'baseurl'} = $BASE_URL;
  405.   $h{'lang'} = $lang;
  406.   $h{'include'} = $include;
  407.   $h{'exclude'} = $exclude;
  408.   $h{'penalty'} = $penalty;
  409.   $h{'sort'} = $sort;
  410.   if( $mode eq 'all' ) {
  411.     $h{'match_all'} = " selected=\"selected\"";
  412.     $h{'match_any'} = "";
  413.   } else {
  414.     $h{'match_all'} = "";
  415.     $h{'match_any'} = " selected=\"selected\"";
  416.   }
  417.   # $h{'sidebar'} = $sidebar->get_html($q);
  418.  
  419.   if( scalar(@stopwords_ignored) > 0 ) {
  420.     my $ignored_terms = join(" ", @stopwords_ignored);
  421.     if( $IGNORED_WORDS{$lang} ) {
  422.       $IGNORED_WORDS{$lang} =~ s/<WORDS>/$ignored_terms/gs;
  423.       $h{'ignored_terms'} = $IGNORED_WORDS{$lang};
  424.     }
  425.   } else {
  426.     $h{'ignored_terms'} = "";
  427.   }
  428.  
  429.   my $current_page = $p;
  430.   $current_page ||= 1;
  431.  
  432.   my ($first, $last);
  433.   if( !$ENV{'REQUEST_METHOD'} ) {
  434.     # Called in a shell, don't limit results
  435.     $first = 0;
  436.     $last = values(%answer);
  437.   } else {
  438.     $first = ($current_page - 1) * $RESULTS_PER_PAGE;
  439.     $last  = $first + $RESULTS_PER_PAGE - 1;
  440.   }
  441.  
  442.   my $percent_factor = 0;
  443.   if( $PERCENTAGE_RANKING ) {
  444.     my $max_score = 0;
  445.     foreach my $doc_ranking (values %answer) {
  446.       $max_score = $doc_ranking if( $doc_ranking > $max_score );
  447.     }
  448.     $percent_factor = 100/$max_score if( $max_score );
  449.   }
  450.  
  451.   my @keys;
  452.   if( defined($query->param('sort')) && $query->param('sort') eq 'title' ) {
  453.     @keys = sort {uc($titles_db{$a}) cmp uc($titles_db{$b})} (keys %answer);
  454.   } else {
  455.     @keys = sort {$answer{$b} <=> $answer{$a}} (keys %answer);
  456.   }
  457.   my $real_last = keys(%answer);
  458.   if( $MAX_RESULTS > 0 ) {
  459.     if( $real_last > $MAX_RESULTS ) {
  460.       $real_last = $MAX_RESULTS;
  461.     }
  462.   }
  463.   if ($last >= $real_last) {
  464.     $last = $real_last - 1;
  465.   }
  466.   $h{'first_number'} = $first+1;
  467.   $h{'last_number'} = $last+1;
  468.  
  469.   my $result_count = 0;
  470.   foreach (@keys[$first..$last]) {
  471.     my $score = $answer{$_};
  472.     if( $PERCENTAGE_RANKING ) {
  473.       $score = sprintf("%.f", $score*$percent_factor);
  474.       $score .= '%';
  475.     } else {
  476.       $score = sprintf("%.2f", $score/100);
  477.     }
  478.     my $desc = get_summary($_, $q);
  479.     my $visible_url;
  480.     if( $HTTP_START_URL ) {
  481.       # we've been fetching pages via http - no need to escape (again):
  482.       $url = $docs_db{$_};
  483.       $visible_url = $docs_db{$_};
  484.     } else {
  485.       $url = $BASE_URL.my_uri_escape($docs_db{$_});
  486.       $visible_url = $BASE_URL.$docs_db{$_};
  487.     }
  488.     my $show_url = CGI::escape($docs_db{$_});
  489.     my $date;
  490.     if( $dates_db{$_} != -1 ) {
  491.       $date = POSIX::strftime($DATE_FORMAT, localtime($dates_db{$_}));
  492.     } else {
  493.       $date = '-';
  494.     }
  495.     my $highlight_link = "";
  496.     if( $HIGHLIGHT_MATCHES && isHTML($url) && $HIGHLIGHT_TERMS{$lang} ) {    # TODO: solve this better...
  497.       $highlight_link = "(<a href=\"$SEARCH_URL?q=".my_uri_escape($q).
  498.         "&amp;showurl=$show_url\">$HIGHLIGHT_TERMS{$lang}</a>)";
  499.     }
  500.     my $title = get_title_highlight($titles_db{$_}, $q);
  501.     $template->cast_loop ("results", [{rank => $first+(++$rank),
  502.                        url => $url,
  503.                        highlight_link => $highlight_link,
  504.                        visibleurl => $visible_url,
  505.                        title => $title,
  506.                        date => $date,
  507.                        score => $score,
  508.                        description => $desc,
  509.                        size => sprintf("%.0f", $sizes_db{$_}/1000) || 1,
  510.                       }]);
  511.     $result_count++;
  512.   }
  513.   $template->finalize("results");
  514.   $h{'results_num'} = $real_last;
  515.  
  516.   my $last_page = ceil($real_last, $RESULTS_PER_PAGE);
  517.   $last_page ||= 1;
  518.   $lang = CGI::escape($lang);
  519.   # Note: Keep order of arguments as in search_form.html to get correct visited link recognition:
  520.   # Note that using "&amp;" is correct, "&" isn't.
  521.   my $queries = "&amp;lang=".CGI::escape($lang);
  522.   $queries .= "&amp;include=".CGI::escape($include);
  523.   $queries .= "&amp;exclude=".CGI::escape($exclude);
  524.   $queries .= "&amp;penalty=".CGI::escape($penalty);
  525.   if( defined($query->param('sort')) ) {
  526.     $queries .= "&amp;sort=".CGI::escape($query->param('sort'));
  527.   }
  528.   $queries .= "&amp;mode=".CGI::escape($mode);
  529.   $queries .= "&amp;q=".CGI::escape($q);
  530.   if( $lang eq 'text' ) {
  531.     # avoid warnings for $NEXT_PAGE{$lang}
  532.     $lang = 'en';
  533.   }
  534.   if ($current_page == 1) {
  535.     $h{'previous'} = "";
  536.     if ($last_page > $current_page) {
  537.       $h{'next'} = "<a href=\"$SEARCH_URL?p=2$queries\">$NEXT_PAGE{$lang}</a>";
  538.     } else {
  539.       $h{'next'} = "";
  540.     }
  541.   } elsif ($current_page == $last_page) {
  542.     $h{'previous'} = "<a href=\"$SEARCH_URL?p=".($last_page-1)."$queries\">$PREV_PAGE{$lang}</a>";
  543.     $h{'next'} = "";
  544.   } else {
  545.     $h{'previous'} = "<a href=\"$SEARCH_URL?p=".($current_page-1)."$queries\">$PREV_PAGE{$lang}</a>";
  546.     $h{'next'} = "<a href=\"$SEARCH_URL?p=".($current_page+1)."$queries\">$NEXT_PAGE{$lang}</a>";
  547.   }
  548.  
  549.   my $start_page = $current_page - 9;
  550.   if ($start_page < 1) {
  551.       $start_page = 1;
  552.   }
  553.   my $end_page   = $current_page + 9;
  554.   if ($end_page > $last_page) {
  555.       $end_page = $last_page;
  556.   }
  557.  
  558.   if ($start_page != 1) {
  559.       $h{'navbar'} .= " ... ";
  560.   }
  561.  
  562.   for ($start_page..$end_page) {
  563.       if ($_ != $current_page) {
  564.           $h{'navbar'} .= "<a href=\"$SEARCH_URL?p=$_$queries\">$_</a> ";
  565.       } else {
  566.           $h{'navbar'} .= "<strong>$_</strong> ";
  567.       }
  568.   }
  569.  
  570.   if ($end_page != $last_page) {
  571.       $h{'navbar'} .= " ... ";
  572.   }
  573.  
  574.   $h{'current_page'} = $current_page;
  575.   $h{'total_pages'}  = $last_page;
  576.   $h{'search_url'}   = $SEARCH_URL;
  577.  
  578.   $h{'search_time'} = '';
  579.   # Show time needed to search:
  580.   if( $start_time ) {
  581.     $h{'search_time'} = sprintf(" in %.2f seconds", Time::HiRes::tv_interval($start_time));
  582.   }
  583.  
  584.   $template->cast(\%h);
  585.   return $template->html;
  586. }
  587.  
  588. sub get_title_highlight {
  589.   my $title = $_[0];
  590.   my @terms = split(" ", normalize_special_chars($_[1]));
  591.   foreach my $term (@terms) {
  592.     $title = term_emphasize($title, $term);
  593.   }
  594.   return $title;
  595. }
  596.  
  597. # Log the query in a file, using this format:
  598. # REMOTE_HOST;date;terms;matches;current page;(time to search in seconds);
  599. # For the last value you need to use Time::HiRes (see top of the script)
  600. sub log_query {
  601.   return if( ! $LOG );
  602.  
  603.   my $elapsed_time = sprintf("%.2f", Time::HiRes::tv_interval($start_time)) if( $start_time );
  604.   my @line = ();
  605.   my $addr = $ENV{'REMOTE_HOST'} || $ENV{'REMOTE_ADDR'};
  606.   push(@line, $addr || '-',
  607.               get_iso_date(),
  608.               $query->param('q') || '-',
  609.               scalar(keys %answer),
  610.               $query->param('p') || 1,
  611.               $elapsed_time || '-');
  612.  
  613.   use Fcntl ':flock';        # import LOCK_* constants
  614.   open(LOG, ">>$LOGFILE") or die "Cannot open logfile '$LOGFILE' for writing: $!";
  615.   flock(LOG, LOCK_EX);
  616.   seek(LOG, 0, 2);
  617.   print LOG join(';', @line).";\n";
  618.   flock(LOG, LOCK_UN);
  619.   close(LOG);
  620. }
  621.  
  622. sub normalize {
  623.   my $buffer = $_[0];
  624.  
  625.   $buffer =~ s/-(\s*\n\s*)?//g; # join parts of hyphenated words
  626.  
  627.   if( $SPECIAL_CHARACTERS ) {
  628.     # We don't have special characters in our index, so don't try to search for them:
  629.     $buffer =~ s/[Жж]/ae/gs;
  630.     $buffer =~ s/[Юю]/th/igs;
  631.     $buffer =~ s/Я/ss/gs;
  632.     $buffer =~ tr/ДЕЖЗИТЙЬУКжЭФЛрзЮХМъсиЯЦНытйаОьукбШПэфлвЩРюхмгЪСяцндЫАоеБшпВщГ/AAACEOEUOEaYOEecTOIunesOIuoeaIuoeaOIyoeaUEtoiaUNyoiaUAiaAoiAuA/;
  633.   }
  634.  
  635.   if ($INDEX_NUMBERS) {
  636.     $buffer =~ s/(<[^>]*>)/ /gs;
  637.   } else {
  638.     $buffer =~ s/(\b\d+\b)|(<[^>]*>)/ /gs;
  639.   }
  640.  
  641.   $buffer =~ tr/a-zA-Z0-9_/ /cs;
  642.   $buffer =~ s/^\s+//;
  643.   $buffer =~ s/\s+$//;
  644.   return lc $buffer;
  645. }
  646.  
  647. # Returns the content of the META description tag or the context of the match,
  648. # if $CONTEXT_SIZE is enabled:
  649. sub get_summary {
  650.   my $id = $_[0];
  651.   my @terms = split(" ", normalize_special_chars($_[1]));
  652.   # +/- operators aren't interesting here:
  653.   foreach my $term (@terms) {
  654.     $term =~ s/^(\+|\-)//;
  655.   }
  656.   my $desc;
  657.   if( $CONTEXT_SIZE ) {
  658.     $desc = get_context($content_db{$id}, @terms);
  659.   }    
  660.   if( ! defined($desc) ) {
  661.     $desc = $desc_db{$id};
  662.     foreach my $term (@terms) {
  663.       $desc = term_emphasize($desc, $term);
  664.     }
  665.   }
  666.   return $desc;
  667. }
  668.  
  669. # Get contexts for all the queried terms. Return "" if no context is found.
  670. sub get_context {
  671.   my $buf = shift;
  672.   my @terms = @_;
  673.   my @contexts;
  674.   foreach my $term (@terms) {
  675.     if( ! is_ignored(remove_accents(normalize_special_chars($term))) ) {
  676.       push(@contexts, get_context_for_term($buf, $term));
  677.     }
  678.   }
  679.   my $context = "";
  680.   my $ct = 0;
  681.   foreach my $result (@contexts) {
  682.     $context .= "...".$result."...";
  683.     $context .= "<br>" if( $ct < scalar(@contexts)-1 );
  684.     $context .= "\n";
  685.     $ct++;
  686.   }
  687.   return $context;
  688. }
  689.  
  690. # Get up to $CONTEXT_EXAMPLES strings for a term.
  691. sub get_context_for_term {
  692.   my $desc = shift;
  693.   my $term = shift;    # is normalized already
  694.   my @contexts = ();
  695.   my $ct = -1;
  696.   my $context_ct = 0;
  697.   # find occurences of a single term:
  698.   my @desc_array_normalized = split(" ", $desc);
  699.   my @desc_array = split(" ", $desc);
  700.   my $last_prev = 0;
  701.   foreach my $term_in_desc (@desc_array_normalized) {
  702.     $ct++;
  703.     $term_in_desc = normalize_special_chars($term_in_desc);
  704.     $term_in_desc = remove_accents($term_in_desc);
  705.     my $term_normalized = remove_accents($term);  # hopefully we don't need normalize_... here
  706.     $term_normalized =~ s/[$punct]//g;
  707.     $term_in_desc =~ s/[$punct]//g;
  708.     if( lc($term_normalized) eq lc($term_in_desc) ) {
  709.  
  710.        # get surroundings of matched word:
  711.        my $first = $ct - int(($CONTEXT_DESC_WORDS/2));
  712.        my $last = $ct + int(($CONTEXT_DESC_WORDS/2));
  713.        if( $first <= 0 ) {
  714.          $first = 0;
  715.        }
  716.        if( $last > scalar(@desc_array) ) {
  717.          $last = scalar(@desc_array)-1;
  718.        }
  719.        # don't repeat context (if matched term are near each other):
  720.        next if ( $first < $last_prev );
  721.        $last_prev = $last;
  722.  
  723.        $context_ct++;
  724.        last if( $context_ct > $CONTEXT_EXAMPLES );
  725.  
  726.        my $context = join(" ", @desc_array[$first..$last]);
  727.        if( $ENV{'REQUEST_METHOD'} ) {
  728.          $context = term_emphasize($context, $term);
  729.        }
  730.        push(@contexts, $context);
  731.     }
  732.   }
  733.   return @contexts;
  734. }
  735.  
  736. sub term_emphasize {
  737.   my $str = $_[0];
  738.   my $term = $_[1];
  739.   # "Hдuser" also matches "Hauser" so be fair and emphasize that, too:
  740.   my $term_no_accents = lc(remove_accents($term));
  741.   $term_no_accents =~ s/\W//g;
  742.   # Emphasize the term. Using a RegEx with \b is not enough, as the term may
  743.   # contain "&" which would be taken as a word boundary:
  744.   my @str_array = split(/([^\w&;]|&nbsp;)/, $str);
  745.   foreach $term_in_str (@str_array) {
  746.     my $term_in_str_compare = $term_in_str;
  747.     $term_in_str_compare =~ s/[$punct]//g;
  748.     $term_in_str_compare =~ s/;;$/;/g;    # special char at the end
  749.     my $term_in_str_no_accents = lc(remove_accents($term_in_str_compare));
  750.     if( lc($term_in_str_compare) eq lc($term) || lc($term_in_str_compare) eq $term_no_accents ||
  751.          $term_in_str_no_accents eq $term_no_accents ) {
  752.       $term_in_str = "<strong>$term_in_str</strong>";
  753.       # do not emphasize punctuation (doesn't work for semicolon):
  754.       $term_in_str =~ s/<strong>([$punct]+)/$1<strong>/;
  755.       $term_in_str =~ s/([$punct]+)<\/strong>/<\/strong>$1/;
  756.     }
  757.   }
  758.   return join("", @str_array);
  759. }
  760.  
  761. sub stem {
  762.   my $str = $_[0];
  763.   $str = substr $str, 0, $STEMCHARS if $STEMCHARS;
  764.   return $str;
  765. }
  766.  
  767. sub ceil {
  768.   my $x = $_[0];
  769.   my $y = $_[1];
  770.  
  771.   if ($x % $y == 0) {
  772.     return $x / $y;
  773.   } else {
  774.     return int($x / $y + 1);
  775.   }
  776. }
  777.  
  778. # Returns an array with elements that are in both @{$ra} and @{$rb}.
  779. sub intersection {
  780.   my ($ra, $rb) = @_;
  781.   my @i;
  782.   # use a hash (instead of grep) for much better speed:
  783.   my %check = ();
  784.   foreach my $element (@{$rb}) {
  785.     $check{$element} = 1;
  786.   }
  787.   foreach my $doc_id (@{$ra}) {
  788.     push @i, $doc_id if( $check{$doc_id} );
  789.   }
  790.   return @i;
  791. }
  792.  
  793. # Returns an array with the elements of @{$ra} minus those of @{$rb}.
  794. sub minus {
  795.   my ($ra, $rb) = @_;
  796.   my @i;
  797.   # use a hash (instead of grep) for much better speed:
  798.   my %check = ();
  799.   foreach my $element (@{$rb}) {
  800.     $check{$element} = 1;
  801.   }
  802.   foreach my $doc_id (@{$ra}) {
  803.     push @i, $doc_id if( ! defined($check{$doc_id}) );
  804.   }
  805.   return @i;
  806. }
  807.  
  808. # Return current date and time in ISO 8601 format, i.e. yyyy-mm-dd hh:mm:ss
  809. sub get_iso_date {
  810.   use Time::localtime;
  811.   my $date = (localtime->year() + 1900).'-'.two_digit(localtime->mon() + 1).'-'.two_digit(localtime->mday());
  812.   my $time = two_digit(localtime->hour()).':'.two_digit(localtime->min()).':'.two_digit(localtime->sec());
  813.   return "$date $time";
  814. }
  815.  
  816. # Returns "0x" for "x" if x is only one digit, otherwise it returns x unmodified.
  817. sub two_digit {
  818.   my $value = $_[0];
  819.   $value = '0'.$value if( length($value) == 1 );
  820.   return $value;  
  821. }
  822.  
  823. # Escape some special characters in URLs. This function escapes each part
  824. # of the path (i.e. parts delimited by "/") on its own.
  825. sub my_uri_escape {
  826.     my $str = shift;
  827.     my @parts = split("(/)", $str);
  828.     foreach my $part (@parts) {
  829.       if( $part ne '/' ) {
  830.         $part = CGI::escape($part);
  831.       }
  832.     }
  833.     $str = join("", @parts);
  834.     return $str;
  835. }
  836.  
  837. # Shut up misguided -w warnings about "used only once". Has no functional meaning.
  838. sub CGI_pl_sillyness {
  839.   my $zz;
  840.   $zz = $DOCUMENT_ROOT;
  841.   $zz = $SPECIAL_CHARACTERS;
  842.   $zz = $VERSION;
  843.   $zz = $INDEX_NUMBERS;
  844.   $zz = $DEFAULT_LANG;
  845.   $zz = $BASE_URL;
  846.   $zz = $CONTEXT_EXAMPLES;
  847.   $zz = $CONTEXT_SIZE;
  848.   $zz = $HTTP_START_URL;
  849.   $zz = $DATE_FORMAT;
  850.   $zz = $INDEX_DATE_FORMAT;
  851.   $zz = $HIGHLIGHT_TERMS;
  852.   $zz = $MINLENGTH;
  853. }

Paste is for source code and general debugging text.

Login or Register to edit, delete and keep track of your pastes and more.

Raw Paste

Login or Register to edit or fork this paste. It's free.