1 |
#!/usr/bin/perl -w |
2 |
|
3 |
|
4 |
###################################### |
5 |
# |
6 |
# A simple search engine program |
7 |
# |
8 |
###################################### |
9 |
|
10 |
|
11 |
use strict; |
12 |
use CGI::Carp qw(fatalsToBrowser); |
13 |
use Semantic::API; |
14 |
use CGI; |
15 |
use Data::Dump qw/dump/; |
16 |
use EPrints; |
17 |
use Cwd qw/abs_path/; |
18 |
use KinoSearch::Simple; |
19 |
use lib '/home/dpavlin/stem-hr/'; |
20 |
use StemHR; |
21 |
|
22 |
# When true, intermediate search data is dumped to the error log via warn.
my $debug = 1;

# Directory holding this script, with a trailing slash: the resolved path of
# $0 with its last path component removed.
( my $abs_path = abs_path($0) ) =~ s{/[^/]*$}{/};

#############################################################
# Collection name and page sizes
#############################################################
my $COLLECTION = 'EPrints';
my ( @TERMS, @RESULTS );
my $RESULTS_TO_DISPLAY = 20;
my $TERMS_TO_DISPLAY   = 20;
#############################################################
32 |
|
33 |
|
34 |
###############################
# CGI Variables
###############################
# Request parameters. Missing parameters fall back to a false value
# (0 for the paging offset, '' for the text/flag fields).
my $cgi     = CGI->new;
my $start   = $cgi->param('start')   || 0;
my $query   = $cgi->param('query')   || '';
my $similar = $cgi->param('similar') || '';
my $slogovi = $cgi->param('slogovi') || '';
my $stem    = $cgi->param('stem')    || '';
my $kino    = $cgi->param('kino');

# 'start' is later used as a splice() offset and in arithmetic, so anything
# other than a non-negative integer (negative, fractional, text) is reset
# to the first page instead of being passed through.
$start = 0 unless $start =~ /\A[0-9]+\z/;

# Character set announced in the HTTP header, XML declaration and meta tag.
my $charset = 'iso-8859-2';

# The query handed to the semantic search: the original text, optionally
# extended with the output of EPrints->slogovi and/or StemHR->stem when the
# corresponding checkbox parameter is set.
my $full_query = $query;
$full_query .= " " . EPrints->slogovi($query) if $slogovi;
$full_query .= " " . StemHR->stem($query)     if $stem;
50 |
|
51 |
##############################
# Start the HTML output
##############################
# Emits the HTTP header, the XHTML preamble and the search form.
#
# The query text comes straight from the request, so it is HTML-escaped
# before being echoed back into the value attribute of the text field;
# otherwise a query containing a double quote or markup would break out of
# the attribute.
my $query_html = $cgi->escapeHTML($query);

print "Content-type: text/html; charset=$charset\n\n";
print qq{<?xml version="1.0" encoding="$charset"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="content-type" content="text/html; charset=$charset" />
<title>Search Engine</title>
</head>
<body>
<form method="get" action="">
<p>
Enter bunch of related terms to documents you are trying to find:
<br/><input type="text" name="query" value="$query_html" size="80" />
<br/><input type="submit" />
}, $cgi->checkbox( -name => 'stem' ), $cgi->checkbox( -name => 'slogovi' ), qq{
</p>
</form>\n};
71 |
|
72 |
|
73 |
|
74 |
##########################
# Do the actual search
##########################
# Runs when either a free-text query or a "similar" document id was supplied.
# For a text query it calls semantic_search and additionally prints a
# KinoSearch hit list in a side panel; otherwise it calls find_similar.
# It then prints the related terms and one page of results, and finally
# closes the HTML document.
if ( $query || $similar ) {

    # Request-derived text is HTML-escaped before being written into the page.
    my $query_esc      = $cgi->escapeHTML($query);
    my $full_query_esc = $cgi->escapeHTML($full_query);

    # Paging offset used for splice() and numbering below. Anything that is
    # not a non-negative integer is treated as the first page, because a
    # negative or non-numeric offset would make splice() misbehave.
    my $offset = ( defined $start && $start =~ /\A[0-9]+\z/ ) ? $start : 0;

    # Create collection-based objects
    my $semantic = Semantic::API::Search->new(
        storage    => 'sqlite',
        database   => "$abs_path/eprints.db",
        collection => $COLLECTION,
    );

    # KinoSearch score per hit id; stays empty when no text query is run.
    my $score_ponder = {};

    my ( $results, $terms );
    if ($query) {
        ( $results, $terms ) = $semantic->semantic_search($full_query);

        # NOTE(review): the index path is relative to the process working
        # directory (unlike the sqlite database, which uses $abs_path), and
        # the language is 'ru' -- confirm both match how the index was built.
        my $kino_index = KinoSearch::Simple->new(
            path     => 'kinoindex/',
            language => 'ru',
        );

        my $total_hits = $kino_index->search(
            query      => $query,
            offset     => 0,
            num_wanted => 10,
        );

        # NOTE(review): hit titles are printed as stored in the index;
        # confirm they cannot contain untrusted markup.
        print qq|<div style="width:20%; float:right;">KinoSearch hits $total_hits for $query_esc\n<ol>|;
        while ( my $hit = $kino_index->fetch_hit_hashref ) {
            print qq|<li><a href="#id_$hit->{id}">$hit->{title}</a> $hit->{score}</li>\n|;
            $score_ponder->{ $hit->{id} } = $hit->{score};
        }
        print qq|</ol></div>|;

    }
    else {
        ( $results, $terms ) = $semantic->find_similar($similar);
    }

    warn "results = ", dump($results) if $debug;
    warn "terms = ",   dump($terms)   if $debug;

    # Treat a missing result/term set as empty so the dereferences below
    # do not die under strict.
    $results ||= {};
    $terms   ||= {};

    ##################################
    # TERM BASED CALCULATIONS
    ##################################
    # Terms ordered by descending weight, each weight multiplied by the
    # $score_ponder entry stored under the same key when there is one.
    # NOTE(review): $score_ponder is filled with KinoSearch hit ids above,
    # while the keys sorted here are terms, so this multiplier probably never
    # applies; it may have been intended for the document sort -- confirm.
    my @sorted_terms = sort {
        my ( $sb, $sa ) = ( $terms->{$b}, $terms->{$a} );
        $sb *= $score_ponder->{$b} if $score_ponder->{$b};
        $sa *= $score_ponder->{$a} if $score_ponder->{$a};
        $sb <=> $sa;
    } keys %$terms;

    # Guard the offset: splice() warns when it lies past the end of the array.
    my @top_terms =
        $offset < @sorted_terms
        ? splice( @sorted_terms, $offset, $TERMS_TO_DISPLAY )
        : ();

    warn "top_terms = ", dump(@top_terms) if $debug;

    print "<p>Full query: $full_query_esc</p>\n";
    print "<p>Related Terms: " . ( join ", ", @top_terms ) . "</p>\n";
    print "<hr />\n";

    ##################################
    # DOCUMENT BASED CALCULATIONS
    ##################################
    my $result_count = scalar keys %$results;
    print "<p>Result Count: " . $result_count . "</p>\n";

    # Document ids ordered by descending score, cut down to the current page.
    my @sorted_results = sort { $results->{$b} <=> $results->{$a} } keys %$results;
    my @display_results =
        $offset < @sorted_results
        ? splice( @sorted_results, $offset, $RESULTS_TO_DISPLAY )
        : ();

    warn "display results = ", dump(@display_results) if $debug;

    ##################################
    # Access the storage engine to
    # retrieve the title and text
    ##################################
    # Base URL for the pager. Each value is URL-escaped so that spaces,
    # ';', '&' or quotes in the request parameters cannot break the link.
    my $base_url = '?' . join ';',
        map { $_->[0] . '=' . CGI::escape( $_->[1] ) } (
        [ query   => $query ],
        [ similar => $similar ],
        [ stem    => $stem ],
        [ slogovi => $slogovi ],
        );

    my $i = 1 + $offset;
    print $semantic->paginate( $base_url, $offset, $result_count, $RESULTS_TO_DISPLAY );

    foreach my $id (@display_results) {

        # Presumably selects the record that the lookup/fulltext calls below
        # read from -- verify against the EPrints module.
        EPrints->id($id);
        print "<p><a name=\"id_$id\"/>$i. <b>", EPrints->lookup('title'), "</b>";
        print "| score: <em>", sprintf( "%.2f", $results->{$id} ),
            "</em> | id: $id | <a href=\"?similar=$id\">similar</a> | ";
        my ( $type, $uri ) = EPrints->fulltext;
        print qq|<a href="$uri">$type</a>|;
        print "</p>\n";
        print "<p>";

        # print $semantic->summarize($id);
        print "</p><p>Keywords: ", EPrints->lookup('keywords'), "</p><p>";
        print "<small>", EPrints->lookup('abstract'), "</small>";
        print "</p>\n";
        $i++;
    }

}

print "</body>\n</html>\n";