5 |
use warnings; |
use warnings; |
6 |
use HTML::Entities; |
use HTML::Entities; |
7 |
|
|
8 |
our $VERSION = '0.04'; |
our $VERSION = '0.07_01'; |
9 |
|
|
10 |
use Exporter 'import'; |
use Exporter; |
11 |
use Carp; |
use Carp; |
12 |
|
|
13 |
our @ISA = qw(Exporter); |
our @ISA = qw(Exporter); |
14 |
|
|
15 |
BEGIN { |
BEGIN { |
16 |
import 'jsFind::Node'; |
Exporter::import 'jsFind::Node'; |
17 |
} |
} |
18 |
|
|
19 |
=head1 NAME |
=head1 NAME |
20 |
|
|
21 |
jsFind - generate index for jsFind using B-Tree |
jsFind - generate index for full text search engine in JavaScript |
22 |
|
|
23 |
=head1 SYNOPSIS |
=head1 SYNOPSIS |
24 |
|
|
53 |
|
|
54 |
=item * |
=item * |
55 |
|
|
56 |
You can programmatically (and incrementally) create an index for jsFind |
you can programmatically (and incrementally) create an index for jsFind |
57 |
|
|
58 |
|
=item * |
59 |
|
|
60 |
|
you can create more than one index and search them using the same C<search.html> |
61 |
|
page |
62 |
|
|
63 |
=back |
=back |
64 |
|
|
65 |
You can also examine examples which come as tests with this module, |
You can also examine examples which come as tests with this module, |
66 |
for example C<t/04words.t>. |
for example C<t/04words.t> or C<t/10homer.t>. |
67 |
|
|
68 |
|
=head2 jsFind |
69 |
|
|
70 |
|
jsFind search engine was written by Shawn Garbett from eLucid Software. |
71 |
|
The search engine itself is a small piece of JavaScript (1.2 with level 2 |
72 |
|
DOM). It is easily customizable to fit into a current set of HTML. This |
73 |
|
JavaScript searches an XML index dataset for the appropriate links, and can |
74 |
|
filter and sort the results. |
75 |
|
|
76 |
|
JavaScript code distributed with this module is based on version 0.0.3 which |
77 |
|
was current when this module development started. Various changes were made |
78 |
|
on JavaScript code to fix bugs, add features and remove warnings. For |
79 |
|
complete list see C<Changes> file which comes with distribution. |
80 |
|
|
81 |
|
This module has been tested using C<html/test.html> with the following browsers: |
82 |
|
|
83 |
|
=over 5 |
84 |
|
|
85 |
|
=item Mozilla FireFox 0.8 to 1.0 |
86 |
|
|
87 |
|
using DOM 2 C<document.implementation.createDocument> |
88 |
|
|
89 |
|
=item Internet Explorer 5.5 and 6.0 |
90 |
|
|
91 |
|
using ActiveX C<Microsoft.XMLDOM> or C<MSXML2.DOMDocument> |
92 |
|
|
93 |
|
=item Konqueror 3.3 |
94 |
|
|
95 |
|
using DOM 2 C<document.implementation.createDocument> |
96 |
|
|
97 |
|
=item Opera 7.54 (without Java) |
98 |
|
|
99 |
|
using experimental iframe implementation which is much slower than other methods. |
100 |
|
|
101 |
|
=back |
102 |
|
|
103 |
|
If searching doesn't work for your combination of operating system and |
104 |
|
browser, please open C<html/test.html> file and wait a while. It will search sample |
105 |
|
file included with distribution and report results. Reports with included |
106 |
|
test debugging are welcomed. |
107 |
|
|
108 |
=head1 jsFind methods |
=head1 jsFind methods |
109 |
|
|
358 |
Create xml index files for jsFind. This should be called after |
Create xml index files for jsFind. This should be called after |
359 |
your B-Tree has been filled with data. |
your B-Tree has been filled with data. |
360 |
|
|
361 |
$root->to_jsfind('/full/path/to/index/dir/'); |
$root->to_jsfind( |
362 |
|
dir => '/full/path/to/index/dir/', |
363 |
|
data_codepage => 'ISO-8859-2', |
364 |
|
index_codepage => 'UTF-8', |
365 |
|
output_filter => sub { |
366 |
|
my $t = shift || return; |
367 |
|
$t =~ s/è/e/; |
368 |
|
} |
369 |
|
); |
370 |
|
|
371 |
|
All options except C<dir> are optional. |
372 |
|
|
373 |
Returns number of nodes in created tree. |
Returns number of nodes in created tree. |
374 |
|
|
375 |
There is also longer version if you want to recode your data charset |
Options: |
376 |
into different one (probably UTF-8): |
|
377 |
|
=over 4 |
378 |
|
|
379 |
|
=item dir |
380 |
|
|
381 |
|
Full path to directory for index (which will be created if needed). |
382 |
|
|
383 |
|
=item data_codepage |
384 |
|
|
385 |
|
If your input data isn't in C<ISO-8859-1> encoding, you will have to specify |
386 |
|
this option. |
387 |
|
|
388 |
$root->to_jsfind('/full/path/to/index/dir/','ISO-8859-2','UTF-8'); |
=item index_codepage |
389 |
|
|
390 |
Destination encoding is UTF-8 by default, so you don't have to specify it. |
If your index encoding is not C<UTF-8> use this option. |
391 |
|
|
392 |
$root->to_jsfind('/full/path/to/index/dir/','WINDOWS-1250'); |
If you are not using supplied JavaScript search code, or your browser is |
393 |
|
terribly broken and thinks that index shouldn't be in UTF-8 encoding, use |
394 |
|
this option to specify encoding for created XML index. |
395 |
|
|
396 |
|
=item output_filter |
397 |
|
|
398 |
|
B<This is just a draft of the documentation for an option which is not implemented!> |
399 |
|
|
400 |
|
Code ref to sub which can do modifications on resulting XML file for node. |
401 |
|
Encoding of this data will be in L<index_codepage> and you have to take care |
402 |
|
not to break XML structure. Calling L<xmllint> on your result index |
403 |
|
(like C<t/90xmllint.t> does in this distribution) is a good idea after using |
404 |
|
this option. |
405 |
|
|
406 |
|
This option is also right place to plug in unaccenting function using |
407 |
|
L<Text::Unaccent>. |
408 |
|
|
409 |
|
=back |
410 |
|
|
411 |
=cut |
=cut |
412 |
|
|
416 |
sub to_jsfind { |
sub to_jsfind { |
417 |
my $self = shift; |
my $self = shift; |
418 |
|
|
419 |
my $path = shift || confess "to_jsfind need path to your index!"; |
my %arg = @_; |
420 |
|
|
421 |
my ($from_cp,$to_cp) = @_; |
confess "to_jsfind need path to your index directory !" unless ($arg{'dir'}); |
422 |
|
|
423 |
$to_cp ||= 'UTF-8'; |
my $data_codepage = $arg{'data_codepage'}; |
424 |
|
my $index_codepage = $arg{'index_codepage'} || 'UTF-8'; |
425 |
|
|
426 |
if ($from_cp && $to_cp) { |
# create ISO-8859-1 iconv for HTML::Entities decode |
427 |
$iconv = Text::Iconv->new($from_cp,$to_cp); |
$iconv_l1 = Text::Iconv->new('ISO-8859-1',$index_codepage); |
|
} |
|
|
$iconv_l1 = Text::Iconv->new('ISO-8859-1',$to_cp); |
|
428 |
|
|
429 |
$path .= "/" if ($path =~ /\/$/); |
# create another iconv for data |
430 |
#carp "creating directory for index '$path'" if (! -w $path); |
if ($data_codepage && $index_codepage) { |
431 |
|
$iconv = Text::Iconv->new($data_codepage,$index_codepage); |
432 |
|
} |
433 |
|
|
434 |
return $self->root->to_jsfind($path,"0"); |
return $self->root->to_jsfind($arg{'dir'},"0"); |
435 |
} |
} |
436 |
|
|
437 |
|
|
860 |
return $d; |
return $d; |
861 |
} |
} |
862 |
|
|
863 |
=head2 base62 |
=head2 base_x |
864 |
|
|
865 |
Convert number to base62 (used for jsFind index filenames). |
Convert number to base x (used for jsFind index filenames). |
866 |
|
|
867 |
my $n = $tree->base62(50); |
my $n = $tree->base_x(50); |
868 |
|
|
869 |
=cut |
=cut |
870 |
|
|
871 |
sub base62 { |
sub base_x { |
872 |
my $self = shift; |
my $self = shift; |
873 |
|
|
874 |
my $value = shift; |
my $value = shift; |
878 |
my @digits = qw( |
my @digits = qw( |
879 |
0 1 2 3 4 5 6 7 8 9 |
0 1 2 3 4 5 6 7 8 9 |
880 |
a b c d e f g h i j k l m n o p q r s t u v w x y z |
a b c d e f g h i j k l m n o p q r s t u v w x y z |
|
A B C D E F G H I J K L M N O P Q R S T U V W X Y Z |
|
881 |
); |
); |
882 |
|
|
883 |
my $base = scalar(@digits); |
my $base = scalar(@digits); |
919 |
confess("path is undefined.") unless ($path); |
confess("path is undefined.") unless ($path); |
920 |
confess("file is undefined. Did you call \$t->root->to_jsfind(..) instead of \$t->to_jsfind(..) ?") unless (defined($file)); |
confess("file is undefined. Did you call \$t->root->to_jsfind(..) instead of \$t->to_jsfind(..) ?") unless (defined($file)); |
921 |
|
|
922 |
$file = $self->base62($file); |
$file = $self->base_x($file); |
923 |
|
|
924 |
my $nr_keys = 0; |
my $nr_keys = 0; |
925 |
|
|