--- trunk/scripts/est-spider 2006/01/26 01:53:29 90 +++ trunk/scripts/est-spider 2006/04/17 10:38:17 121 @@ -7,6 +7,7 @@ use Text::Iconv; #use File::MMagic; use File::MMagic::XS qw/:compat/; +use Time::HiRes qw/time/; my $collection; # name which will be inserted my $path_add; # add additional info in path @@ -16,6 +17,7 @@ #$verbose = 1; my $debug = 0; my $force = 0; +my $all = 0; my $result = GetOptions( "collection=s" => \$collection, @@ -24,6 +26,7 @@ "debug!" => \$debug, "exclude=s" => \$exclude, "force!" => \$force, + "all!" => \$all, ); my ($node_url,$dir) = @ARGV; @@ -38,6 +41,7 @@ --verbose --force --debug + --all save placeholders for all files _END_OF_USAGE_ if (! -e $dir) { @@ -49,6 +53,9 @@ #$basedir =~ s,/[^/]+$,/,; #require "$basedir/filter.pm"; +my $docs = 0; +my $start_t = time(); + my $filter; foreach my $f (qw/pdftotext pstotext/) { my $w = which($f); @@ -66,9 +73,12 @@ select(STDERR); $|=1; select(STDOUT); $|=1; -my $db = new Search::Estraier::Node; -$db->set_url($node_url); -$db->set_auth('admin', 'admin'); +my $db = new Search::Estraier::Node( + url => $node_url, + user => 'admin', + passwd => 'admin', + croak_on_error => 1, +); find({ wanted => \&file, follow => 1, @@ -76,6 +86,8 @@ no_chdir => 1, }, $dir); +my $dur = (time() - $start_t) || 1; +printf STDERR "%d documents in %.2fs [%.2f docs/s]\n", $docs, $dur, ($docs / $dur); exit; @@ -124,11 +136,13 @@ # boost title $doc->add_hidden_text($title); -# print $doc->dump_draft if ($verbose); + print $doc->dump_draft if ($debug); # register the document object to the database $db->put_doc($doc); + $docs++; + } sub filter_to_pages { @@ -180,10 +194,12 @@ my $path = $_; my $contents; - return if (! $force && -l $path || $path =~ m#/.svn# || $path =~ m/(~|.bak)$/); + return if (! $force && -l $path || $path =~ m#/.svn# || $path =~ m/(~|.bak|.gif)$/); + return if (! $all && -d $path); my $mtime = (stat($path))[9] || -1; - my $mtime_db = $db->get_doc_attr_by_uri("file:///$path", '@mtime') || -2; + my $mtime_db = eval { $db->get_doc_attr_by_uri("file:///$path", '@mtime') }; + $mtime_db ||= -2; if ($mtime == $mtime_db) { print STDERR "# same: $path $mtime\n" if ($verbose); @@ -220,15 +236,16 @@ } else { # return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i); - if (-f $path && $type =~ m/html/ || - ($path !~ m/\.(php|pl|txt|info|log|text)$/io) + if (-f $path && + $type !~ m/html/ && + $path !~ m/\.(php|pl|txt|info|log|text)$/io ) { - dump_contents($db, '', $mtime, $path, -s $path); + dump_contents($db, '', $mtime, $path, -s $path) if ($all); return; } # skip index files - return if (m/index_[a-z]\.html*/i || m/index_symbol\.html*/i); + return if ($path =~ m/index_(?:[a-z]+|symbol)\.html*/i); open(F,"$path") || die "can't open file: $path"; print STDERR "$path ($type)" if ($verbose);