Patch for trunk/scripts/est-spider, revision 179 -> 197.

Summary of the changes carried by this patch:
  * new --skip-images option ($skip_images): when set, files whose type
    matches image/* are not indexed;
  * filter detection tests the result of which() ($w) instead of the
    filter name ($f);
  * a node without ImageNumber now does "return $n" instead of "next";
  * one more substitution is applied to $contents before tags are stripped;
  * the stored @mtime is not looked up when --force is given.

Review notes:
  * This patch was recovered from a copy in which every line break and all
    indentation had been collapsed. Line breaks and blank context lines
    were rebuilt from the hunk header counts; indentation is a best guess.
    Apply with "patch -l" (ignore whitespace) and check the result.
  * Hunk "-309,7 +313,8" differs from the recovered text: the recovered
    line read "eval { ... } unless ($force);", which discarded the value
    returned by eval, so $mtime_db always ended up as -2 and the
    "$mtime == $mtime_db" check could never succeed. The eval result is
    now assigned to $mtime_db.
  * Hunk "-237,6 +240,7": the pattern of the added "s###gis" substitution
    is empty in the recovered text. NOTE(review): presumably the original
    pattern was lost together with the whitespace; an empty pattern makes
    Perl reuse the last successfully matched pattern. Confirm against the
    repository before applying.

--- trunk/scripts/est-spider 2006/08/25 11:59:04 179
+++ trunk/scripts/est-spider 2007/01/05 22:19:01 197
@@ -20,6 +20,7 @@
 my $debug = 0;
 my $force = 0;
 my $all = 0;
+my $skip_images = 0;
 
 my $result = GetOptions(
 	"collection=s" => \$collection,
@@ -29,6 +30,7 @@
 	"exclude=s" => \$exclude,
 	"force!" => \$force,
 	"all!" => \$all,
+	"skip-images!" => \$skip_images,
 );
 
 my ($node_url,$dir) = @ARGV;
@@ -40,6 +42,7 @@
 	--collection="name of collection"
 	--path=/path/to/add/at/end
 	--exclude=regex_to_exclude
+	--skip-images
 	--verbose
 	--force
 	--debug
@@ -61,7 +64,7 @@
 my $filter;
 foreach my $f (qw/pdftotext pstotext/) {
 	my $w = which($f);
-	if ($f) {
+	if ($w) {
 		$filter->{$f} = $w;
 		print STDERR "using $f filter at $w\n" if ($verbose);
 	}
@@ -122,7 +125,7 @@
 
 	$n->{path} = $path;
 
-	my $nr = $n->{ImageNumber} || next;
+	my $nr = $n->{ImageNumber} || return $n;
 
 	if ($nr == 27) {
 		$meta->{title} = $n->{Name};
@@ -237,6 +240,7 @@
 
 	if ($contents) {
 		# html2text
+		$contents =~ s###gis;
 		$contents =~ s#<[^>]+/*>##gs;
 		$contents =~ s#\s\s+# #gs;
 
@@ -309,7 +313,8 @@
 	return if (! $all && -d $path);
 
 	my $mtime = (stat($path))[9] || -1;
-	my $mtime_db = eval { $db->get_doc_attr_by_uri("file:///$path", '@mtime') };
+	my $mtime_db;
+	$mtime_db = eval { $db->get_doc_attr_by_uri("file:///$path", '@mtime') } unless ($force);
 	$mtime_db ||= -2;
 
 	if ($mtime == $mtime_db) {
@@ -344,6 +349,9 @@
 			warn "skipping '$path', no pstotext filter\n" if ($verbose);
 			return;
 		}
+	} elsif ($type =~ m!^image/! && $skip_images) {
+		warn "skipping image '$path'\n" if ($verbose);
+		return; # don't index images
 	} else {
 
 		# return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);