20 |
my $debug = 0; |
my $debug = 0; |
21 |
my $force = 0; |
my $force = 0; |
22 |
my $all = 0; |
my $all = 0; |
23 |
|
my $skip_images = 0; |
24 |
|
|
25 |
my $result = GetOptions( |
my $result = GetOptions( |
26 |
"collection=s" => \$collection, |
"collection=s" => \$collection, |
30 |
"exclude=s" => \$exclude, |
"exclude=s" => \$exclude, |
31 |
"force!" => \$force, |
"force!" => \$force, |
32 |
"all!" => \$all, |
"all!" => \$all, |
33 |
|
"skip-images!" => \$skip_images, |
34 |
); |
); |
35 |
|
|
36 |
my ($node_url,$dir) = @ARGV; |
my ($node_url,$dir) = @ARGV; |
42 |
--collection="name of collection" |
--collection="name of collection" |
43 |
--path=/path/to/add/at/end |
--path=/path/to/add/at/end |
44 |
--exclude=regex_to_exclude |
--exclude=regex_to_exclude |
45 |
|
--skip-images |
46 |
--verbose |
--verbose |
47 |
--force |
--force |
48 |
--debug |
--debug |
64 |
my $filter; |
my $filter; |
65 |
foreach my $f (qw/pdftotext pstotext/) { |
foreach my $f (qw/pdftotext pstotext/) { |
66 |
my $w = which($f); |
my $w = which($f); |
67 |
if ($f) { |
if ($w) { |
68 |
$filter->{$f} = $w; |
$filter->{$f} = $w; |
69 |
print STDERR "using $f filter at $w\n" if ($verbose); |
print STDERR "using $f filter at $w\n" if ($verbose); |
70 |
} |
} |
125 |
|
|
126 |
$n->{path} = $path; |
$n->{path} = $path; |
127 |
|
|
128 |
my $nr = $n->{ImageNumber} || next; |
my $nr = $n->{ImageNumber} || return $n; |
129 |
|
|
130 |
if ($nr == 27) { |
if ($nr == 27) { |
131 |
$meta->{title} = $n->{Name}; |
$meta->{title} = $n->{Name}; |
240 |
|
|
241 |
if ($contents) { |
if ($contents) { |
242 |
# html2text |
# html2text |
243 |
|
$contents =~ s#<script.*?</script>##gis; |
244 |
$contents =~ s#<[^>]+/*>##gs; |
$contents =~ s#<[^>]+/*>##gs; |
245 |
$contents =~ s#\s\s+# #gs; |
$contents =~ s#\s\s+# #gs; |
246 |
|
|
313 |
return if (! $all && -d $path); |
return if (! $all && -d $path); |
314 |
|
|
315 |
my $mtime = (stat($path))[9] || -1; |
my $mtime = (stat($path))[9] || -1; |
316 |
my $mtime_db = eval { $db->get_doc_attr_by_uri("file:///$path", '@mtime') }; |
my $mtime_db; |
317 |
|
eval { $db->get_doc_attr_by_uri("file:///$path", '@mtime') } unless ($force); |
318 |
$mtime_db ||= -2; |
$mtime_db ||= -2; |
319 |
|
|
320 |
if ($mtime == $mtime_db) { |
if ($mtime == $mtime_db) { |
349 |
warn "skipping '$path', no pstotext filter\n" if ($verbose); |
warn "skipping '$path', no pstotext filter\n" if ($verbose); |
350 |
return; |
return; |
351 |
} |
} |
352 |
|
} elsif ($type =~ m!^image/! && $skip_images) { |
353 |
|
warn "skipping image '$path'\n" if ($verbose); |
354 |
|
return; # don't index images |
355 |
} else { |
} else { |
356 |
|
|
357 |
# return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i); |
# return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i); |