7 |
use Text::Iconv; |
use Text::Iconv; |
8 |
#use File::MMagic; |
#use File::MMagic; |
9 |
use File::MMagic::XS qw/:compat/; |
use File::MMagic::XS qw/:compat/; |
10 |
|
use Time::HiRes qw/time/; |
11 |
|
|
12 |
my $collection; # name which will be inserted |
my $collection; # name which will be inserted |
13 |
my $path_add; # add additional info in path |
my $path_add; # add additional info in path |
50 |
#$basedir =~ s,/[^/]+$,/,; |
#$basedir =~ s,/[^/]+$,/,; |
51 |
#require "$basedir/filter.pm"; |
#require "$basedir/filter.pm"; |
52 |
|
|
53 |
|
my $docs = 0; |
54 |
|
my $start_t = time(); |
55 |
|
|
56 |
my $filter; |
my $filter; |
57 |
foreach my $f (qw/pdftotext pstotext/) { |
foreach my $f (qw/pdftotext pstotext/) { |
58 |
my $w = which($f); |
my $w = which($f); |
80 |
no_chdir => 1, |
no_chdir => 1, |
81 |
}, $dir); |
}, $dir); |
82 |
|
|
83 |
|
my $dur = (time() - $start_t) || 1; |
84 |
|
printf STDERR "%d documents in %.2fs [%.2f docs/s]\n", $docs, $dur, ($docs / $dur); |
85 |
|
|
86 |
exit; |
exit; |
87 |
|
|
135 |
# register the document object to the database |
# register the document object to the database |
136 |
$db->put_doc($doc); |
$db->put_doc($doc); |
137 |
|
|
138 |
|
$docs++; |
139 |
|
|
140 |
} |
} |
141 |
|
|
142 |
sub filter_to_pages { |
sub filter_to_pages { |