/[Search-Estraier]/trunk/scripts/est-spider
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/scripts/est-spider

Parent Directory Parent Directory | Revision Log Revision Log


Revision 181 - (show annotations)
Sat Aug 26 22:33:34 2006 UTC (17 years, 7 months ago) by dpavlin
File size: 8193 byte(s)
remove script from html when converting it to text
1 #!/usr/bin/perl -w
2 use strict;
3 use File::Find;
4 use Getopt::Long;
5 use File::Which;
6 use Search::Estraier;
7 use Text::Iconv;
8 #use File::MMagic;
9 use File::MMagic::XS qw/:compat/;
10 use Time::HiRes qw/time/;
11 use HTML::TreeBuilder;
12 use Data::Dump qw/dump/;
13
#
# Command-line options.  Each variable below is filled in by GetOptions().
#
my $collection;	# name which will be inserted
my $path_add;	# add additional info in path
my $verbose;
my $exclude;	# regex; matching paths are skipped in dump_contents()

#$verbose = 1;
my $debug = 0;
my $force = 0;
my $all = 0;

my $result = GetOptions(
	"collection=s" => \$collection,
	"path=s" => \$path_add,
	"verbose!" => \$verbose,
	"debug!" => \$debug,
	"exclude=s" => \$exclude,
	"force!" => \$force,
	"all!" => \$all,
);

my ($node_url,$dir) = @ARGV;

# GetOptions() returns false when it met an unknown or malformed option;
# treat that like missing arguments and show the usage text instead of
# silently carrying on with half-parsed options.
die <<"_END_OF_USAGE_" if (! $result || ! $node_url || ! $dir);
usage: $0 http://localhost:1978/node/my_dir /path/to/directory

options:
--collection="name of collection"
--path=/path/to/add/at/end
--exclude=regex_to_exclude
--verbose
--force
--debug
--all save placeholders for all files
_END_OF_USAGE_

if (! -e $dir) {
	warn "directory $dir doesn't exist, skipping\n";
	exit 1;
}

#my $basedir = $0;
#$basedir =~ s,/[^/]+$,/,;
#require "$basedir/filter.pm";

# number of documents stored so far (incremented by dump_contents())
my $docs = 0;
my $start_t = time();

# external converters found in PATH: filter name => full path of the binary
my $filter;
foreach my $f (qw/pdftotext pstotext/) {
	my $w = which($f);
	if ($w) {
		$filter->{$f} = $w;
		print STDERR "using $f filter at $w\n" if ($verbose);
	}
}

#my $mm = new File::MMagic('/usr/share/misc/file/magic');
my $mm = File::MMagic::XS->new();

# titles and text are converted from iso-8859-2 to utf-8 before storing
my $iconv = Text::Iconv->new('iso-8859-2', 'utf-8');

# unbuffer both output streams so progress output shows up immediately
select(STDERR); $|=1;
select(STDOUT); $|=1;

my $db = Search::Estraier::Node->new(
	url => $node_url,
	user => 'admin',
	passwd => 'admin',
	croak_on_error => 1,
	create => 1,
);
#
# Look for an HTML Help table-of-contents (.hhc) file below $dir.
# When one is found, its tree is used further down to derive titles.
#

my $hhc_file;

# File::Find callback; because no_chdir is set, $_ is the path as seen
# from the starting directory.  Every match overwrites $hhc_file, so the
# last .hhc file visited is the one that is kept.
my $remember_hhc = sub {
	if ( m!\.hhc$!i ) {
		$hhc_file = $_;
		warn "using $hhc_file for tree structure\n";
	}
};

my %hhc_find_options = (
	wanted      => $remember_hhc,
	follow      => 1,
	follow_skip => 2,
	no_chdir    => 1,
);

find(\%hhc_find_options, $dir);
101
# Information extracted from the .hhc file (left undef when there is none).
# Keys written below: title, index_path, toc_path, foreword_path and
# path2title (lower-cased file path => ' :: '-joined title).
my $meta;

# Collect the <param name="..." value="..."> children of one element into
# a hash and record what the entry means in $meta.
#
# Returns nothing when the entry has no 'Local' param (a warning with the
# collected params is printed).  Otherwise returns the hash reference, with
# an extra 'path' key holding the lower-cased "$dir/Local" path.  Entries
# without a true ImageNumber are returned as-is, without touching $meta.
#
# This sub used to leave via `next` when ImageNumber was missing, relying on
# the loop of its caller; it now returns normally and the caller decides.
sub param {
	my ($el) = @_;

	my $n;
	foreach my $p ( $el->find('param') ) {
		$n->{ $p->attr('name') } = $p->attr('value');
	}

	if ( ! defined($n->{Local}) ) {
		warn "### skipped = ",dump($n),$/;
		return;
	}

	my $path = $dir . '/' . $n->{Local};
	$path =~ s!//!/!g;
	$path = lc($path);

	$n->{path} = $path;

	my $nr = $n->{ImageNumber};
	return $n unless ($nr);

	if ($nr == 27) {
		$meta->{title} = $n->{Name};
		$meta->{index_path} = $path;
	} elsif ($nr == 21) {
		$meta->{toc_path} = $path;
	} elsif ($nr == 1) {
		$meta->{foreword_path} = $path;
	} elsif ($nr == 11) {
		# nop
	} else {
		warn "unknown ImageNumber: $nr\n";
	}

	return $n;
}

if ($hhc_file) {

	my $tree = HTML::TreeBuilder->new;
	$tree->parse_file($hhc_file);

	# names of the entries on the way down to the current one, by level
	my @prefix;

	foreach my $e ( $tree->look_down( sub { $_[0]->tag =~ m/(object)/ } ) ) {

#		printf("%05s %s\n", $e->parent->address(), $e->as_HTML() );

		# nesting level of this entry, derived from its depth in the tree
		my $level = ($e->depth() / 2) - 1;

		$prefix[ 0 ] = $meta->{title} || '';

		my $n = param($e);

		# entries with a target but without ImageNumber are ignored
		# completely (they must not change @prefix either)
		next if ( $n && ! $n->{ImageNumber} );

		$prefix[ $level ] = $n->{Name};

		next unless ($n->{path});

		# title is every non-empty name from the top down to this level
		my @p;
		foreach my $i ( 0 .. $level ) {
			push @p, $prefix[ $i ] if ($prefix[ $i ]);
		}

		$meta->{path2title}->{ $n->{path} } = join(' :: ', @p );

	}

	$tree->delete;

}
178
179
# Walk $dir and hand every entry to file()
my %spider_options = (
	wanted      => \&file,
	follow      => 1,
	follow_skip => 2,
	no_chdir    => 1,
);
find(\%spider_options, $dir);

# Report throughput; a zero duration is replaced by 1 so that the
# division below cannot fail.
my $elapsed = time() - $start_t;
$elapsed = 1 unless ($elapsed);
my $rate = $docs / $elapsed;
printf STDERR "%d documents in %.2fs [%.2f docs/s]\n", $docs, $elapsed, $rate;

# issue the 'sync' master action on the node before leaving
$db->master( action => 'sync' );


exit;
195
# Build one Search::Estraier document and store it in the node.
#
#   $db       - node object; put_doc() is called on it
#   $contents - raw text or html; undef means "nothing to do"
#   $mtime    - stored as the @mtime attribute
#   $path     - used for the @uri attribute, as hidden text and as the
#               fallback title
#   $size     - optional; when false the byte length of $contents is used
#
# Reads the file-level $exclude, $verbose, $debug, $meta and $iconv, and
# increments $docs for every document stored.
sub dump_contents {
	my ($db,$contents,$mtime,$path,$size) = @_;

	# don't die on empty files
	defined($contents) or return;

	if ($exclude && $path =~ m/$exclude/i) {
		$verbose and print STDERR "skip: $path\n";
		return;
	}

	# from here on lengths are counted in bytes
	use bytes;
	$size = length $contents unless ($size);

	$verbose and print STDERR " [$size]";

	my $doc = Search::Estraier::Document->new;

	# title: the .hhc tree wins, then the html <title>, then the path
	my $title;
	my $lc_path = lc($path);

	if ( defined($meta->{path2title}->{$lc_path}) ) {
		$title = $meta->{path2title}->{$lc_path};
		warn " $title\n";
	} else {
		if ($contents =~ m#<title>(.+?)</title>#is) {
			$title = $1;
		}

		# chop long titles to 100 chars
		if ($title && length($title) > 100) {
			$title = substr($title, 0, 100) . '...';
		}

		# use path if no title is found
		$title = $path unless ($title);
	}

	# document attributes, added in this order
	my @attributes = (
		[ '@uri',   "file:///$path" ],
		[ '@title', $iconv->convert($title) ],
		[ '@size',  $size ],
		[ '@mtime', $mtime ],
	);
	foreach my $pair (@attributes) {
		$doc->add_attr($pair->[0], $pair->[1]);
	}

	if ($contents) {
		# html2text: drop scripts, then all tags, then squeeze whitespace
		for ($contents) {
			s#<script.*?</script>##gis;
			s#<[^>]+/*>##gs;
			s#\s\s+# #gs;
		}

		$doc->add_text($iconv->convert($contents));
	}

	# path and title also go in as hidden text (the latter boosts the title)
	$doc->add_hidden_text($path);
	$doc->add_hidden_text($title);

	$debug and print $doc->dump_draft;

	# register the document object to the database
	$db->put_doc($doc);

	$docs++;

}
259
# Run an external converter and store its output page by page.
#
#   $path    - file being converted; used in messages, titles and URIs
#   $mtime   - passed through to dump_contents()
#   $command - shell command line whose standard output is read
#
# The output is split on form feeds.  A placeholder document is stored for
# $path itself and every non-blank page is stored as "$path#<page number>"
# with a title ending in ":: page <page number>".  Returns early when the
# command produced no output; dies when the command cannot be started.
sub filter_to_pages {
	my ($path, $mtime, $command) = @_;

	print STDERR "$path {converting}" if ($verbose);

	# lexical handle and loop variable: this runs inside a File::Find
	# callback, so the global $_ must not be overwritten
	open(my $pipe, '-|', $command) || die "can't open $command with '$path': $!";
	my $html;
	while (my $line = <$pipe>) {
		$html .= $line;
	}
	# a failing converter may still have produced usable output, so
	# report the failure but carry on
	close($pipe) or warn "filter for '$path' failed (status $?)\n";

	return if (! $html);

	my $file_only = $path;
	$file_only =~ s/^.*\/([^\/]+)$/$1/g;

	# fallback wrapper for converters that emit plain text; the page
	# placeholder is added to the title exactly once further down
	my ($pre_html,$pages,$post_html) = ("<html><head><title>$path</title></head><body><pre>",$html,'</pre></body></html>');

	($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);

	if ($collection) {
		$pre_html =~ s/<title>(.+?)<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;
	} else {
		$pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si ||
		$pre_html =~ s/<title><\/title>/<title>$file_only :: page ##page_nr##<\/title>/si;
	}

	# save empty entry as a placeholder
	dump_contents($db, ' ', $mtime, "$path");

	my $page_nr = 1;
	foreach my $page (split(/\f/s,$pages)) {
		print STDERR " $page_nr" if ($verbose);
		my $pre_tmp = $pre_html;
		# only the number is inserted; the closing </title> is already there
		$pre_tmp =~ s/##page_nr##/$page_nr/s;
		dump_contents($db, $pre_tmp . $page . $post_html, $mtime, "$path#$page_nr") if ($page !~ m/^\s*$/s);
		$page_nr++;
	}

}
303
# File::Find callback: decide whether the entry in $_ has to be (re)indexed
# and store it.
#
# Skipped: symlinks (unless --force), anything below a .svn directory,
# names ending in ~, .bak or .gif, directories (unless --all), *.au files,
# index_*.html files and entries whose stored @mtime equals the current
# one (unless --force).
#
# PDF and PostScript files go through filter_to_pages() when the matching
# converter was found.  Files detected as html, or named *.php, *.pl,
# *.txt, *.info, *.log or *.text, are read and stored directly.  Any other
# plain file gets an empty placeholder document when --all is given.
sub file {

	my $path = $_;
	my $contents;

	# dots are escaped so that only real .svn/.bak/.gif names match
	return if (! $force && -l $path || $path =~ m#/\.svn# || $path =~ m/(?:~|\.bak|\.gif)$/);
	return if (! $all && -d $path);

	my $mtime = (stat($path))[9] || -1;
	my $mtime_db = eval { $db->get_doc_attr_by_uri("file:///$path", '@mtime') };
	# directly stored files carry the --path suffix in their URI (see
	# the end of this sub), so look that one up as well
	if (! $mtime_db && $path_add) {
		$mtime_db = eval { $db->get_doc_attr_by_uri("file:///$path $path_add", '@mtime') };
	}
	$mtime_db ||= -2;

	if ($mtime == $mtime_db) {
		print STDERR "# same: $path $mtime\n" if ($verbose);
		return unless($force);
	} else {
		print STDERR "# changed: $path $mtime != $mtime_db\n" if ($debug);
	}

	# skip files on which File::MMagic::XS croaks
	if ($path =~ m#\.au$#) {
		warn "skipped '$path' to prevent File::MMagic::XS bug\n" if ($debug);
		return;
	}

	my $type = $mm->checktype_filename($path);
	$type = '' unless (defined($type));
	$type =~ s/\s+/ /gs;

	print STDERR "# $path $type\n" if ($debug);

	# The converter commands are run through the shell, so the file name
	# is wrapped in single quotes (with embedded single quotes escaped)
	# to keep $, backticks and double quotes in names from being
	# interpreted.
	(my $shell_path = $path) =~ s/'/'\\''/g;
	$shell_path = "'$shell_path'";

	if ($type =~ m/pdf/i) {
		if ($filter->{pdftotext}) {
			filter_to_pages($path, $mtime, qq( $filter->{pdftotext} -htmlmeta $shell_path - ));
		} else {
			warn "skipping '$path', no pdftotext filter\n" if ($verbose);
			return;
		}
	} elsif ($type eq 'application/postscript') {
		if ($filter->{pstotext}) {
			filter_to_pages($path, $mtime, qq( $filter->{pstotext} $shell_path ));
		} else {
			warn "skipping '$path', no pstotext filter\n" if ($verbose);
			return;
		}
	} else {

#		return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);
		if (-f $path &&
			$type !~ m/html/ &&
			$path !~ m/\.(php|pl|txt|info|log|text)$/io
		) {
			dump_contents($db, '', $mtime, $path, -s $path) if ($all);
			return;
		}

		# skip index files
		return if ($path =~ m/index_(?:[a-z]+|symbol)\.html*/i);

		# three-argument open: characters such as > or | in a file
		# name must not be taken as an open mode
		open(my $fh, '<', $path) || die "can't open file: $path: $!";
		print STDERR "$path ($type)" if ($verbose);
		# lexical loop variable keeps File::Find's $_ intact
		while (my $line = <$fh>) {
			$contents .= $line;
		}
		close($fh);
		$contents .= "\n\n";

		#$contents = filter($contents,$collection);

		# add optional components to path
		$path .= " $path_add" if ($path_add);

		dump_contents($db, $contents, $mtime, $path);
	}

	print STDERR "\n" if ($verbose);
#	die "zero size content in '$path'" if (! $contents);

}
381

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26