/[Search-Estraier]/trunk/scripts/est-spider
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/scripts/est-spider

Parent Directory Parent Directory | Revision Log Revision Log


Revision 179 - (show annotations)
Fri Aug 25 11:59:04 2006 UTC (17 years, 7 months ago) by dpavlin
File size: 8150 byte(s)
parse windows help file index (hhc) if available for page titles
1 #!/usr/bin/perl -w
2 use strict;
3 use File::Find;
4 use Getopt::Long;
5 use File::Which;
6 use Search::Estraier;
7 use Text::Iconv;
8 #use File::MMagic;
9 use File::MMagic::XS qw/:compat/;
10 use Time::HiRes qw/time/;
11 use HTML::TreeBuilder;
12 use Data::Dump qw/dump/;
13
14 my $collection; # name which will be inserted
15 my $path_add; # add additional info in path
16 my $verbose;
17 my $exclude;
18
19 #$verbose = 1;
20 my $debug = 0;
21 my $force = 0;
22 my $all = 0;
23
24 my $result = GetOptions(
25 "collection=s" => \$collection,
26 "path=s" => \$path_add,
27 "verbose!" => \$verbose,
28 "debug!" => \$debug,
29 "exclude=s" => \$exclude,
30 "force!" => \$force,
31 "all!" => \$all,
32 );
33
34 my ($node_url,$dir) = @ARGV;
35
36 die <<"_END_OF_USAGE_" if (! $node_url || ! $dir);
37 usage: $0 http://localhost:1978/node/my_dir /path/to/directory
38
39 options:
40 --collection="name of collection"
41 --path=/path/to/add/at/end
42 --exclude=regex_to_exclude
43 --verbose
44 --force
45 --debug
46 --all save placeholders for all files
47 _END_OF_USAGE_
48
49 if (! -e $dir) {
50 warn "directory $dir doesn't exist, skipping\n";
51 exit 1;
52 }
53
54 #my $basedir = $0;
55 #$basedir =~ s,/[^/]+$,/,;
56 #require "$basedir/filter.pm";
57
58 my $docs = 0;
59 my $start_t = time();
60
61 my $filter;
62 foreach my $f (qw/pdftotext pstotext/) {
63 my $w = which($f);
64 if ($f) {
65 $filter->{$f} = $w;
66 print STDERR "using $f filter at $w\n" if ($verbose);
67 }
68 }
69
70 #my $mm = new File::MMagic('/usr/share/misc/file/magic');
71 my $mm = new File::MMagic::XS();
72
73 my $iconv = new Text::Iconv('iso-8859-2', 'utf-8');
74
75 select(STDERR); $|=1;
76 select(STDOUT); $|=1;
77
78 my $db = new Search::Estraier::Node(
79 url => $node_url,
80 user => 'admin',
81 passwd => 'admin',
82 croak_on_error => 1,
83 create => 1,
84 );
85
86 #
87 # check if hhc file exists, and if it does, extract information from it
88 #
89
90 my $hhc_file;
91 # try to find hhc
92 find({ wanted => sub {
93 return unless( m!\.hhc$!i );
94 $hhc_file = $_;
95 warn "using $hhc_file for tree structure\n";
96 },
97 follow => 1,
98 follow_skip => 2,
99 no_chdir => 1,
100 }, $dir);
101
102 my $meta;
103
104 if ($hhc_file) {
105
106 sub param {
107 my ($el) = @_;
108
109 my $n;
110 foreach my $p ( $el->find('param') ) {
111 $n->{ $p->attr('name') } = $p->attr('value');
112 }
113
114 if ( ! defined($n->{Local}) ) {
115 warn "### skipped = ",dump($n),$/;
116 return;
117 }
118
119 my $path = $dir . '/' . $n->{Local};
120 $path =~ s!//!/!g;
121 $path = lc($path);
122
123 $n->{path} = $path;
124
125 my $nr = $n->{ImageNumber} || next;
126
127 if ($nr == 27) {
128 $meta->{title} = $n->{Name};
129 $meta->{index_path} = $path;
130 } elsif ($nr == 21) {
131 $meta->{toc_path} = $path;
132 } elsif ($nr == 1) {
133 $meta->{foreword_path} = $path;
134 } elsif ($nr == 11) {
135 # nop
136 } else {
137 warn "unknown ImageNumber: $nr\n";
138 }
139
140 return $n;
141 }
142
143 my $tree = HTML::TreeBuilder->new;
144 $tree->parse_file($hhc_file);
145
146 my $prefix = $collection ? ( $collection . ' :: ' ) : '';
147
148 my @prefix;
149 my $depth = 0;
150
151 foreach my $e ( $tree->look_down( sub { $_[0]->tag =~ m/(object)/ } ) ) {
152
153 # printf("%05s %s\n", $e->parent->address(), $e->as_HTML() );
154
155 my $l = ($e->depth() / 2) - 1;
156
157 $prefix[ 0 ] = $meta->{title} || '';
158
159 my $n = param($e);
160 $prefix[ $l ] = $n->{Name};
161
162 next unless ($n->{path});
163
164 my $t = '';
165 my @p;
166 foreach my $i ( 0 .. $l ) {
167 push @p, $prefix[ $i ] if ($prefix[ $i ]);
168 }
169 $t = join(' :: ', @p ) if (@p);
170
171 $meta->{path2title}->{ $n->{path} } = $t;
172
173 }
174
175 $tree->delete;
176
177 }
178
179
180 find({ wanted => \&file,
181 follow => 1,
182 follow_skip => 2,
183 no_chdir => 1,
184 }, $dir);
185
186 my $dur = (time() - $start_t) || 1;
187 printf STDERR "%d documents in %.2fs [%.2f docs/s]\n", $docs, $dur, ($docs / $dur);
188
189 $db->master(
190 action => 'sync'
191 );
192
193
194 exit;
195
196 sub dump_contents {
197 my ($db,$contents,$mtime,$path,$size) = @_;
198
199 return unless (defined($contents)); # don't die on empty files
200
201 if ($exclude && $path =~ m/$exclude/i) {
202 print STDERR "skip: $path\n" if ($verbose);
203 return;
204 }
205
206 use bytes;
207 if (! $size) {
208 $size = length $contents;
209 }
210
211 print STDERR " [$size]" if ($verbose);
212
213 # create a document object
214 my $doc = new Search::Estraier::Document;
215
216 my $title;
217
218 if ( defined($meta->{path2title}->{lc($path)}) ) {
219 $title = $meta->{path2title}->{lc($path)};
220 warn " $title\n";
221 } else {
222
223 $title = $1 if ($contents =~ m#<title>(.+?)</title>#is);
224
225 # chop long titles to 100 chars
226 $title = substr($title, 0, 100) . '...' if ($title && length($title) > 100);
227 # use path if no title is found
228 $title ||= $path;
229
230 }
231
232 # add attributes to the document object
233 $doc->add_attr('@uri', "file:///$path");
234 $doc->add_attr('@title', $iconv->convert($title));
235 $doc->add_attr('@size', $size);
236 $doc->add_attr('@mtime', $mtime);
237
238 if ($contents) {
239 # html2text
240 $contents =~ s#<[^>]+/*>##gs;
241 $contents =~ s#\s\s+# #gs;
242
243 $doc->add_text($iconv->convert($contents));
244 }
245 # store path
246 $doc->add_hidden_text($path);
247 # boost title
248 $doc->add_hidden_text($title);
249
250 print $doc->dump_draft if ($debug);
251
252 # register the document object to the database
253 $db->put_doc($doc);
254
255 $docs++;
256
257 }
258
259 sub filter_to_pages {
260 my ($path, $mtime, $command) = @_;
261
262 print STDERR "$path {converting}" if ($verbose);
263
264 open(F,"$command |") || die "can't open $command with '$path': $!";
265 my $html;
266 while(<F>) {
267 $html .= $_;
268 }
269 close(F);
270
271 return if (! $html);
272
273 my $file_only = $path;
274 $file_only =~ s/^.*\/([^\/]+)$/$1/g;
275
276 my ($pre_html,$pages,$post_html) = ('<html><head><title>$path :: page ##page_nr##</title></head><body><pre>',$html,'</pre></body></html>');
277
278 ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);
279
280 if ($collection) {
281 $pre_html =~ s/<title>(.+?)<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;
282 } else {
283 $pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si ||
284 $pre_html =~ s/<title><\/title>/<title>$file_only :: page ##page_nr##<\/title>/si;
285 }
286
287 # save empty entry as a placeholder
288 dump_contents($db, ' ', $mtime, "$path");
289
290 my $page_nr = 1;
291 foreach my $page (split(/\f/s,$pages)) {
292 print STDERR " $page_nr" if ($verbose);
293 my $pre_tmp = $pre_html;
294 $pre_tmp =~ s/##page_nr##/$page_nr<\/title>/s;
295 dump_contents($db, $pre_tmp . $page . $post_html, $mtime, "$path#$page_nr") if ($page !~ m/^\s*$/s);
296 $page_nr++;
297 }
298
299
300
301 }
302
303 sub file {
304
305 my $path = $_;
306 my $contents;
307
308 return if (! $force && -l $path || $path =~ m#/.svn# || $path =~ m/(~|.bak|.gif)$/);
309 return if (! $all && -d $path);
310
311 my $mtime = (stat($path))[9] || -1;
312 my $mtime_db = eval { $db->get_doc_attr_by_uri("file:///$path", '@mtime') };
313 $mtime_db ||= -2;
314
315 if ($mtime == $mtime_db) {
316 print STDERR "# same: $path $mtime\n" if ($verbose);
317 return unless($force);
318 } else {
319 print STDERR "# changed: $path $mtime != $mtime_db\n" if ($debug);
320 }
321
322 # skip files on which File::MMagic::XS croaks
323 if ($path =~ m#\.au$#) {
324 warn "skipped '$path' to prevent File::MMagic::XS bug\n" if ($debug);
325 return;
326 }
327
328 my $type = $mm->checktype_filename($path);
329 $type =~ s/\s+/ /gs;
330
331 print STDERR "# $path $type\n" if ($debug);
332
333 if ($type =~ m/pdf/i) {
334 if ($filter->{pdftotext}) {
335 filter_to_pages($path, $mtime, qq( $filter->{pdftotext} -htmlmeta "$path" - ));
336 } else {
337 warn "skipping '$path', no pdftotext filter\n" if ($verbose);
338 return;
339 }
340 } elsif ($type eq 'application/postscript') {
341 if ($filter->{pstotext}) {
342 filter_to_pages($path, $mtime, qq( $filter->{pstotext} "$path" ));
343 } else {
344 warn "skipping '$path', no pstotext filter\n" if ($verbose);
345 return;
346 }
347 } else {
348
349 # return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);
350 if (-f $path &&
351 $type !~ m/html/ &&
352 $path !~ m/\.(php|pl|txt|info|log|text)$/io
353 ) {
354 dump_contents($db, '', $mtime, $path, -s $path) if ($all);
355 return;
356 }
357
358 # skip index files
359 return if ($path =~ m/index_(?:[a-z]+|symbol)\.html*/i);
360
361 open(F,"$path") || die "can't open file: $path";
362 print STDERR "$path ($type)" if ($verbose);
363 while(<F>) {
364 $contents .= "$_";
365 }
366 $contents .= "\n\n";
367
368 #$contents = filter($contents,$collection);
369
370 # add optional components to path
371 $path .= " $path_add" if ($path_add);
372
373 dump_contents($db, $contents, $mtime, $path);
374 }
375
376 print STDERR "\n" if ($verbose);
377 # die "zero size content in '$path'" if (! $contents);
378
379 }
380

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26