/[Search-Estraier]/trunk/scripts/est-spider
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/scripts/est-spider

Parent Directory Parent Directory | Revision Log Revision Log


Revision 179 - (hide annotations)
Fri Aug 25 11:59:04 2006 UTC (17 years, 8 months ago) by dpavlin
File size: 8150 byte(s)
parse windows help file index (hhc) if available for page titles
1 dpavlin 64 #!/usr/bin/perl -w
2     use strict;
3     use File::Find;
4     use Getopt::Long;
5     use File::Which;
6     use Search::Estraier;
7     use Text::Iconv;
8     #use File::MMagic;
9     use File::MMagic::XS qw/:compat/;
10 dpavlin 118 use Time::HiRes qw/time/;
11 dpavlin 179 use HTML::TreeBuilder;
12     use Data::Dump qw/dump/;
13 dpavlin 64
14     my $collection; # name which will be inserted
15     my $path_add; # add additional info in path
16     my $verbose;
17     my $exclude;
18    
19     #$verbose = 1;
20     my $debug = 0;
21     my $force = 0;
22 dpavlin 119 my $all = 0;
23 dpavlin 64
24     my $result = GetOptions(
25     "collection=s" => \$collection,
26     "path=s" => \$path_add,
27     "verbose!" => \$verbose,
28     "debug!" => \$debug,
29     "exclude=s" => \$exclude,
30     "force!" => \$force,
31 dpavlin 119 "all!" => \$all,
32 dpavlin 64 );
33    
34 dpavlin 89 my ($node_url,$dir) = @ARGV;
35 dpavlin 64
36 dpavlin 89 die <<"_END_OF_USAGE_" if (! $node_url || ! $dir);
37     usage: $0 http://localhost:1978/node/my_dir /path/to/directory
38    
39     options:
40     --collection="name of collection"
41     --path=/path/to/add/at/end
42     --exclude=regex_to_exclude
43     --verbose
44     --force
45     --debug
46 dpavlin 119 --all save placeholders for all files
47 dpavlin 89 _END_OF_USAGE_
48    
49 dpavlin 64 if (! -e $dir) {
50     warn "directory $dir doesn't exist, skipping\n";
51     exit 1;
52     }
53    
54     #my $basedir = $0;
55     #$basedir =~ s,/[^/]+$,/,;
56     #require "$basedir/filter.pm";
57    
58 dpavlin 118 my $docs = 0;
59     my $start_t = time();
60    
61 dpavlin 90 my $filter;
62     foreach my $f (qw/pdftotext pstotext/) {
63     my $w = which($f);
64     if ($f) {
65     $filter->{$f} = $w;
66     print STDERR "using $f filter at $w\n" if ($verbose);
67     }
68     }
69 dpavlin 64
70     #my $mm = new File::MMagic('/usr/share/misc/file/magic');
71     my $mm = new File::MMagic::XS();
72    
73     my $iconv = new Text::Iconv('iso-8859-2', 'utf-8');
74    
75     select(STDERR); $|=1;
76     select(STDOUT); $|=1;
77    
78 dpavlin 119 my $db = new Search::Estraier::Node(
79     url => $node_url,
80     user => 'admin',
81     passwd => 'admin',
82     croak_on_error => 1,
83 dpavlin 179 create => 1,
84 dpavlin 119 );
85 dpavlin 64
86 dpavlin 179 #
87     # check if hhc file exists, and if it does, extract information from it
88     #
89    
90     my $hhc_file;
91     # try to find hhc
92     find({ wanted => sub {
93     return unless( m!\.hhc$!i );
94     $hhc_file = $_;
95     warn "using $hhc_file for tree structure\n";
96     },
97     follow => 1,
98     follow_skip => 2,
99     no_chdir => 1,
100     }, $dir);
101    
102     my $meta;
103    
104     if ($hhc_file) {
105    
106     sub param {
107     my ($el) = @_;
108    
109     my $n;
110     foreach my $p ( $el->find('param') ) {
111     $n->{ $p->attr('name') } = $p->attr('value');
112     }
113    
114     if ( ! defined($n->{Local}) ) {
115     warn "### skipped = ",dump($n),$/;
116     return;
117     }
118    
119     my $path = $dir . '/' . $n->{Local};
120     $path =~ s!//!/!g;
121     $path = lc($path);
122    
123     $n->{path} = $path;
124    
125     my $nr = $n->{ImageNumber} || next;
126    
127     if ($nr == 27) {
128     $meta->{title} = $n->{Name};
129     $meta->{index_path} = $path;
130     } elsif ($nr == 21) {
131     $meta->{toc_path} = $path;
132     } elsif ($nr == 1) {
133     $meta->{foreword_path} = $path;
134     } elsif ($nr == 11) {
135     # nop
136     } else {
137     warn "unknown ImageNumber: $nr\n";
138     }
139    
140     return $n;
141     }
142    
143     my $tree = HTML::TreeBuilder->new;
144     $tree->parse_file($hhc_file);
145    
146     my $prefix = $collection ? ( $collection . ' :: ' ) : '';
147    
148     my @prefix;
149     my $depth = 0;
150    
151     foreach my $e ( $tree->look_down( sub { $_[0]->tag =~ m/(object)/ } ) ) {
152    
153     # printf("%05s %s\n", $e->parent->address(), $e->as_HTML() );
154    
155     my $l = ($e->depth() / 2) - 1;
156    
157     $prefix[ 0 ] = $meta->{title} || '';
158    
159     my $n = param($e);
160     $prefix[ $l ] = $n->{Name};
161    
162     next unless ($n->{path});
163    
164     my $t = '';
165     my @p;
166     foreach my $i ( 0 .. $l ) {
167     push @p, $prefix[ $i ] if ($prefix[ $i ]);
168     }
169     $t = join(' :: ', @p ) if (@p);
170    
171     $meta->{path2title}->{ $n->{path} } = $t;
172    
173     }
174    
175     $tree->delete;
176    
177     }
178    
179    
180 dpavlin 64 find({ wanted => \&file,
181     follow => 1,
182     follow_skip => 2,
183     no_chdir => 1,
184     }, $dir);
185    
186 dpavlin 118 my $dur = (time() - $start_t) || 1;
187     printf STDERR "%d documents in %.2fs [%.2f docs/s]\n", $docs, $dur, ($docs / $dur);
188 dpavlin 64
189 dpavlin 178 $db->master(
190     action => 'sync'
191     );
192    
193    
194 dpavlin 64 exit;
195    
196 dpavlin 90 sub dump_contents {
197     my ($db,$contents,$mtime,$path,$size) = @_;
198 dpavlin 64
199 dpavlin 90 return unless (defined($contents)); # don't die on empty files
200 dpavlin 64
201     if ($exclude && $path =~ m/$exclude/i) {
202     print STDERR "skip: $path\n" if ($verbose);
203     return;
204     }
205    
206     use bytes;
207 dpavlin 90 if (! $size) {
208     $size = length $contents;
209     }
210 dpavlin 64
211     print STDERR " [$size]" if ($verbose);
212    
213     # create a document object
214 dpavlin 89 my $doc = new Search::Estraier::Document;
215 dpavlin 64
216 dpavlin 179 my $title;
217 dpavlin 64
218 dpavlin 179 if ( defined($meta->{path2title}->{lc($path)}) ) {
219     $title = $meta->{path2title}->{lc($path)};
220     warn " $title\n";
221     } else {
222 dpavlin 64
223 dpavlin 179 $title = $1 if ($contents =~ m#<title>(.+?)</title>#is);
224    
225     # chop long titles to 100 chars
226     $title = substr($title, 0, 100) . '...' if ($title && length($title) > 100);
227     # use path if no title is found
228     $title ||= $path;
229    
230     }
231    
232 dpavlin 64 # add attributes to the document object
233     $doc->add_attr('@uri', "file:///$path");
234     $doc->add_attr('@title', $iconv->convert($title));
235     $doc->add_attr('@size', $size);
236     $doc->add_attr('@mtime', $mtime);
237    
238 dpavlin 90 if ($contents) {
239     # html2text
240     $contents =~ s#<[^>]+/*>##gs;
241     $contents =~ s#\s\s+# #gs;
242 dpavlin 64
243 dpavlin 90 $doc->add_text($iconv->convert($contents));
244     }
245     # store path
246     $doc->add_hidden_text($path);
247     # boost title
248     $doc->add_hidden_text($title);
249 dpavlin 64
250 dpavlin 120 print $doc->dump_draft if ($debug);
251 dpavlin 64
252     # register the document object to the database
253 dpavlin 89 $db->put_doc($doc);
254 dpavlin 64
255 dpavlin 118 $docs++;
256    
257 dpavlin 64 }
258    
259 dpavlin 90 sub filter_to_pages {
260     my ($path, $mtime, $command) = @_;
261 dpavlin 64
262     print STDERR "$path {converting}" if ($verbose);
263    
264 dpavlin 90 open(F,"$command |") || die "can't open $command with '$path': $!";
265 dpavlin 64 my $html;
266     while(<F>) {
267     $html .= $_;
268     }
269     close(F);
270    
271     return if (! $html);
272    
273     my $file_only = $path;
274     $file_only =~ s/^.*\/([^\/]+)$/$1/g;
275    
276     my ($pre_html,$pages,$post_html) = ('<html><head><title>$path :: page ##page_nr##</title></head><body><pre>',$html,'</pre></body></html>');
277    
278     ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);
279    
280     if ($collection) {
281     $pre_html =~ s/<title>(.+?)<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;
282     } else {
283     $pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si ||
284     $pre_html =~ s/<title><\/title>/<title>$file_only :: page ##page_nr##<\/title>/si;
285     }
286    
287     # save empty entry as a placeholder
288     dump_contents($db, ' ', $mtime, "$path");
289    
290     my $page_nr = 1;
291     foreach my $page (split(/\f/s,$pages)) {
292     print STDERR " $page_nr" if ($verbose);
293     my $pre_tmp = $pre_html;
294     $pre_tmp =~ s/##page_nr##/$page_nr<\/title>/s;
295     dump_contents($db, $pre_tmp . $page . $post_html, $mtime, "$path#$page_nr") if ($page !~ m/^\s*$/s);
296     $page_nr++;
297     }
298    
299 dpavlin 90
300    
301     }
302    
303     sub file {
304    
305     my $path = $_;
306     my $contents;
307    
308 dpavlin 119 return if (! $force && -l $path || $path =~ m#/.svn# || $path =~ m/(~|.bak|.gif)$/);
309 dpavlin 121 return if (! $all && -d $path);
310 dpavlin 90
311     my $mtime = (stat($path))[9] || -1;
312 dpavlin 119 my $mtime_db = eval { $db->get_doc_attr_by_uri("file:///$path", '@mtime') };
313     $mtime_db ||= -2;
314 dpavlin 90
315     if ($mtime == $mtime_db) {
316     print STDERR "# same: $path $mtime\n" if ($verbose);
317     return unless($force);
318 dpavlin 64 } else {
319 dpavlin 90 print STDERR "# changed: $path $mtime != $mtime_db\n" if ($debug);
320     }
321 dpavlin 64
322 dpavlin 90 # skip files on which File::MMagic::XS croaks
323     if ($path =~ m#\.au$#) {
324     warn "skipped '$path' to prevent File::MMagic::XS bug\n" if ($debug);
325     return;
326     }
327    
328     my $type = $mm->checktype_filename($path);
329     $type =~ s/\s+/ /gs;
330    
331     print STDERR "# $path $type\n" if ($debug);
332    
333     if ($type =~ m/pdf/i) {
334     if ($filter->{pdftotext}) {
335     filter_to_pages($path, $mtime, qq( $filter->{pdftotext} -htmlmeta "$path" - ));
336     } else {
337     warn "skipping '$path', no pdftotext filter\n" if ($verbose);
338     return;
339     }
340     } elsif ($type eq 'application/postscript') {
341     if ($filter->{pstotext}) {
342     filter_to_pages($path, $mtime, qq( $filter->{pstotext} "$path" ));
343     } else {
344     warn "skipping '$path', no pstotext filter\n" if ($verbose);
345     return;
346     }
347     } else {
348    
349 dpavlin 64 # return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);
350 dpavlin 119 if (-f $path &&
351     $type !~ m/html/ &&
352     $path !~ m/\.(php|pl|txt|info|log|text)$/io
353 dpavlin 77 ) {
354 dpavlin 119 dump_contents($db, '', $mtime, $path, -s $path) if ($all);
355 dpavlin 90 return;
356 dpavlin 77 }
357 dpavlin 64
358     # skip index files
359 dpavlin 119 return if ($path =~ m/index_(?:[a-z]+|symbol)\.html*/i);
360 dpavlin 64
361     open(F,"$path") || die "can't open file: $path";
362     print STDERR "$path ($type)" if ($verbose);
363     while(<F>) {
364     $contents .= "$_";
365     }
366     $contents .= "\n\n";
367    
368     #$contents = filter($contents,$collection);
369    
370     # add optional components to path
371     $path .= " $path_add" if ($path_add);
372    
373     dump_contents($db, $contents, $mtime, $path);
374     }
375    
376     print STDERR "\n" if ($verbose);
377     # die "zero size content in '$path'" if (! $contents);
378    
379     }
380    

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26