/[Search-Estraier]/trunk/scripts/est-spider
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/scripts/est-spider

Parent Directory Parent Directory | Revision Log Revision Log


Revision 196 - (hide annotations)
Sun Nov 26 12:06:08 2006 UTC (17 years, 4 months ago) by dpavlin
File size: 8408 byte(s)
added --skip-images option
1 dpavlin 64 #!/usr/bin/perl -w
2     use strict;
3     use File::Find;
4     use Getopt::Long;
5     use File::Which;
6     use Search::Estraier;
7     use Text::Iconv;
8     #use File::MMagic;
9     use File::MMagic::XS qw/:compat/;
10 dpavlin 118 use Time::HiRes qw/time/;
11 dpavlin 179 use HTML::TreeBuilder;
12     use Data::Dump qw/dump/;
13 dpavlin 64
#
# Command-line configuration.
#
# Every option below is optional; the two positional arguments
# (node URL and directory to index) are required.
#
my $collection;     # name which will be inserted
my $path_add;       # add additional info in path
my $verbose;
my $exclude;        # regex; matching paths are not indexed

#$verbose = 1;
my $debug = 0;
my $force = 0;
my $all = 0;
my $skip_images = 0;

# Built once so that both failure modes below print the same help text.
my $usage = <<"_END_OF_USAGE_";
usage: $0 http://localhost:1978/node/my_dir /path/to/directory

options:
--collection="name of collection"
--path=/path/to/add/at/end
--exclude=regex_to_exclude
--skip-images
--verbose
--force
--debug
--all save placeholders for all files
_END_OF_USAGE_

my $result = GetOptions(
    "collection=s"  => \$collection,
    "path=s"        => \$path_add,
    "verbose!"      => \$verbose,
    "debug!"        => \$debug,
    "exclude=s"     => \$exclude,
    "force!"        => \$force,
    "all!"          => \$all,
    "skip-images!"  => \$skip_images,
);

# GetOptions returns false when it met an unknown or malformed option
# (it has already warned about it); don't start indexing with a
# half-understood command line.
die $usage unless ($result);

my ($node_url,$dir) = @ARGV;

die $usage if (! $node_url || ! $dir);

if (! -e $dir) {
    warn "directory $dir doesn't exist, skipping\n";
    exit 1;
}
56    
57     #my $basedir = $0;
58     #$basedir =~ s,/[^/]+$,/,;
59     #require "$basedir/filter.pm";
60    
# number of documents stored so far and wall-clock start, both used
# for the statistics line printed at the end of the run
my $docs = 0;
my $start_t = time();

# external converters: filter name => full path, only for those
# which are actually installed
my $filter;
for my $tool (qw/pdftotext pstotext/) {
    my $location = which($tool);
    next unless ($location);
    $filter->{$tool} = $location;
    print STDERR "using $tool filter at $location\n" if ($verbose);
}

# file type detection
#my $mm = new File::MMagic('/usr/share/misc/file/magic');
my $mm = File::MMagic::XS->new();

# titles and text are converted from iso-8859-2 to utf-8 before storing
my $iconv = Text::Iconv->new('iso-8859-2', 'utf-8');

# unbuffered STDERR and STDOUT; STDOUT is left as the selected handle
for my $handle (\*STDERR, \*STDOUT) {
    select($handle);
    $| = 1;
}

# node connection (note: credentials are hard-coded here)
my $db = Search::Estraier::Node->new(
    url             => $node_url,
    user            => 'admin',
    passwd          => 'admin',
    croak_on_error  => 1,
    create          => 1,
);
88 dpavlin 64
#
# check if hhc file exists, and if it does, extract information from it
#

my $hhc_file;
# try to find hhc; if the directory holds several of them, each one is
# reported and the one visited last is used
find(
    {
        no_chdir    => 1,
        follow      => 1,
        follow_skip => 2,
        wanted      => sub {
            if ( m!\.hhc$!i ) {
                $hhc_file = $_;
                warn "using $hhc_file for tree structure\n";
            }
        },
    },
    $dir
);
104    
# Information extracted from the hhc file:
#   title, index_path, toc_path, foreword_path - well-known pages
#   path2title - lower-cased file path => ' :: ' joined title
my $meta;

if ($hhc_file) {

    # Collect the <param name="..." value="..."> children of one element
    # into a hash and record well-known pages in $meta.
    #
    # Returns the hash reference with an additional 'path' key (built
    # from $dir and the Local param, lower-cased), or nothing if the
    # element has no Local param.
    sub param {
        my ($el) = @_;

        my $n;
        foreach my $p ( $el->find('param') ) {
            $n->{ $p->attr('name') } = $p->attr('value');
        }

        if ( ! defined($n->{Local}) ) {
            warn "### skipped = ",dump($n),$/;
            return;
        }

        my $path = $dir . '/' . $n->{Local};
        $path =~ s!//!/!g;
        $path = lc($path);

        $n->{path} = $path;

        # Without an ImageNumber there is nothing to classify. This used
        # to be "|| next", which left the *caller's* loop from inside
        # this sub; the caller now makes that decision itself.
        my $nr = $n->{ImageNumber};
        return $n unless ($nr);

        if ($nr == 27) {
            $meta->{title} = $n->{Name};
            $meta->{index_path} = $path;
        } elsif ($nr == 21) {
            $meta->{toc_path} = $path;
        } elsif ($nr == 1) {
            $meta->{foreword_path} = $path;
        } elsif ($nr == 11) {
            # nop
        } else {
            warn "unknown ImageNumber: $nr\n";
        }

        return $n;
    }

    my $tree = HTML::TreeBuilder->new;
    # parse_file returns undef when the file can't be opened; say so
    # instead of silently continuing without any titles
    $tree->parse_file($hhc_file)
        || warn "can't parse $hhc_file: $!\n";

    # names of the enclosing entries, indexed by nesting level
    my @prefix;

    foreach my $e ( $tree->look_down( sub { $_[0]->tag =~ m/(object)/ } ) ) {

#       printf("%05s %s\n", $e->parent->address(), $e->as_HTML() );

        my $level = ($e->depth() / 2) - 1;

        $prefix[ 0 ] = $meta->{title} || '';

        my $n = param($e);

        # entries which have a Local but no ImageNumber are ignored
        # completely (they don't even update @prefix)
        next if ($n && ! $n->{ImageNumber});

        $prefix[ $level ] = $n->{Name};

        next unless ($n->{path});

        # title is made from all non-empty names from the top level
        # down to this entry
        my @p = grep { $_ } @prefix[ 0 .. $level ];

        $meta->{path2title}->{ $n->{path} } = join(' :: ', @p);

    }

    $tree->delete;

}
181    
182    
# walk the directory and index every file through file()
find(
    {
        no_chdir    => 1,
        follow      => 1,
        follow_skip => 2,
        wanted      => \&file,
    },
    $dir
);

# statistics; a zero duration is replaced by 1 to avoid division by zero
my $dur = time() - $start_t;
$dur = 1 unless ($dur);
printf STDERR "%d documents in %.2fs [%.2f docs/s]\n", $docs, $dur, ($docs / $dur);

# ask the node master to sync
$db->master( action => 'sync' );


exit;
198    
# Store one document in the node.
#
# $db       - node object used for put_doc
# $contents - (HTML) text of the document; undef means "do nothing",
#             an empty string stores attributes only
# $mtime    - value of the @mtime attribute
# $path     - used for the @uri attribute ("file:///$path"), as hidden
#             text and as the title of last resort
# $size     - value of the @size attribute; when false, the byte
#             length of $contents is used
#
# The title comes from the hhc mapping in $meta if the path is known
# there, otherwise from the <title> element of $contents, otherwise
# from $path. Increments $docs for every stored document.
sub dump_contents {
    my ($db,$contents,$mtime,$path,$size) = @_;

    # don't die on empty files
    defined($contents) || return;

    if ($exclude && $path =~ m/$exclude/i) {
        print STDERR "skip: $path\n" if ($verbose);
        return;
    }

    use bytes;
    $size = length $contents if (! $size);

    print STDERR " [$size]" if ($verbose);

    # create a document object
    my $doc = Search::Estraier::Document->new;

    my $title = $meta->{path2title}->{lc($path)};

    if ( defined($title) ) {
        warn " $title\n";
    } else {

        # undef if there is no <title> element
        ($title) = $contents =~ m#<title>(.+?)</title>#is;

        # chop long titles to 100 chars
        if ($title && length($title) > 100) {
            $title = substr($title, 0, 100) . '...';
        }

        # use path if no title is found
        $title = $path unless ($title);

    }

    # add attributes to the document object
    my @attrs = (
        [ '@uri',   "file:///$path" ],
        [ '@title', $iconv->convert($title) ],
        [ '@size',  $size ],
        [ '@mtime', $mtime ],
    );
    $doc->add_attr( @$_ ) foreach (@attrs);

    if ($contents) {
        # html2text: drop scripts, then all tags, then squeeze whitespace
        $contents =~ s#<script.*?</script>##gis;
        $contents =~ s#<[^>]+/*>##gs;
        $contents =~ s#\s\s+# #gs;

        $doc->add_text($iconv->convert($contents));
    }

    # store path and boost title
    $doc->add_hidden_text($path);
    $doc->add_hidden_text($title);

    print $doc->dump_draft if ($debug);

    # register the document object to the database
    $db->put_doc($doc);

    $docs++;

}
262    
# Run an external converter and index its output one page at a time.
#
# $path    - file which is being converted; used for document URIs
#            and for the title when the converter doesn't supply one
# $mtime   - modification time stored with every generated document
# $command - shell command which writes converted text to its
#            standard output
#
# The output is split on form feeds. An almost empty placeholder
# document is stored for $path itself, followed by one document per
# non-blank page under "$path#<page number>". Nothing is stored if the
# converter produced no output. Dies if the command can't be started.
sub filter_to_pages {
    my ($path, $mtime, $command) = @_;

    print STDERR "$path {converting}" if ($verbose);

    open(my $pipe, '-|', $command) || die "can't open $command with '$path': $!";
    my $html;
    while (my $line = <$pipe>) {
        $html .= $line;
    }
    # close on a pipe fails when the command exits with an error; report
    # it, but still index whatever output was produced
    close($pipe)
        || warn "filter for '$path' failed: ", ($! ? $! : "exit status $?"), "\n";

    return if (! $html);

    my $file_only = $path;
    $file_only =~ s/^.*\/([^\/]+)$/$1/g;

    # Wrapper used when the output isn't <html>...<pre>...</pre>...
    # It has to interpolate $path (single quotes used to leave a literal
    # '$path' in titles); the page marker is added to the title below.
    my ($pre_html,$pages,$post_html) = (
        "<html><head><title>$path</title></head><body><pre>",
        $html,
        '</pre></body></html>',
    );

    ($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);

    # put a ##page_nr## marker into the title
    if ($collection) {
        $pre_html =~ s/<title>(.+?)<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;
    } else {
        $pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si ||
        $pre_html =~ s/<title><\/title>/<title>$file_only :: page ##page_nr##<\/title>/si;
    }

    # save empty entry as a placeholder
    dump_contents($db, ' ', $mtime, "$path");

    my $page_nr = 1;
    foreach my $page (split(/\f/s,$pages)) {
        print STDERR " $page_nr" if ($verbose);
        my $pre_tmp = $pre_html;
        # the marker already sits inside <title>...</title>, so only the
        # number is inserted (no additional closing tag)
        $pre_tmp =~ s/##page_nr##/$page_nr/s;
        dump_contents($db, $pre_tmp . $page . $post_html, $mtime, "$path#$page_nr") if ($page !~ m/^\s*$/s);
        $page_nr++;
    }

    return;
}
306    
# File::Find callback: decide what to do with one path and index it.
#
# Takes the path from $_ (find is called with no_chdir, so this is the
# full path). Depending on the detected type the file is
#   - converted with pdftotext / pstotext and indexed page by page,
#   - skipped (images with --skip-images, missing filter),
#   - stored as an empty placeholder (other non-text files with --all),
#   - or read and indexed as text / HTML.
# Files whose modification time matches the one stored in the node are
# skipped unless --force is used.
sub file {

    my $path = $_;
    my $contents;

    # symlinks (unless --force), subversion metadata, backups and gifs
    return if (
        (! $force && -l $path)
        || $path =~ m#/\.svn#
        || $path =~ m/(?:~|\.bak|\.gif)$/
    );
    return if (! $all && -d $path);

    my $mtime = (stat($path))[9] || -1;

    # Modification time recorded in the node for this file. The lookup
    # dies for unknown documents, hence the eval. Text files are stored
    # with $path_add appended to the path, so try that form as well.
    my $mtime_db;
    if (! $force) {
        my @uris = ("file:///$path");
        push @uris, "file:///$path $path_add" if ($path_add);
        foreach my $uri (@uris) {
            $mtime_db = eval { $db->get_doc_attr_by_uri($uri, '@mtime') };
            last if ($mtime_db);
        }
    }
    $mtime_db ||= -2;

    if ($mtime == $mtime_db) {
        print STDERR "# same: $path $mtime\n" if ($verbose);
        return unless($force);
    } else {
        print STDERR "# changed: $path $mtime != $mtime_db\n" if ($debug);
    }

    # skip files on which File::MMagic::XS croaks
    if ($path =~ m#\.au$#) {
        warn "skipped '$path' to prevent File::MMagic::XS bug\n" if ($debug);
        return;
    }

    my $type = $mm->checktype_filename($path);
    $type = '' unless (defined($type));
    $type =~ s/\s+/ /gs;

    print STDERR "# $path $type\n" if ($debug);

    # path as a single shell word for filter commands: single quotes
    # protect everything except a single quote itself
    (my $quoted_path = $path) =~ s/'/'\\''/g;
    $quoted_path = "'$quoted_path'";

    if ($type =~ m/pdf/i) {
        if ($filter->{pdftotext}) {
            filter_to_pages($path, $mtime, qq( $filter->{pdftotext} -htmlmeta $quoted_path - ));
        } else {
            warn "skipping '$path', no pdftotext filter\n" if ($verbose);
            return;
        }
    } elsif ($type eq 'application/postscript') {
        if ($filter->{pstotext}) {
            filter_to_pages($path, $mtime, qq( $filter->{pstotext} $quoted_path ));
        } else {
            warn "skipping '$path', no pstotext filter\n" if ($verbose);
            return;
        }
    } elsif ($type =~ m!^image/! && $skip_images) {
        warn "skipping image '$path'\n" if ($verbose);
        return; # don't index images
    } else {

#       return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);
        # plain files which are neither HTML nor have a known text
        # extension get (at most) a placeholder without content
        if (-f $path &&
            $type !~ m/html/ &&
            $path !~ m/\.(php|pl|txt|info|log|text)$/i
        ) {
            dump_contents($db, '', $mtime, $path, -s $path) if ($all);
            return;
        }

        # skip index files
        return if ($path =~ m/index_(?:[a-z]+|symbol)\.html*/i);

        # three-argument open, so that the file name is never taken as
        # a mode or a command
        open(my $fh, '<', $path) || die "can't open file: $path";
        print STDERR "$path ($type)" if ($verbose);
        # lexical line variable: $_ belongs to File::Find here
        while (my $line = <$fh>) {
            $contents .= $line;
        }
        close($fh);
        $contents .= "\n\n";

        #$contents = filter($contents,$collection);

        # add optional components to path
        $path .= " $path_add" if ($path_add);

        dump_contents($db, $contents, $mtime, $path);
    }

    print STDERR "\n" if ($verbose);
#   die "zero size content in '$path'" if (! $contents);

}
388    

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26