/[Search-Estraier]/trunk/scripts/est-spider
This is a repository of my old source code, which is no longer updated. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/scripts/est-spider

Parent Directory | Revision Log


Revision 120 - (hide annotations)
Mon Apr 17 10:34:14 2006 UTC (18 years ago) by dpavlin
File size: 6223 byte(s)
dump_draft is now triggered by --debug not --verbose
1 dpavlin 64 #!/usr/bin/perl -w
2     use strict;
3     use File::Find;
4     use Getopt::Long;
5     use File::Which;
6     use Search::Estraier;
7     use Text::Iconv;
8     #use File::MMagic;
9     use File::MMagic::XS qw/:compat/;
10 dpavlin 118 use Time::HiRes qw/time/;
11 dpavlin 64
# Command-line configuration, filled in by GetOptions below.
my $collection;	# name which will be inserted
my $path_add;	# add additional info in path
my $verbose;	# progress messages on STDERR
my $exclude;	# regex (matched case-insensitively) of paths which are not indexed

#$verbose = 1;
my $debug = 0;	# extra diagnostics, including a dump of every document draft
my $force = 0;	# re-index unchanged files and don't skip symlinks
my $all = 0;	# save placeholders for files which can't be indexed

my $result = GetOptions(
	"collection=s"	=> \$collection,
	"path=s"	=> \$path_add,
	"verbose!"	=> \$verbose,
	"debug!"	=> \$debug,
	"exclude=s"	=> \$exclude,
	"force!"	=> \$force,
	"all!"		=> \$all,
);

my ($node_url,$dir) = @ARGV;

# GetOptions returns false when it met an unknown or malformed option;
# show usage in that case instead of silently crawling with defaults.
die <<"_END_OF_USAGE_" if (! $result || ! $node_url || ! $dir);
usage: $0 http://localhost:1978/node/my_dir /path/to/directory

options:
	--collection="name of collection"
	--path=/path/to/add/at/end
	--exclude=regex_to_exclude
	--verbose
	--force
	--debug
	--all	save placeholders for all files
_END_OF_USAGE_

if (! -e $dir) {
	warn "directory $dir doesn't exist, skipping\n";
	exit 1;
}
51    
52     #my $basedir = $0;
53     #$basedir =~ s,/[^/]+$,/,;
54     #require "$basedir/filter.pm";
55    
# number of documents stored so far (incremented in dump_contents)
my $docs = 0;
# start of the run, used for the docs/s statistics at the end
my $start_t = time();

# External converters found in PATH: filter name => full path of executable.
# Only filters which are really installed get an entry, so the rest of the
# script can test $filter->{name} to see if a converter is available.
my $filter = {};
foreach my $f (qw/pdftotext pstotext/) {
	my $w = which($f);
	# test result of which(), not the (always true) filter name
	next unless ($w);
	$filter->{$f} = $w;
	print STDERR "using $f filter at $w\n" if ($verbose);
}
67 dpavlin 64
68     #my $mm = new File::MMagic('/usr/share/misc/file/magic');
# mime type detection for files found by the spider
my $mm = File::MMagic::XS->new();

# all text is converted from iso-8859-2 to utf-8 before it's sent to the node
my $iconv = Text::Iconv->new('iso-8859-2', 'utf-8')
	or die "can't create iso-8859-2 to utf-8 converter: $!";

# unbuffered output, so progress messages show up immediately
select(STDERR); $|=1;
select(STDOUT); $|=1;

# connection to the node which receives documents
my $db = Search::Estraier::Node->new(
	url => $node_url,
	user => 'admin',
	passwd => 'admin',
	croak_on_error => 1,
);

# walk the directory tree; with no_chdir, $_ in file() is the full path
find({ wanted => \&file,
	follow => 1,
	follow_skip => 2,
	no_chdir => 1,
}, $dir);

# fall back to 1 so that a zero duration doesn't divide by zero below
my $dur = (time() - $start_t) || 1;
printf STDERR "%d documents in %.2fs [%.2f docs/s]\n", $docs, $dur, ($docs / $dur);

exit;
93    
# dump_contents($db, $contents, $mtime, $path, $size)
#
# Create one document from $contents and store it in node $db.
#
#   $db       - node object on which put_doc is called
#   $contents - text or HTML of the document; undef skips the document,
#               empty string stores only attributes and hidden text
#   $mtime    - value for the @mtime attribute
#   $path     - used for the @uri attribute and as a fallback title
#   $size     - optional; when false, length of $contents in bytes is used
#
# Paths matching --exclude are skipped. Increments global $docs for every
# document stored.
sub dump_contents {
	my ($db,$contents,$mtime,$path,$size) = @_;

	return unless (defined($contents)); # don't die on empty files

	if ($exclude && $path =~ m/$exclude/i) {
		print STDERR "skip: $path\n" if ($verbose);
		return;
	}

	# byte semantics from here on, so size is in bytes and not in characters
	use bytes;
	if (! $size) {
		$size = length $contents;
	}

	print STDERR " [$size]" if ($verbose);

	# create a document object
	my $doc = Search::Estraier::Document->new;

	# Declare first and assign later: with "my $title = $1 if (...)" the
	# variable can keep its value from a previous call when the match
	# fails, which gives a document the title of some other file.
	my $title;
	if ($contents =~ m#<title>(.+?)</title>#is) {
		$title = $1;
	}

	# chop long titles to 100 chars
	$title = substr($title, 0, 100) . '...' if ($title && length($title) > 100);
	# use path if no title is found
	$title ||= $path;

	# add attributes to the document object
	$doc->add_attr('@uri', "file:///$path");
	$doc->add_attr('@title', $iconv->convert($title));
	$doc->add_attr('@size', $size);
	$doc->add_attr('@mtime', $mtime);

	if ($contents) {
		# html2text
		$contents =~ s#<[^>]+/*>##gs;
		$contents =~ s#\s\s+# #gs;

		$doc->add_text($iconv->convert($contents));
	}
	# store path
	$doc->add_hidden_text($path);
	# boost title
	$doc->add_hidden_text($title);

	print $doc->dump_draft if ($debug);

	# register the document object to the database
	$db->put_doc($doc);

	$docs++;

	return;
}
147    
# filter_to_pages($path, $mtime, $command)
#
# Run external $command, read everything it prints and store the output
# page by page (pages are separated by form feed).
#
#   $path    - path of the source file; page documents get "$path#page_nr"
#   $mtime   - modification time stored with every document
#   $command - command line of the converter which writes to STDOUT
#
# First an almost empty placeholder is stored under $path itself, then one
# document for each page which isn't only whitespace. Every page title ends
# with " :: page N". Dies if the command can't be started; returns without
# storing anything if the command produced no output.
sub filter_to_pages {
	my ($path, $mtime, $command) = @_;

	print STDERR "$path {converting}" if ($verbose);

	open(my $pipe, '-|', $command) || die "can't open $command with '$path': $!";
	my $html = do { local $/; <$pipe> };
	# failed filter is not fatal, whatever it produced is still indexed
	if (! close($pipe) && $debug) {
		warn "filter for '$path' exited with status $?\n";
	}

	return if (! $html);

	my $file_only = $path;
	$file_only =~ s/^.*\/([^\/]+)$/$1/g;

	# Default wrapper for filters which produce plain text. The title is
	# just the path here (interpolated, so double quotes are needed);
	# the page marker is appended for all cases below.
	my ($pre_html,$pages,$post_html) = (
		"<html><head><title>$path</title></head><body><pre>",
		$html,
		'</pre></body></html>',
	);

	# if the filter produced HTML, take the pages from its <pre> element
	($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);

	if ($collection) {
		$pre_html =~ s/<title>(.+?)<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;
	} else {
		# keep existing title, or use file name if title is empty
		$pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si ||
		$pre_html =~ s/<title><\/title>/<title>$file_only :: page ##page_nr##<\/title>/si;
	}

	# save empty entry as a placeholder
	dump_contents($db, ' ', $mtime, "$path");

	my $page_nr = 1;
	foreach my $page (split(/\f/s,$pages)) {
		print STDERR " $page_nr" if ($verbose);
		my $pre_tmp = $pre_html;
		# closing </title> is already in $pre_html, insert only the number
		$pre_tmp =~ s/##page_nr##/$page_nr/s;
		# blank pages are not stored, but they still count in numbering
		dump_contents($db, $pre_tmp . $page . $post_html, $mtime, "$path#$page_nr") if ($page =~ m/\S/);
		$page_nr++;
	}

	return;
}
191    
# file()
#
# File::Find callback, called for every entry below the start directory
# with the full path in $_ (find is run with no_chdir).
#
# Skips symlinks (unless --force), Subversion directories, backup files
# and gif images, and files whose mtime is the same as the one stored in
# the node (unless --force). PDF and PostScript files go through external
# filters; HTML and files with known text extensions are read and stored
# as they are. For other files only a placeholder is stored, and only
# with --all.
sub file {

	my $path = $_;
	my $contents;

	# dots are escaped so that only real .svn, .bak and .gif names match
	return if (! $force && -l $path || $path =~ m#/\.svn# || $path =~ m/(?:~|\.bak|\.gif)$/);

	my $mtime = (stat($path))[9] || -1;
	# eval because the node croaks on errors; any failure means "not found"
	# NOTE(review): with --path the document is stored under "$path $path_add"
	# (see below), so this lookup looks like it never matches - confirm
	my $mtime_db = eval { $db->get_doc_attr_by_uri("file:///$path", '@mtime') };
	$mtime_db ||= -2;

	if ($mtime == $mtime_db) {
		print STDERR "# same: $path $mtime\n" if ($verbose);
		return unless($force);
	} else {
		print STDERR "# changed: $path $mtime != $mtime_db\n" if ($debug);
	}

	# skip files on which File::MMagic::XS croaks
	if ($path =~ m#\.au$#) {
		warn "skipped '$path' to prevent File::MMagic::XS bug\n" if ($debug);
		return;
	}

	my $type = $mm->checktype_filename($path);
	$type = '' unless (defined($type));
	$type =~ s/\s+/ /gs;

	print STDERR "# $path $type\n" if ($debug);

	# Filter commands are run by the shell, so put the path in single
	# quotes and escape single quotes inside it. Otherwise file names
	# with ", $ or ` would break the command or run something else.
	(my $shell_path = $path) =~ s/'/'\\''/g;
	$shell_path = "'$shell_path'";

	if ($type =~ m/pdf/i) {
		if ($filter->{pdftotext}) {
			filter_to_pages($path, $mtime, qq( $filter->{pdftotext} -htmlmeta $shell_path - ));
		} else {
			warn "skipping '$path', no pdftotext filter\n" if ($verbose);
			return;
		}
	} elsif ($type eq 'application/postscript') {
		if ($filter->{pstotext}) {
			filter_to_pages($path, $mtime, qq( $filter->{pstotext} $shell_path ));
		} else {
			warn "skipping '$path', no pstotext filter\n" if ($verbose);
			return;
		}
	} else {

#		return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);
		# NOTE(review): directories are not -f, so they are not stopped
		# here and get indexed below like a text file - confirm intended
		if (-f $path &&
			$type !~ m/html/ &&
			$path !~ m/\.(php|pl|txt|info|log|text)$/i
		) {
			dump_contents($db, '', $mtime, $path, -s $path) if ($all);
			return;
		}

		# skip index files
		return if ($path =~ m/index_(?:[a-z]+|symbol)\.html*/i);

		# lexical handle and 3-arg open: file name can't be taken as
		# open mode, and $_ of File::Find is left alone
		open(my $fh, '<', $path) || die "can't open file: $path: $!";
		print STDERR "$path ($type)" if ($verbose);
		my $data = do { local $/; <$fh> };
		close($fh);
		$contents = defined($data) ? $data : '';
		$contents .= "\n\n";

		#$contents = filter($contents,$collection);

		# add optional components to path
		$path .= " $path_add" if ($path_add);

		dump_contents($db, $contents, $mtime, $path);
	}

	print STDERR "\n" if ($verbose);
#	die "zero size content in '$path'" if (! $contents);

	return;
}
268    

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26