/[Search-Estraier]/trunk/scripts/est-spider
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/scripts/est-spider

Parent Directory | Revision Log


Revision 178 - (hide annotations)
Tue Aug 15 16:38:06 2006 UTC (17 years, 7 months ago) by dpavlin
File size: 6292 byte(s)
sync master at end of indexing
1 dpavlin 64 #!/usr/bin/perl -w
2     use strict;
3     use File::Find;
4     use Getopt::Long;
5     use File::Which;
6     use Search::Estraier;
7     use Text::Iconv;
8     #use File::MMagic;
9     use File::MMagic::XS qw/:compat/;
10 dpavlin 118 use Time::HiRes qw/time/;
11 dpavlin 64
# ---- command-line options ---------------------------------------------
# These file-level lexicals are shared with the subs defined further down.
my $collection;	# name which will be inserted
my $path_add;	# add additional info in path
my $verbose;
my $exclude;

#$verbose = 1;
my $debug = 0;
my $force = 0;
my $all = 0;

my $usage = <<"_END_OF_USAGE_";
usage: $0 http://localhost:1978/node/my_dir /path/to/directory

options:
	--collection="name of collection"
	--path=/path/to/add/at/end
	--exclude=regex_to_exclude
	--verbose
	--force
	--debug
	--all	save placeholders for all files
_END_OF_USAGE_

my $result = GetOptions(
	"collection=s" => \$collection,
	"path=s" => \$path_add,
	"verbose!" => \$verbose,
	"debug!" => \$debug,
	"exclude=s" => \$exclude,
	"force!" => \$force,
	"all!" => \$all,
);

# GetOptions has already warned about the offending option; stop here
# instead of crawling with a half-parsed command line.
die $usage unless ($result);

my ($node_url,$dir) = @ARGV;

die $usage if (! $node_url || ! $dir);

if (! -e $dir) {
	warn "directory $dir doesn't exist, skipping\n";
	exit 1;
}

#my $basedir = $0;
#$basedir =~ s,/[^/]+$,/,;
#require "$basedir/filter.pm";

# number of documents stored so far (incremented by dump_contents)
my $docs = 0;
my $start_t = time();

# external converters found in PATH, keyed by program name
my $filter = {};
foreach my $f (qw/pdftotext pstotext/) {
	my $w = which($f);
	# test the lookup result, not the name: a missing program must not
	# be registered (or announced) as an available filter
	if ($w) {
		$filter->{$f} = $w;
		print STDERR "using $f filter at $w\n" if ($verbose);
	}
}

#my $mm = new File::MMagic('/usr/share/misc/file/magic');
my $mm = File::MMagic::XS->new();

my $iconv = Text::Iconv->new('iso-8859-2', 'utf-8');

# unbuffered output on both streams so progress shows up immediately
select(STDERR); $|=1;
select(STDOUT); $|=1;

my $db = Search::Estraier::Node->new(
	url => $node_url,
	user => 'admin',
	passwd => 'admin',
	croak_on_error => 1,
);

# walk the tree; file() is called for every entry with its full path in $_
find({ wanted => \&file,
	follow => 1,
	follow_skip => 2,
	no_chdir => 1,
}, $dir);

# "|| 1" avoids a division by zero when the run took no measurable time
my $dur = (time() - $start_t) || 1;
printf STDERR "%d documents in %.2fs [%.2f docs/s]\n", $docs, $dur, ($docs / $dur);

# sync master at end of indexing
$db->master(
	action => 'sync'
);


exit;
98    
# Build a Search::Estraier::Document from (HTML-ish) contents and store it.
#
# Arguments:
#   $db       - object on which put_doc() is called with the new document
#   $contents - raw text/HTML; when undef nothing is stored
#   $mtime    - value stored in the @mtime attribute
#   $path     - used for @uri (file:///$path), as fallback title and as
#               hidden text; also matched against --exclude
#   $size     - optional; when false the byte length of $contents is used
#
# Reads the file-level $exclude, $verbose, $debug and $iconv, and
# increments the file-level $docs counter for every stored document.
sub dump_contents {
	my ($db,$contents,$mtime,$path,$size) = @_;

	return unless (defined($contents)); # don't die on empty files

	if ($exclude && $path =~ m/$exclude/i) {
		print STDERR "skip: $path\n" if ($verbose);
		return;
	}

	if (! $size) {
		# byte semantics are only needed for this one length()
		use bytes;
		$size = length $contents;
	}

	print STDERR " [$size]" if ($verbose);

	# create a document object
	my $doc = Search::Estraier::Document->new;

	# declare first, assign conditionally: "my $x = ... if ..." has
	# undefined behaviour according to perlsyn
	my $title;
	if ($contents =~ m#<title>(.+?)</title>#is) {
		$title = $1;
	}

	# chop long titles to 100 chars
	$title = substr($title, 0, 100) . '...' if (defined($title) && length($title) > 100);
	# use path if no title is found
	$title = $path unless (defined($title) && length($title));

	# add attributes to the document object
	$doc->add_attr('@uri', "file:///$path");
	$doc->add_attr('@title', $iconv->convert($title));
	$doc->add_attr('@size', $size);
	$doc->add_attr('@mtime', $mtime);

	if ($contents) {
		# html2text: drop tags, then collapse runs of whitespace
		$contents =~ s#<[^>]+/*>##gs;
		$contents =~ s#\s\s+# #gs;

		$doc->add_text($iconv->convert($contents));
	}
	# store path
	$doc->add_hidden_text($path);
	# boost title
	$doc->add_hidden_text($title);

	print $doc->dump_draft if ($debug);

	# register the document object to the database
	$db->put_doc($doc);

	$docs++;

}
152    
# Run an external converter and store its output one document per page.
#
# Arguments:
#   $path    - path of the file being converted (used for URIs and titles)
#   $mtime   - modification time passed on to dump_contents
#   $command - complete command line whose standard output is read
#
# Output is split on form feeds. An empty placeholder document is stored
# under $path itself, and every non-blank page is stored as "$path#N".
# Uses the file-level $db, $collection and $verbose. Returns nothing.
sub filter_to_pages {
	my ($path, $mtime, $command) = @_;

	print STDERR "$path {converting}" if ($verbose);

	# NOTE(review): $command is a shell command line assembled by the
	# caller with the file name embedded in double quotes; a file name
	# containing quotes or shell metacharacters can break out of it.
	# Passing program and arguments as a list would need an interface change.
	open(my $pipe, '-|', $command) || die "can't open $command with '$path': $!";
	my $html = do { local $/; <$pipe> };
	# close() on a pipe is false when the command exited with an error
	if (! close($pipe)) {
		warn "filter failed for '$path' (status $?)\n" if ($verbose);
	}

	return if (! $html);

	my $file_only = $path;
	$file_only =~ s/^.*\/([^\/]+)$/$1/g;

	# Wrapper used when the filter output is plain text. The title is
	# interpolated here and holds the bare path only; the page suffix is
	# added once by the substitutions below.
	my ($pre_html,$pages,$post_html) = ("<html><head><title>$path</title></head><body><pre>",$html,'</pre></body></html>');

	# filter produced HTML itself: reuse its header and footer
	($pre_html,$pages,$post_html) = ($1,$2,$3) if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);

	if ($collection) {
		$pre_html =~ s/<title>(.+?)<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;
	} else {
		$pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si ||
		$pre_html =~ s/<title><\/title>/<title>$file_only :: page ##page_nr##<\/title>/si;
	}

	# save empty entry as a placeholder
	dump_contents($db, ' ', $mtime, "$path");

	my $page_nr = 1;
	foreach my $page (split(/\f/s,$pages)) {
		print STDERR " $page_nr" if ($verbose);
		my $pre_tmp = $pre_html;
		# only the number is inserted; the template already closes <title>
		$pre_tmp =~ s/##page_nr##/$page_nr/s;
		dump_contents($db, $pre_tmp . $page . $post_html, $mtime, "$path#$page_nr") if ($page !~ m/^\s*$/s);
		$page_nr++;
	}

}
196    
# File::Find "wanted" callback: decide what to do with one directory entry.
#
# The entry's path is taken from $_. Symlinks (unless --force), anything
# under /.svn and backup/gif files are skipped, as are entries whose
# mtime matches the one already stored (unless --force). PDF and
# PostScript files go through filter_to_pages, HTML and a fixed list of
# text extensions are read and stored directly, and every other regular
# file is stored as an empty placeholder when --all is given.
# Uses the file-level $db, $mm, $filter, $force, $all, $verbose, $debug
# and $path_add.
sub file {

	my $path = $_;
	my $contents;

	# dots are escaped so that only real ".svn", ".bak" and ".gif" match
	return if ((! $force && -l $path) || $path =~ m#/\.svn# || $path =~ m/(?:~|\.bak|\.gif)$/);
	return if (! $all && -d $path);

	my $mtime = (stat($path))[9] || -1;
	# lookup errors are swallowed on purpose: an unknown document simply
	# counts as changed (-2 can never equal a real mtime or -1)
	my $mtime_db = eval { $db->get_doc_attr_by_uri("file:///$path", '@mtime') };
	$mtime_db ||= -2;

	if ($mtime == $mtime_db) {
		print STDERR "# same: $path $mtime\n" if ($verbose);
		return unless($force);
	} else {
		print STDERR "# changed: $path $mtime != $mtime_db\n" if ($debug);
	}

	# skip files on which File::MMagic::XS croaks
	if ($path =~ m#\.au$#) {
		warn "skipped '$path' to prevent File::MMagic::XS bug\n" if ($debug);
		return;
	}

	my $type = $mm->checktype_filename($path);
	# NOTE(review): guard in case no type could be determined - confirm
	# whether checktype_filename can actually return undef
	$type = '' unless (defined($type));
	$type =~ s/\s+/ /gs;

	print STDERR "# $path $type\n" if ($debug);

	if ($type =~ m/pdf/i) {
		if ($filter->{pdftotext}) {
			filter_to_pages($path, $mtime, qq( $filter->{pdftotext} -htmlmeta "$path" - ));
		} else {
			warn "skipping '$path', no pdftotext filter\n" if ($verbose);
			return;
		}
	} elsif ($type eq 'application/postscript') {
		if ($filter->{pstotext}) {
			filter_to_pages($path, $mtime, qq( $filter->{pstotext} "$path" ));
		} else {
			warn "skipping '$path', no pstotext filter\n" if ($verbose);
			return;
		}
	} else {

#		return if (! -f $path || ! m/\.(html*|php|pl|txt|info|log|text)$/i);
		if (-f $path &&
			$type !~ m/html/ &&
			$path !~ m/\.(?:php|pl|txt|info|log|text)$/i
		) {
			# not indexable as text: keep only a placeholder with its size
			dump_contents($db, '', $mtime, $path, -s $path) if ($all);
			return;
		}

		# skip index files
		return if ($path =~ m/index_(?:[a-z]+|symbol)\.html*/i);

		# three-argument open: a crawled file name must never be
		# interpreted as an open mode or a command
		open(my $fh, '<', $path) || die "can't open file: $path: $!";
		print STDERR "$path ($type)" if ($verbose);
		while (my $line = <$fh>) {
			$contents .= $line;
		}
		close($fh);
		$contents .= "\n\n";

		#$contents = filter($contents,$collection);

		# add optional components to path
		$path .= " $path_add" if ($path_add);

		dump_contents($db, $contents, $mtime, $path);
	}

	print STDERR "\n" if ($verbose);
#	die "zero size content in '$path'" if (! $contents);

}
274    

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26