/[hyperestraier_wrappers]/trunk/perl/scripts/est-spider
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/perl/scripts/est-spider

Parent Directory | Revision Log


Revision 30 - (show annotations)
Sun Sep 18 18:21:06 2005 UTC (18 years, 7 months ago) by dpavlin
File size: 5437 byte(s)
split pdf into pages (using #page in uri)
1 #!/usr/bin/perl -w
2 use strict;
3 use File::Find;
4 use Getopt::Long;
5 use File::Which;
6 use HyperEstraier;
7 use Text::Iconv;
8 #use File::MMagic;
9 use File::MMagic::XS qw/:compat/;
10
# est-spider: walk a directory tree and index HTML, text and PDF files
# into Hyper Estraier, either through a node server (--node url) or into
# the local database /tmp/casket.

# do we use Node API?
my $node_url;

my $collection;     # name which will be inserted
my $path_add;       # add additional info in path
my $verbose;
my $exclude;

#$verbose = 1;
my $debug = 0;
my $force = 0;

my $usage = "usage: $0 [dir]";

my $result = GetOptions(
    "collection=s" => \$collection,
    "path=s"       => \$path_add,
    "verbose!"     => \$verbose,
    "debug!"       => \$debug,
    "exclude=s"    => \$exclude,
    "node=s"       => \$node_url,
    "force!"       => \$force,
);

# GetOptions has already warned about the offending option; refuse to run
# with a half-parsed command line instead of silently using defaults.
die $usage unless ($result);

my $dir = shift @ARGV || die $usage;

#my $basedir = $0;
#$basedir =~ s,/[^/]+$,/,;
#require "$basedir/filter.pm";

# undef when pdftotext is not installed; PDF files are then not converted
my $pdftotext = which('pdftotext');

#my $mm = new File::MMagic('/usr/share/misc/file/magic');
my $mm = File::MMagic::XS->new();

# file contents are assumed to be iso-8859-2 and are indexed as utf-8
my $iconv = Text::Iconv->new('iso-8859-2', 'utf-8');

# unbuffered STDERR and STDOUT so progress output shows up immediately
select(STDERR); $|=1;
select(STDOUT); $|=1;

print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose);

my $db;
if ($node_url) {
    $db = HyperEstraier::Node->new($node_url);
    $db->set_auth('admin', 'admin');
} else {
    # open the database
    $db = HyperEstraier::Database->new();
    # NOTE(review): the result of open() is not checked -- confirm what the
    # binding returns on failure and die here if the casket can't be opened
    $db->open('/tmp/casket', $HyperEstraier::Database::DBWRITER | $HyperEstraier::Database::DBCREAT);

    # sync the local database before exiting on INT or QUIT
    sub signal {
        my ($sig) = @_;
        print "\nCaught a SIG$sig--syncing database and shutting down\n";
        $db->sync();
        exit(0);
    }

    $SIG{'INT'}  = \&signal;
    $SIG{'QUIT'} = \&signal;
}

# no_chdir: the wanted callback receives the full path in $_
find({ wanted => \&file,
    follow => 1,
    follow_skip => 2,
    no_chdir => 1,
}, $dir);

# only the local database is synced and optimized here
unless ($node_url) {
    print "--- sync\n";
    $db->sync();

    print "--- optimize...\n";
    $db->optimize(0);
}
exit;
85
# Build a HyperEstraier::Document from HTML-ish $contents and store it in
# $db under the URI "file:///$path".
#
#   $db       - handle providing put_doc() (node or local database)
#   $contents - document text; markup is stripped before indexing
#   $mtime    - value stored in the @mtime attribute
#   $path     - used for @uri, for --exclude matching and as fallback title
#
# Returns nothing. Empty contents and paths matching --exclude are skipped.
sub dump_contents($$$$) {
    my ($db, $contents, $mtime, $path) = @_;

    # don't die on empty files (a plain truth test would also drop "0")
    return unless (defined $contents && length $contents);

    if ($exclude && $path =~ m/$exclude/i) {
        print STDERR "skip: $path\n" if ($verbose);
        return;
    }

    # report size in bytes, not characters
    use bytes;
    my $size = length $contents;

    print STDERR " [$size]" if ($verbose);

    # create a document object
    my $doc = HyperEstraier::Document->new;

    # List assignment leaves $title undef when there is no <title>.
    # "my $title = $1 if (...)" must not be used here: a conditional my
    # has undefined behaviour and can keep the previous call's title.
    my ($title) = $contents =~ m#<title>(.+?)</title>#is;

    # chop long titles to 100 chars
    $title = substr($title, 0, 100) . '...' if ($title && length($title) > 100);
    # use path if no title is found
    $title ||= $path;

    # add attributes to the document object
    $doc->add_attr('@uri', "file:///$path");
    $doc->add_attr('@title', $iconv->convert($title));
    $doc->add_attr('@size', $size);
    $doc->add_attr('@mtime', $mtime);

    # html2text: drop tags, then collapse runs of whitespace
    $contents =~ s#<[^>]+/*>##gs;
    $contents =~ s#\s\s+# #gs;

    $doc->add_text($iconv->convert($contents));

    # print $doc->dump_draft if ($verbose);

    # register the document object to the database; the PDCLEAN option is
    # passed only for the local database, not for a node
    if ($node_url) {
        $db->put_doc($doc);
    } else {
        $db->put_doc($doc, $HyperEstraier::Database::PDCLEAN);
    }

    return;
}
133
# File::Find "wanted" callback (no_chdir is set, so $_ holds the full path).
#
# Decides whether the file should be indexed and hands its contents to
# dump_contents():
#   - PDF files (when pdftotext is available) are converted and indexed
#     one document per page, with "#<page number>" appended to the path
#   - HTML files, and text files with a known extension, are indexed whole
#
# Dies if pdftotext can't be started or a selected file can't be opened.
sub file {

    my $path = $_;

    # skip symlinks (unless --force), Subversion metadata and backup files
    return if (! $force && -l $path);
    return if ($path =~ m#/\.svn# || $path =~ m/(~|\.bak)$/);

    # e.g. dangling symlink followed by File::Find
    my $mtime = (stat($path))[9];
    unless (defined $mtime) {
        print STDERR "# can't stat: $path: $!\n" if ($verbose);
        return;
    }

    # NOTE(review): PDF pages are stored as "$path#N" and --path appends
    # " $path_add" to the URI, so this lookup probably never matches for
    # those documents and they are re-indexed on every run -- confirm
    my $mtime_db = $db->get_doc_attr_by_uri("file:///$path", '@mtime') || -2;

    if ($mtime == $mtime_db) {
        print STDERR "# same: $path $mtime\n" if ($verbose);
        return unless ($force);
    } else {
        print STDERR "# changed: $path $mtime != $mtime_db\n" if ($debug);
    }

    # skip files on which File::MMagic::XS croaks
    return if ($path =~ m#\.au$#);

    my $type = $mm->checktype_filename($path);
    $type = '' unless (defined $type);
    $type =~ s/\s+/ /gs;

    print STDERR "# $path $type\n" if ($debug);

    if ($pdftotext && -f $path && $type =~ m/pdf/i) {

        print STDERR "$path {converting}" if ($verbose);

        # list-form pipe open runs pdftotext without a shell, so quotes or
        # shell metacharacters in $path can't break or inject into the command
        open(my $pdf_fh, '-|', $pdftotext, '-htmlmeta', $path, '-')
            || die "can't open $pdftotext with '$path': $!";
        my $html = do { local $/; <$pdf_fh> };
        close($pdf_fh);

        return if (! $html);

        my $file_only = $path;
        $file_only =~ s/^.*\/([^\/]+)$/$1/g;

        # fallback wrapper for output which doesn't look like the expected
        # <html>...<pre>pages</pre>... document; the page suffix is added to
        # the title by the substitutions below
        my ($pre_html, $pages, $post_html) = (
            "<html><head><title>$path</title></head><body><pre>",
            $html,
            '</pre></body></html>',
        );

        ($pre_html, $pages, $post_html) = ($1, $2, $3)
            if ($html =~ m/^(<html>.+?<pre>)(.+)(<\/pre>.+?)$/si);

        if ($collection) {
            # .*? so that an empty <title></title> is replaced as well
            $pre_html =~ s/<title>.*?<\/title>/<title>$collection :: page ##page_nr##<\/title>/si;
        } else {
            $pre_html =~ s/<title>(.+?)<\/title>/<title>$1 :: page ##page_nr##<\/title>/si ||
            $pre_html =~ s/<title><\/title>/<title>$file_only :: page ##page_nr##<\/title>/si;
        }

        # pages are separated by form feeds; index each as its own document
        my $page_nr = 1;
        foreach my $page (split(/\f/, $pages)) {
            print STDERR " $page_nr" if ($verbose);
            my $pre_tmp = $pre_html;
            # the template already contains the closing </title>
            $pre_tmp =~ s/##page_nr##/$page_nr/;
            dump_contents($db, $pre_tmp . $page . $post_html, $mtime, "$path#$page_nr") if ($page !~ m/^\s*$/s);
            $page_nr++;
        }

    } else {

        # only plain files: html by type, or text with a known extension
        return unless (-f $path && (
            $type =~ m/html/ ||
            ($type =~ m#text# && $path =~ m/\.(php|pl|txt|info|log|text)$/i)
        ));

        # skip index files
        return if ($path =~ m/index_[a-z]\.html*/i || $path =~ m/index_symbol\.html*/i);

        # 3-arg open with a lexical handle: the file name is never parsed
        # for a mode and File::Find's $_ is left alone
        open(my $fh, '<', $path) || die "can't open file: $path: $!";
        print STDERR "$path ($type)" if ($verbose);
        my $contents = do { local $/; <$fh> };
        close($fh);
        $contents = '' unless (defined $contents);
        $contents .= "\n\n";

        #$contents = filter($contents,$collection);

        # add optional components to path
        $path .= " $path_add" if ($path_add);

        dump_contents($db, $contents, $mtime, $path);
    }

    print STDERR "\n" if ($verbose);
    # die "zero size content in '$path'" if (! $contents);

    return;
}
227

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26