/[wait]/trunk/script/index_html
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /trunk/script/index_html

Parent Directory Parent Directory | Revision Log Revision Log


Revision 118 - (show annotations)
Fri Jul 15 18:59:10 2005 UTC (18 years, 9 months ago) by dpavlin
File size: 4964 byte(s)
some rather old changes from 2004-05-28

1 #!/usr/bin/perl -w
2
3 use strict;
4
5 use Getopt::Long;
6 use Digest::MD5 qw(md5_hex);
7 use Data::Dumper;
8
9 require WAIT::Database;
10 require WAIT::Config;
11 require WAIT::Parse::HTML;
12 require WAIT::Document::Find;
13
14 use utf8;
15
16 use lib '/data/wait/lib';
17
18 my %OPT = (database => 'DB',
19 dir => $WAIT::Config->{WAIT_home} || '/tmp',
20 table => 'html',
21 clean => 0,
22 remove => 0,
23 force => 0,
24 );
25
26 GetOptions(\%OPT,
27 'database=s',
28 'dir=s',
29 'table=s',
30 'clean!',
31 'remove!',
32 'force!',
33 );
34
35 my $path = shift @ARGV ||
36 die "Usage: $0 [-database=$OPT{database}] [-dir=$OPT{dir}] [-table=$OPT{table}]\n\t[-clean] [-remove] [-force] directory_with_htmls\n";
37
38 if ($OPT{clean}) {
39 my $db = WAIT::Database->open(
40 name => $OPT{database},
41 'directory' => $OPT{dir},
42 )
43 or die "Could not open database '$OPT{dir}/$OPT{database}': $@";
44 $db->drop_table(name => $OPT{table}) or
45 die "Could not drop table '$OPT{tabel}': $@";
46
47 $db->close;
48 }
49
50 my $db = WAIT::Database->open(
51 name => $OPT{database},
52 'directory' => $OPT{dir},
53 )
54 || WAIT::Database->create(
55 name => $OPT{database},
56 'directory' => $OPT{dir},
57 )
58 or die "Could not open/create database '$OPT{dir}/$OPT{database}': $@";
59
60 my $layout= new WAIT::Parse::HTML;
61 my $stem = [{
62 'prefix' => ['isotr', 'isolc'],
63 'intervall' => ['isotr', 'isolc'],
64 },'decode_entities', 'isotr', 'isolc', 'split2', 'stop', 'Stem'];
65 my $text = [{
66 'prefix' => ['isotr', 'isolc'],
67 'intervall' => ['isotr', 'isolc'],
68 },
69 'decode_entities', 'isotr', 'isolc', 'split2', 'stop'];
70 my $sound = ['decode_entities', 'isotr', 'isolc', 'split2', 'Soundex'];
71
72 my %D;
73
74 my $access = tie (%D, 'WAIT::Document::Find', sub { $_[0] =~ /\.htm/; },
75 $path);
76 die $@ unless defined $access;
77
78
79 my $tb = $db->table(name => $OPT{table}) ||
80 $db->create_table
81 (name => $OPT{table},
82 attr => ['docid', 'headline', 'size', 'md5'],
83 keyset => [['docid', 'md5']],
84 layout => $layout,
85 access => $access,
86 invindex =>
87 [
88 'text' => $stem,
89 'title' => $stem,
90 'title' => $text,
91 ]
92 );
93 die unless $tb;
94
95 while (my ($path, $content) = each %D) {
96 &index($path, $content);
97 }
98 $db->close();
99 exit;
100
101 my $NO;
102 sub index {
103 my ($did, $value) = @_;
104 if ($tb->have('docid' => $did)) {
105 if (!$OPT{remove} && !$OPT{force}) {
106 print "duplicate\n";
107 return;
108 }
109 } elsif ($OPT{remove}) {
110 print "missing\n";
111 return;
112 }
113
114 if (-s $did < 100) {
115 print "too small\n";
116 return;
117 }
118
119 unless (defined $value) {
120 print "unavailable\n";
121 return;
122 }
123 printf STDERR "ok [%d]\n", ++$NO;
124
125 my $record = $layout->split($value);
126 $record->{size} = length($value);
127 my $headline = $record->{title} || $did;
128 $headline =~ s/\s+/ /g; $headline =~ s/^\s+//;
129 printf "%s\n", substr($headline,0,80);
130 my $t = \$record->{text};
131 if ($$t) {
132 my $md5;
133 if (utf8::is_utf8($$t)) {
134 $md5 = Dumper($$t)
135 } else {
136 $md5 = md5_hex($$t);
137 }
138 if ($tb->have('md5' => $md5)) {
139 print "duplicate md5\n";
140 return;
141 }
142 $record->{md5} = $md5;
143 print "$md5\n";
144 } else {
145 print "no page content! skipping...\n";
146 return;
147 }
148
149 if ($OPT{remove}) {
150 $tb->delete('docid' => $did, headline => $headline, %{$record});
151 } else {
152 $tb->insert('docid' => $did, headline => $headline, %{$record});
153 }
154 }
155
156 $WAIT::Config = $WAIT::Config;
157
158 __END__
159 ## ###################################################################
160 ## pod
161 ## ###################################################################
162
163 =head1 NAME
164
165 index_html - generate a html database for sman
166
167 =head1 SYNOPSIS
168
169 B<index_html>
170 [B<-database> I<database name>]
171 [B<-dir> I<database directory>]
172 [B<-table> I<name>]
173 [B<-remove>]
174 [I<htmldir> ...]
175
176 =head1 DESCRIPTION
177
178 B<Index_html> generates/updates databases for B<sman>(1). If
179 I<htmldir>s are specified, these are used. Otherwise the script dies.
180
181 =head2 OPTIONS
182
183 =over 10
184
185 =item B<-database> I<database name>
186
187 Change the default database name to I<database name>.
188
189 =item B<-dir> I<database directory>
190
191 Change the default database directory to I<database directory>.
192
193 =item B<-table> I<name>
194
195 Use I<name> instead of C<man> as table name.
196
197 =item B<-clean>
198
199 Clean B<database> before indexing.
200
201 =item B<-remove>
202
203 Remove the selected directories from the database instead of
204 adding/updating. This works only for the manuals which are unchanged
205 since the indexing.
206
207 =back
208
209 =head1 SEE ALSO
210
211 L<sman>.
212
213 =head1 AUTHOR
214
215 Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>

Properties

Name Value
cvs2svn:cvs-rev 1.1
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26