/[wait]/trunk/script/index_html
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/script/index_html

Parent Directory Parent Directory | Revision Log Revision Log


Revision 118 - (hide annotations)
Fri Jul 15 18:59:10 2005 UTC (18 years, 10 months ago) by dpavlin
File size: 4964 byte(s)
some rather old changes from 2004-05-28

1 dpavlin 118 #!/usr/bin/perl -w
2 ulpfr 17
3     use strict;
4    
5     use Getopt::Long;
6 dpavlin 118 use Digest::MD5 qw(md5_hex);
7     use Data::Dumper;
8 ulpfr 17
9     require WAIT::Database;
10     require WAIT::Config;
11     require WAIT::Parse::HTML;
12     require WAIT::Document::Find;
13    
14 dpavlin 118 use utf8;
15 ulpfr 17
16 dpavlin 118 use lib '/data/wait/lib';
17    
18 ulpfr 17 my %OPT = (database => 'DB',
19     dir => $WAIT::Config->{WAIT_home} || '/tmp',
20 dpavlin 118 table => 'html',
21 ulpfr 17 clean => 0,
22     remove => 0,
23 dpavlin 118 force => 0,
24 ulpfr 17 );
25    
26     GetOptions(\%OPT,
27     'database=s',
28     'dir=s',
29     'table=s',
30     'clean!',
31 dpavlin 118 'remove!',
32     'force!',
33     );
34    
35     my $path = shift @ARGV ||
36     die "Usage: $0 [-database=$OPT{database}] [-dir=$OPT{dir}] [-table=$OPT{table}]\n\t[-clean] [-remove] [-force] directory_with_htmls\n";
37 ulpfr 17
38 dpavlin 118 if ($OPT{clean}) {
39     my $db = WAIT::Database->open(
40     name => $OPT{database},
41     'directory' => $OPT{dir},
42     )
43     or die "Could not open database '$OPT{dir}/$OPT{database}': $@";
44     $db->drop_table(name => $OPT{table}) or
45     die "Could not drop table '$OPT{tabel}': $@";
46    
47     $db->close;
48 ulpfr 17 }
49    
50 dpavlin 118 my $db = WAIT::Database->open(
51     name => $OPT{database},
52     'directory' => $OPT{dir},
53     )
54     || WAIT::Database->create(
55     name => $OPT{database},
56     'directory' => $OPT{dir},
57     )
58     or die "Could not open/create database '$OPT{dir}/$OPT{database}': $@";
59    
60 ulpfr 17 my $layout= new WAIT::Parse::HTML;
61     my $stem = [{
62     'prefix' => ['isotr', 'isolc'],
63     'intervall' => ['isotr', 'isolc'],
64     },'decode_entities', 'isotr', 'isolc', 'split2', 'stop', 'Stem'];
65     my $text = [{
66     'prefix' => ['isotr', 'isolc'],
67     'intervall' => ['isotr', 'isolc'],
68     },
69     'decode_entities', 'isotr', 'isolc', 'split2', 'stop'];
70     my $sound = ['decode_entities', 'isotr', 'isolc', 'split2', 'Soundex'];
71    
72     my %D;
73    
74     my $access = tie (%D, 'WAIT::Document::Find', sub { $_[0] =~ /\.htm/; },
75 dpavlin 118 $path);
76 ulpfr 17 die $@ unless defined $access;
77    
78    
79     my $tb = $db->table(name => $OPT{table}) ||
80     $db->create_table
81     (name => $OPT{table},
82 dpavlin 118 attr => ['docid', 'headline', 'size', 'md5'],
83     keyset => [['docid', 'md5']],
84 ulpfr 17 layout => $layout,
85     access => $access,
86     invindex =>
87     [
88     'text' => $stem,
89     'title' => $stem,
90     'title' => $text,
91     ]
92     );
93     die unless $tb;
94    
95     while (my ($path, $content) = each %D) {
96     &index($path, $content);
97     }
98     $db->close();
99     exit;
100    
101     my $NO;
102     sub index {
103     my ($did, $value) = @_;
104     if ($tb->have('docid' => $did)) {
105 dpavlin 118 if (!$OPT{remove} && !$OPT{force}) {
106 ulpfr 17 print "duplicate\n";
107     return;
108     }
109     } elsif ($OPT{remove}) {
110     print "missing\n";
111     return;
112     }
113    
114     if (-s $did < 100) {
115     print "too small\n";
116     return;
117     }
118    
119     unless (defined $value) {
120     print "unavailable\n";
121     return;
122     }
123     printf STDERR "ok [%d]\n", ++$NO;
124    
125     my $record = $layout->split($value);
126     $record->{size} = length($value);
127     my $headline = $record->{title} || $did;
128     $headline =~ s/\s+/ /g; $headline =~ s/^\s+//;
129     printf "%s\n", substr($headline,0,80);
130 dpavlin 118 my $t = \$record->{text};
131     if ($$t) {
132     my $md5;
133     if (utf8::is_utf8($$t)) {
134     $md5 = Dumper($$t)
135     } else {
136     $md5 = md5_hex($$t);
137     }
138     if ($tb->have('md5' => $md5)) {
139     print "duplicate md5\n";
140     return;
141     }
142     $record->{md5} = $md5;
143     print "$md5\n";
144     } else {
145     print "no page content! skipping...\n";
146     return;
147     }
148    
149 ulpfr 17 if ($OPT{remove}) {
150     $tb->delete('docid' => $did, headline => $headline, %{$record});
151     } else {
152     $tb->insert('docid' => $did, headline => $headline, %{$record});
153     }
154     }
155    
156 dpavlin 118 $WAIT::Config = $WAIT::Config;
157 ulpfr 17
158     __END__
159     ## ###################################################################
160     ## pod
161     ## ###################################################################
162    
163     =head1 NAME
164    
165 dpavlin 118 index_html - generate a html database for sman
166 ulpfr 17
167     =head1 SYNOPSIS
168    
169     B<index_html>
170     [B<-database> I<database name>]
171     [B<-dir> I<database directory>]
172     [B<-table> I<name>]
173     [B<-remove>]
174 dpavlin 118 [I<htmldir> ...]
175 ulpfr 17
176     =head1 DESCRIPTION
177    
178     B<Index_html> generates/updates databases for B<sman>(1). If
179 dpavlin 118 I<htmldir>s are specified, these are used. Otherwise the script dies.
180 ulpfr 17
181     =head2 OPTIONS
182    
183     =over 10
184    
185     =item B<-database> I<database name>
186    
187     Change the default database name to I<database name>.
188    
189     =item B<-dir> I<database directory>
190    
191     Change the default database directory to I<database directory>.
192    
193     =item B<-table> I<name>
194    
195     Use I<name> instead of C<man> as table name.
196    
197     =item B<-clean>
198    
199     Clean B<database> before indexing.
200    
201     =item B<-remove>
202    
203     Remove the selected directories from the database instead of
204     adding/updating. This works only for the manuals which are unchanged
205     since the indexing.
206    
207 dpavlin 109 =back
208    
209 ulpfr 17 =head1 SEE ALSO
210    
211     L<sman>.
212    
213     =head1 AUTHOR
214    
215     Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>

Properties

Name Value
cvs2svn:cvs-rev 1.1
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26