/[wait]/cvs-head/lib/WAIT/Parse/Ora.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /cvs-head/lib/WAIT/Parse/Ora.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 69 - (show annotations)
Fri Jan 25 07:27:30 2002 UTC (22 years, 4 months ago) by laperla
File size: 2857 byte(s)
- Produced the first index that worked with 5.7.2@14354

1 #!/usr/bin/perl
2 # -*- Mode: Perl -*-
3 # $Basename: HTML.pm $
4 # $Revision: 1.5 $
5 # Author : Ulrich Pfeifer with Andreas König
6 # Created On : Sat Nov 1 1997
7 # Last Modified By: Ulrich Pfeifer
8 # Last Modified On: Fri Jan 4 16:06:14 2002
9 # Language : CPerl
10 # Update Count : 14
11 # Status : Unknown, Use with caution!
12 #
13 # (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
14 #
15 #
16
17 package WAIT::Parse::Ora;
18 use HTML::Parser;
19 use Encode;
20 use strict;
21 use vars qw(@ISA);
22 @ISA = qw(WAIT::Parse::Base);
23
24 my $debug = 0;
25 my %text = (
26 p => 'text',
27 # h1 => 'text',
28 # h2 => 'text',
29 # h3 => 'text',
30 title => 'title',
31 );
32
33 my $p = HTML::Parser->new(
34 api_version => 3,
35 start_h => [\&handle_start, "tagname, attr"],
36 end_h => [\&handle_end, "tagname"],
37 text_h => [\&handle_text, "dtext"],
38 marked_sections => 1,
39 );
40 my %result;
41 my $text;
42 my $open;
43
44 sub handle_start {
45 my $tag = shift;
46
47 return unless $text{$tag};
48 $open++;
49 print ">" x $open, $tag, "\n" if $debug;
50 }
51
52 sub handle_end {
53 my $tag = shift;
54
55 return unless $text{$tag};
56 print "<" x $open, $tag, "\n" if $debug;
57 $open--;
58 $text =~ s/^\s+//;
59 $text =~ s/\s+$//;
60 $text =~ s/\s+/ /g;
61 $result{$text{$tag}} .= $text . ' ';
62 $text = '';
63 }
64
65
66 sub handle_text {
67 $text .= $_[0] if $open;
68 }
69
70 sub my_parse ($) {
71 my($s) = @_;
72 my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns
73 # LATIN for entities
74 # and we would get
75 # mixed content in
76 # result
77 $p->parse($ls);
78 $p->eof;
79 }
80
81 sub split {
82 my ($self, $doc) = @_;
83 my %doc = ( isbn => '', author => '', about => '', colophon => '' );
84 my $desc = $doc->{desc};
85 my $auth = $doc->{author};
86 my $colophon = $doc->{colophon};
87
88 if ($doc->{author}) {
89 %result = ();
90 $text = '';
91 $open = 0;
92 my_parse($doc->{author});
93 $doc{author} = $result{title};
94 $doc{author} =~ s/^By\s+//;
95 $doc{about} = $result{text};
96 }
97 if ($doc->{index}) {
98 $doc->{index} =~ /ISBN\s*([^<]+)/ and $doc{isbn} = $1;
99 }
100 if ($doc->{colophon}) {
101 %result = ();
102 $text = '';
103 $open = 0;
104 my_parse($doc->{colophon});
105 $doc{colophon} = $result{text};
106 }
107 %result = ();
108 $text = '';
109 $open = 0;
110
111 my_parse($doc->{desc});
112
113 $doc{text} = $result{text};
114 $doc{title} = $result{title};
115
116 while (my($k,$v) = each %doc) {
117 my $utf8v = Encode::decode("ISO-8859-1",$v);
118 $doc{$k} = $utf8v;
119 }
120
121 return \%doc;
122 }
123
124 1;

Properties

Name Value
cvs2svn:cvs-rev 1.5

  ViewVC Help
Powered by ViewVC 1.1.26