/[wait]/cvs-head/lib/WAIT/Parse/Ora.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /cvs-head/lib/WAIT/Parse/Ora.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 69 - (hide annotations)
Fri Jan 25 07:27:30 2002 UTC (22 years, 4 months ago) by laperla
File size: 2857 byte(s)
- Produced the first index that worked with 5.7.2@14354

1 ulpfr 54 #!/usr/bin/perl
2     # -*- Mode: Perl -*-
3     # $Basename: HTML.pm $
4 laperla 69 # $Revision: 1.5 $
5 ulpfr 54 # Author : Ulrich Pfeifer with Andreas König
6     # Created On : Sat Nov 1 1997
7     # Last Modified By: Ulrich Pfeifer
8 ulpfr 61 # Last Modified On: Fri Jan 4 16:06:14 2002
9 ulpfr 54 # Language : CPerl
10 ulpfr 61 # Update Count : 14
11 ulpfr 54 # Status : Unknown, Use with caution!
12     #
13     # (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
14     #
15     #
16    
17     package WAIT::Parse::Ora;
18 ulpfr 58 use HTML::Parser;
19 laperla 69 use Encode;
20 ulpfr 58 use strict;
21 ulpfr 54 use vars qw(@ISA);
22     @ISA = qw(WAIT::Parse::Base);
23    
24 ulpfr 58 my $debug = 0;
25     my %text = (
26     p => 'text',
27     # h1 => 'text',
28     # h2 => 'text',
29     # h3 => 'text',
30     title => 'title',
31     );
32 ulpfr 54
33 ulpfr 58 my $p = HTML::Parser->new(
34     api_version => 3,
35     start_h => [\&handle_start, "tagname, attr"],
36     end_h => [\&handle_end, "tagname"],
37     text_h => [\&handle_text, "dtext"],
38     marked_sections => 1,
39     );
40     my %result;
41     my $text;
42     my $open;
43    
44     sub handle_start {
45     my $tag = shift;
46    
47     return unless $text{$tag};
48     $open++;
49     print ">" x $open, $tag, "\n" if $debug;
50     }
51    
52     sub handle_end {
53     my $tag = shift;
54    
55     return unless $text{$tag};
56     print "<" x $open, $tag, "\n" if $debug;
57     $open--;
58     $text =~ s/^\s+//;
59     $text =~ s/\s+$//;
60     $text =~ s/\s+/ /g;
61     $result{$text{$tag}} .= $text . ' ';
62     $text = '';
63     }
64    
65    
66     sub handle_text {
67     $text .= $_[0] if $open;
68     }
69    
70 laperla 69 sub my_parse ($) {
71     my($s) = @_;
72     my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns
73     # LATIN for entities
74     # and we would get
75     # mixed content in
76     # result
77     $p->parse($ls);
78     $p->eof;
79     }
80    
81 ulpfr 54 sub split {
82     my ($self, $doc) = @_;
83 laperla 65 my %doc = ( isbn => '', author => '', about => '', colophon => '' );
84 ulpfr 54 my $desc = $doc->{desc};
85     my $auth = $doc->{author};
86 laperla 65 my $colophon = $doc->{colophon};
87 ulpfr 54
88 ulpfr 61 if ($doc->{author}) {
89     %result = ();
90     $text = '';
91     $open = 0;
92 laperla 69 my_parse($doc->{author});
93 ulpfr 61 $doc{author} = $result{title};
94 laperla 65 $doc{author} =~ s/^By\s+//;
95 ulpfr 61 $doc{about} = $result{text};
96     }
97     if ($doc->{index}) {
98 laperla 65 $doc->{index} =~ /ISBN\s*([^<]+)/ and $doc{isbn} = $1;
99 ulpfr 61 }
100 laperla 65 if ($doc->{colophon}) {
101     %result = ();
102     $text = '';
103     $open = 0;
104 laperla 69 my_parse($doc->{colophon});
105 laperla 65 $doc{colophon} = $result{text};
106     }
107 ulpfr 58 %result = ();
108     $text = '';
109     $open = 0;
110 ulpfr 54
111 laperla 69 my_parse($doc->{desc});
112 ulpfr 54
113 ulpfr 58 $doc{text} = $result{text};
114     $doc{title} = $result{title};
115    
116 laperla 69 while (my($k,$v) = each %doc) {
117     my $utf8v = Encode::decode("ISO-8859-1",$v);
118     $doc{$k} = $utf8v;
119     }
120    
121 ulpfr 58 return \%doc;
122 ulpfr 54 }
123 ulpfr 58
124     1;

Properties

Name Value
cvs2svn:cvs-rev 1.5

  ViewVC Help
Powered by ViewVC 1.1.26