/[wait]/cvs-head/lib/WAIT/Parse/Ora.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /cvs-head/lib/WAIT/Parse/Ora.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 73 - (show annotations)
Tue Mar 5 13:40:38 2002 UTC (22 years, 3 months ago) by laperla
File size: 4023 byte(s)
- Just a snapshot before we rewrite the indexer since the text is
  marked up better today.

1 #!/usr/bin/perl
2 # -*- Mode: Perl -*-
3 # $Basename: HTML.pm $
4 # $Revision: 1.7 $
5 # Author : Ulrich Pfeifer with Andreas König
6 # Created On : Sat Nov 1 1997
7 # Last Modified By: Ulrich Pfeifer
8 # Last Modified On: Fri Jan 4 16:06:14 2002
9 # Language : CPerl
10 # Update Count : 14
11 # Status : Unknown, Use with caution!
12 #
13 # (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
14 #
15 #
16
17 package WAIT::Parse::Ora;
18 use HTML::Parser;
19 use Encode;
20 use strict;
21 use vars qw(@ISA);
22 @ISA = qw(WAIT::Parse::Base);
23
24
25 =pod
26
27 Text from 2002-03-05 is structured with <div> tags as follows:
28
29 index.html:
30 <div id="biblio"> BIBLIOGRAPHISCHE ANGABEN
31 <div id="short_desc"> KURZE BESCHREIBUNG
32
33 desc.html:
34 <div id="long_desc"> AUSFUEHRLICHE BESCHREIBUNG
35
36 author.html:
37 <div id="author_bio"> BIOGRAPHIE DES AUTOREN
38
39 translator.html:
40 <div id="translator_bio"> BIOGRAPHIE DES UEBERSETZERS
41
42 =cut
43
44 my $debug = 0;
45 my %is_text = (
46 p => 'text',
47 a => 'text', # uebersetzer
48 # h1 => 'text',
49 # h2 => 'text',
50 # h3 => 'text',
51 title => 'title',
52 );
53
54 my $p = HTML::Parser->new(
55 api_version => 3,
56 start_h => [\&handle_start, "tagname, attr"],
57 end_h => [\&handle_end, "tagname"],
58 text_h => [\&handle_text, "dtext"],
59 marked_sections => 1,
60 );
61 my %result;
62 my $text;
63 my $open;
64
65 sub handle_start {
66 my $tag = shift;
67 my $attr = shift;
68
69 return unless
70 $is_text{$tag} # well-formed paragraphs
71 ||
72 $tag eq "h3" # good for desc, author, and colo
73 ||
74 ($tag eq "font" && $attr->{size} && $attr->{size}==5); # good for index.html
75 $open++;
76 print ">" x $open, $tag, "\n" if $debug;
77 }
78
79 sub handle_end {
80 my $tag = shift;
81
82 return unless $is_text{$tag};
83 print "<" x $open, $tag, "\n" if $debug;
84 $open--;
85 $text =~ s/^\s+//;
86 $text =~ s/\s+$//;
87 $text =~ s/\s+/ /g;
88 $result{$is_text{$tag}} .= $text . ' ';
89 $text = '';
90 }
91
92
93 sub handle_text {
94 my $c = shift;
95 if ($open > 1 && $c =~ /^(Zur.{1,6}ck\s+zu|Erg.{1,6}nzende O'Reilly Titel)/) {
96 $open--;
97 return;
98 }
99 $text .= $c if $open;
100 }
101
102 sub my_parse ($) {
103 my($s) = @_;
104 my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns
105 # LATIN for entities
106 # and we would get
107 # mixed content in
108 # result
109 $p->parse($ls);
110 $p->eof;
111 }
112
113 sub split {
114 my ($self, $doc) = @_;
115 my %doc = ( isbn => '',
116 author => '',
117 aboutauthor => '',
118 colophon => '',
119 abstract => ''
120 );
121
122 if ($doc->{author}) {
123 %result = ();
124 $text = '';
125 $open = 0;
126 my_parse($doc->{author});
127 $doc{author} = $result{title};
128 $doc{aboutauthor} = $result{text};
129 }
130 if ($doc->{index}) {
131 $doc->{index} =~ /ISBN\s*([^\<]+)/ and $doc{isbn} = $1;
132 %result = ();
133 $text = '';
134 $open = 0;
135 my_parse($doc->{index});
136 $doc{abstract} = $result{text};
137 }
138 if ($doc->{colophon}) {
139 %result = ();
140 $text = '';
141 $open = 0;
142 my_parse($doc->{colophon});
143 $doc{colophon} = $result{text};
144 }
145 %result = ();
146 $text = '';
147 $open = 0;
148
149 my_parse($doc->{desc});
150
151 $doc{desc} = $result{text};
152 $doc{title} = $result{title};
153
154 while (my($k,$v) = each %doc) {
155 my $utf8v = Encode::decode("ISO-8859-1",$v);
156 $doc{$k} = $utf8v;
157 }
158
159 $doc{desc} =~ s/^\s*Ausf\S+hrliche\s+Beschreibung\s*//;
160 $doc{abstract} =~ s/\s*Titel\s+dem\s+Warenkorb\s+hinzu\S+\s*/ /;
161 $doc{abstract} =~ s/\s*Warenkorb\s+anzeigen\s*/ /;
162 # warn "desc[$doc{desc}]";
163 # warn "abstract[$doc{abstract}]"; # zu viel, zu viel!
164
165 return \%doc;
166 }
167
168 1;

Properties

Name Value
cvs2svn:cvs-rev 1.7

  ViewVC Help
Powered by ViewVC 1.1.26