/[wait]/cvs-head/lib/WAIT/Parse/Ora.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /cvs-head/lib/WAIT/Parse/Ora.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 73 - (hide annotations)
Tue Mar 5 13:40:38 2002 UTC (22 years, 3 months ago) by laperla
File size: 4023 byte(s)
- Just a snapshot before we rewrite the indexer since the text is
  marked up better today.

1 ulpfr 54 #!/usr/bin/perl
2     # -*- Mode: Perl -*-
3     # $Basename: HTML.pm $
4 laperla 73 # $Revision: 1.7 $
5 ulpfr 54 # Author : Ulrich Pfeifer with Andreas König
6     # Created On : Sat Nov 1 1997
7     # Last Modified By: Ulrich Pfeifer
8 ulpfr 61 # Last Modified On: Fri Jan 4 16:06:14 2002
9 ulpfr 54 # Language : CPerl
10 ulpfr 61 # Update Count : 14
11 ulpfr 54 # Status : Unknown, Use with caution!
12     #
13     # (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
14     #
15     #
16    
17     package WAIT::Parse::Ora;
18 ulpfr 58 use HTML::Parser;
19 laperla 69 use Encode;
20 ulpfr 58 use strict;
21 ulpfr 54 use vars qw(@ISA);
22     @ISA = qw(WAIT::Parse::Base);
23    
24 laperla 73
25     =pod
26    
27     Text from 2002-03-05 is structured with <div> tags as follows:
28    
29     index.html:
30     <div id="biblio"> BIBLIOGRAPHISCHE ANGABEN
31     <div id="short_desc"> KURZE BESCHREIBUNG
32    
33     desc.html:
34     <div id="long_desc"> AUSFUEHRLICHE BESCHREIBUNG
35    
36     author.html:
37     <div id="author_bio"> BIOGRAPHIE DES AUTOREN
38    
39     translator.html:
40     <div id="translator_bio"> BIOGRAPHIE DES UEBERSETZERS
41    
42     =cut
43    
44 ulpfr 58 my $debug = 0;
45 laperla 72 my %is_text = (
46 laperla 73 p => 'text',
47     a => 'text', # uebersetzer
48 ulpfr 58 # h1 => 'text',
49     # h2 => 'text',
50     # h3 => 'text',
51 laperla 73 title => 'title',
52 ulpfr 58 );
53 ulpfr 54
54 ulpfr 58 my $p = HTML::Parser->new(
55     api_version => 3,
56     start_h => [\&handle_start, "tagname, attr"],
57     end_h => [\&handle_end, "tagname"],
58     text_h => [\&handle_text, "dtext"],
59     marked_sections => 1,
60     );
61     my %result;
62     my $text;
63     my $open;
64    
65     sub handle_start {
66     my $tag = shift;
67 laperla 72 my $attr = shift;
68 ulpfr 58
69 laperla 72 return unless
70     $is_text{$tag} # well-formed paragraphs
71     ||
72     $tag eq "h3" # good for desc, author, and colo
73     ||
74     ($tag eq "font" && $attr->{size} && $attr->{size}==5); # good for index.html
75 ulpfr 58 $open++;
76     print ">" x $open, $tag, "\n" if $debug;
77     }
78    
79     sub handle_end {
80     my $tag = shift;
81    
82 laperla 72 return unless $is_text{$tag};
83 ulpfr 58 print "<" x $open, $tag, "\n" if $debug;
84     $open--;
85     $text =~ s/^\s+//;
86     $text =~ s/\s+$//;
87     $text =~ s/\s+/ /g;
88 laperla 72 $result{$is_text{$tag}} .= $text . ' ';
89 ulpfr 58 $text = '';
90     }
91    
92    
93     sub handle_text {
94 laperla 72 my $c = shift;
95     if ($open > 1 && $c =~ /^(Zur.{1,6}ck\s+zu|Erg.{1,6}nzende O'Reilly Titel)/) {
96     $open--;
97     return;
98     }
99     $text .= $c if $open;
100 ulpfr 58 }
101    
102 laperla 69 sub my_parse ($) {
103     my($s) = @_;
104     my $ls = Encode::encode("ISO-8859-1", $s, 1); # HTML::Parser returns
105     # LATIN for entities
106     # and we would get
107     # mixed content in
108     # result
109     $p->parse($ls);
110     $p->eof;
111     }
112    
113 ulpfr 54 sub split {
114     my ($self, $doc) = @_;
115 laperla 72 my %doc = ( isbn => '',
116     author => '',
117     aboutauthor => '',
118     colophon => '',
119     abstract => ''
120     );
121 ulpfr 54
122 ulpfr 61 if ($doc->{author}) {
123     %result = ();
124     $text = '';
125     $open = 0;
126 laperla 69 my_parse($doc->{author});
127 ulpfr 61 $doc{author} = $result{title};
128 laperla 72 $doc{aboutauthor} = $result{text};
129 ulpfr 61 }
130     if ($doc->{index}) {
131 laperla 72 $doc->{index} =~ /ISBN\s*([^\<]+)/ and $doc{isbn} = $1;
132     %result = ();
133     $text = '';
134     $open = 0;
135     my_parse($doc->{index});
136     $doc{abstract} = $result{text};
137 ulpfr 61 }
138 laperla 65 if ($doc->{colophon}) {
139     %result = ();
140     $text = '';
141     $open = 0;
142 laperla 69 my_parse($doc->{colophon});
143 laperla 65 $doc{colophon} = $result{text};
144     }
145 ulpfr 58 %result = ();
146     $text = '';
147     $open = 0;
148 ulpfr 54
149 laperla 69 my_parse($doc->{desc});
150 ulpfr 54
151 laperla 72 $doc{desc} = $result{text};
152 ulpfr 58 $doc{title} = $result{title};
153    
154 laperla 69 while (my($k,$v) = each %doc) {
155     my $utf8v = Encode::decode("ISO-8859-1",$v);
156     $doc{$k} = $utf8v;
157     }
158    
159 laperla 72 $doc{desc} =~ s/^\s*Ausf\S+hrliche\s+Beschreibung\s*//;
160     $doc{abstract} =~ s/\s*Titel\s+dem\s+Warenkorb\s+hinzu\S+\s*/ /;
161     $doc{abstract} =~ s/\s*Warenkorb\s+anzeigen\s*/ /;
162     # warn "desc[$doc{desc}]";
163     # warn "abstract[$doc{abstract}]"; # zu viel, zu viel!
164    
165 ulpfr 58 return \%doc;
166 ulpfr 54 }
167 ulpfr 58
168     1;

Properties

Name Value
cvs2svn:cvs-rev 1.7

  ViewVC Help
Powered by ViewVC 1.1.26