1 |
#!/usr/bin/perl |
2 |
# -*- Mode: Perl -*- |
3 |
# $Basename: HTML.pm $ |
4 |
# $Revision: 1.2 $ |
5 |
# Author : Ulrich Pfeifer with Andreas König |
6 |
# Created On : Sat Nov 1 1997 |
7 |
# Last Modified By: Ulrich Pfeifer |
8 |
# Last Modified On: Wed Nov 5 16:48:17 1997 |
9 |
# Language : CPerl |
10 |
# Update Count : 1 |
11 |
# Status : Unknown, Use with caution! |
12 |
# |
13 |
# (C) Copyright 1997, Ulrich Pfeifer, all rights reserved. |
14 |
# |
15 |
# |
16 |
|
17 |
package WAIT::Parse::HTML; |
18 |
use vars qw(@ISA); |
19 |
require HTML::Parse; |
20 |
require HTML::FormatText; |
21 |
use HTML::Entities qw(decode_entities); |
22 |
@ISA = qw(WAIT::Parse::Base); |
23 |
|
24 |
|
25 |
# Split an HTML document into the fields to be indexed.
#
# Arguments:
#   $self        - invocant (not used by this method)
#   $html_source - the HTML document as a single string
#
# Returns a hash reference with two keys:
#   text  - the document as formatted by HTML::FormatText
#   title - the raw content of the first <title>, <h1>, <h2>, <h3> or <h4>
#           element found in the source (not entity-decoded, may still
#           contain nested markup), or 'no title' when none is found or
#           the element is empty
sub split {
    my ($self, $html_source) = @_;

    # The first capture only exists for the \1 back-reference that pairs
    # the closing tag with the opening one; it is discarded.
    my (undef, $title)
        = $html_source =~ /<(title|h1|h2|h3|h4)[^>]*>(.*?)<\/\1\s*>/si;

    # Test definedness and length rather than truth, so that a document
    # whose title is literally "0" keeps its title.
    $title = 'no title' unless defined $title && length $title;

    my $html      = HTML::Parse::parse_html($html_source);
    my $formatter = HTML::FormatText->new;

    # Scalar assignment pins the context of format(); inside the hash
    # constructor it would be evaluated in list context.
    my $text = $formatter->format($html);

    # HTML::Element trees link parents and children in both directions,
    # so they are not reclaimed by reference counting.  Release the tree
    # explicitly, otherwise every call leaks one parsed document.
    $html->delete;

    return {
        'text'  => $text,
        'title' => $title,
    };
}
38 |
|
39 |
# Cut an HTML document into tagged chunks around its <title> element.
#
# Arguments:
#   $self        - invocant (not used by this method)
#   $html_source - the HTML document as a single string
#
# Returns a flat list of three (attribute hash ref, text) pairs:
#   {'text'  => 1}, everything up to and including the opening <title> tag
#   {'title' => 2}, the title text
#   {'text'  => 1}, the closing </title> tag and everything after it
# Every text is passed through decode_entities().  Concatenating the three
# texts yields the (newline-normalised, entity-decoded) document.
#
# When no <title> element can be found, the whole document is returned in
# the first chunk and the other two texts are empty strings, so the
# document is never silently dropped and the return shape stays the same.
sub tag {
    my ($self, $html_source) = @_;

    $html_source = '' unless defined $html_source;

    # Every carriage return becomes a newline.  Note that a CR/LF pair
    # therefore turns into two newlines.
    $html_source =~ tr/\r/\n/;

    # Allow attributes on <title>, and allow the document to end directly
    # after </title>.
    my ($pre, $title, $body)
        = $html_source =~ m{^(.*?<title\b[^>]*>)(.*?)(</title\s*>.*)}si;

    # No match: keep the complete source as plain text.
    unless (defined $pre) {
        ($pre, $title, $body) = ($html_source, '', '');
    }

    return (
        {'text'  => 1}, decode_entities($pre),
        {'title' => 2}, decode_entities($title),
        {'text'  => 1}, decode_entities($body),
    );
}
57 |
|
58 |
1; |