NOTE(review): the line breaks of this patch were restored from a copy in
which all whitespace had been collapsed.  The positions of blank context
lines, the leading indentation and the tabs in the file headers are
inferred from the hunk line counts -- confirm against the pre-patch file,
or apply with "patch -l".  The added code has also been tightened: no
"my ... if" declarations, explicit failures in conv_getline, and a more
tolerant XML declaration match.
--- cvs-head/lib/WAIT/Document/Ora.pm	2002/01/23 12:22:54	65
+++ cvs-head/lib/WAIT/Document/Ora.pm	2002/01/25 07:27:30	69
@@ -14,9 +14,10 @@
 
 package WAIT::Document::Ora;
 @ISA = qw(WAIT::Document::Base);
-require WAIT::Document::Base;
+use WAIT::Document::Base;
 
 use IO::File;
+use Encode;
 use strict;
 use Carp;
 
@@ -51,13 +52,13 @@
   local($/) = undef;
 
   my $fh = IO::File->new(join('/',$self->{Dir},$file,'desc.html'));
-  my $desc = $fh->getline();
+  my $desc = conv_getline($fh);
   $fh = IO::File->new(join('/',$self->{Dir},$file,'author.html'));
-  my $author = $fh->getline() if $fh;
+  my $author = $fh ? conv_getline($fh) : undef;
   $fh = IO::File->new(join('/',$self->{Dir},$file,'index.html'));
-  my $index = $fh->getline() if $fh;
+  my $index = $fh ? conv_getline($fh) : undef;
   $fh = IO::File->new(join('/',$self->{Dir},$file,'colophon.html'));
-  my $colophon = $fh->getline() if $fh;
+  my $colophon = $fh ? conv_getline($fh) : undef;
   return {
           desc => $desc,
           author => $author,
@@ -66,6 +67,39 @@
          };
 }
 
+# WAIT::Document::Ora::conv_getline
+#
+# Read everything from the open, seekable filehandle $fh and return it
+# decoded to a Perl character string, with every run of whitespace
+# collapsed to a single blank.  The source encoding is taken from an
+# XML declaration on the first line (<?xml ... encoding="..."?>);
+# without one, ISO-8859-1 is assumed.  Croaks when $fh is undefined
+# or cannot be rewound.
+sub conv_getline ($) {
+  my($fh) = shift;
+  croak("conv_getline: no filehandle") unless defined $fh;
+  local $/ = "\n";
+  my $firstline = <$fh>;
+  $firstline = "" unless defined $firstline; # empty file
+  my $src_enc;
+  # \042 is double quote, \047 is single quote. I avoid single quotes
+  # here just for easier copy and paste to the terminal (I need to
+  # debug here frequently).  XML allows blanks around the "=" and
+  # dots in the encoding name, so both are accepted.
+  if ($firstline =~ /<\?xml[^>]+encoding\s*=\s*([\042\047])([\w.\-]+)\1/) {
+    $src_enc = $2;
+  } else {
+    $src_enc = "ISO-8859-1";
+  }
+  seek($fh, 0, 0) or croak("conv_getline: cannot rewind filehandle: $!");
+  undef $/; # slurp mode; the local above restores $/ on return
+  my $content = <$fh>;
+  $content = "" unless defined $content;
+  $content =~ s/\s+/ /g; # eliminate TABs and CRs for easier debugging
+  my $dcontent = Encode::decode($src_enc,$content);
+  $dcontent;
+}
+
 sub FIRSTKEY {
   my $self = shift;
   $self->{fno} = 0;