--- trunk/lib/WebPAC/Input/ISI.pm 2007/10/10 19:01:57 899 +++ branches/Sack/lib/WebPAC/Input/ISI.pm 2009/09/21 20:05:14 1312 @@ -6,17 +6,18 @@ use WebPAC::Input; use base qw/WebPAC::Common/; +use Data::Dump qw/dump/; +use Carp qw/confess/; + =head1 NAME WebPAC::Input::ISI - support for ISI Export Format -=head1 VERSION - -Version 0.00 - =cut -our $VERSION = '0.00'; +our $VERSION = '0.04'; + +our $debug = 0; =head1 SYNOPSIS @@ -54,6 +55,25 @@ =cut +my $subfields = { + 'CR' => sub { + my $full_cr = shift; + my @v = split(/, /, $full_cr); + my $f = { full => $full_cr }; + foreach ( qw/author year reference volume page doi/ ) { + if ( my $tmp = shift @v ) { + $f->{$_} = $tmp; + } + } + if ( $f->{author} =~ /^\*(.+)/ ) { + delete $f->{author}; + $f->{institution} = $1; + } + $f->{doi} =~ s{DOI\s+}{} if $f->{doi}; # strip DOI prefix + return $f; + }, +}; + sub new { my $class = shift; my $self = {@_}; @@ -61,9 +81,7 @@ my $arg = {@_}; - my $log = $self->_get_logger(); - - open( my $fh, '<', $arg->{path} ) || $log->logconfess("can't open $arg->{path}: $!"); + open( my $fh, '<', $arg->{path} ) || confess "can't open $arg->{path}: $!"; my ( $format, $version ); @@ -72,7 +90,7 @@ if ( $line =~ /^FN\s(.+)$/) { $format = $1; } else { - $log->logdie("first line of $arg->{path} has to be FN, but is: $line"); + die "first line of $arg->{path} has to be FN, but is: $line"; } $line = <$fh>; @@ -80,45 +98,29 @@ if ( $line =~ /^VR\s(.+)$/) { $version = $1; } else { - $log->logdie("second line of $arg->{path} has to be VN, but is: $line"); + die "second line of $arg->{path} has to be VN, but is: $line"; } - $log->info("opening $format $version database '$arg->{path}'"); + warn "I: $arg->{path} $format $version - generating record offsets\n"; - my $tag; - my $rec; + $self->{fh} = $fh; + $self->{record_offset} = []; while( $line = <$fh> ) { chomp($line); - - my $v; - - if ( $line =~ /^(\S\S)\s(.+)$/ ) { - $tag = $1; - $v = $2; - } elsif ( $line =~ /^\s{3}(.+)$/ ) { - $v = $1; - } elsif ( $line eq 'ER' ) { - push @{ $self->{_rec} }, $rec; - $rec = {}; - $line = <$fh>; - chomp $line; - $log->logdie("expected blank like in ",$arg->{path}, " +$.: $line") unless ( $line eq '' ); - } elsif ( $line eq 'EF' ) { - last; - } else { - $log->logdie("can't parse +$. $arg->{path} : $line"); - } - - push @{ $rec->{$tag} }, $v; - + next unless $line eq 'ER'; + push @{ $self->{record_offset} }, tell($fh); + last if $self->{limit} && $#{ $self->{record_offset} } >= $self->{limit} - 1 + $self->{offset}; } + push @{ $self->{record_offset} }, tell($fh); # end of file - $log->debug("loaded ", $self->size, " records"); + warn "I $arg->{path} read ", tell($fh), " bytes $#{ $self->{record_offset} } records\n"; - $self ? return $self : return undef; + return $self; } + + =head2 fetch_rec Return record with ID C<$mfn> from database @@ -128,11 +130,68 @@ =cut sub fetch_rec { - my $self = shift; + my ( $self, $mfn, $filter_coderef ) = @_; + + seek $self->{fh}, $self->{record_offset}->[ $mfn - 1 ], 0; + + my $tag; + my $rec; + + my $fh = $self->{fh}; + + while( my $line = <$fh> ) { + chomp($line); + my $v; + + if ( $line eq 'EF' ) { + return; + } elsif ( $line eq 'ER' ) { + + $line = <$fh>; + chomp $line; + die "expected blank like in ",$self->{path}, " +$.: $line" unless $line eq ''; - my ( $mfn, $filter_coderef ) = @_; + # join tags + foreach ( qw/AB DE ID TI SO RP SC FU FX PA JI/ ) { + $rec->{$_} = join(' ', @{ $rec->{$_} }) if defined $rec->{$_}; + } + + # split on ; + foreach ( qw/ID SC DE/ ) { + $rec->{$_} = [ split(/;\s/, $rec->{$_}) ] if defined $rec->{$_}; + } + + $rec->{'000'} = [ $mfn ]; + warn "## mfn $mfn" if $debug; + return $rec; + + } elsif ( $line =~ /^(\S\S)\s(.+)$/ ) { + $tag = $1; + $v = $2; + } elsif ( $line =~ /^\s{3}(.+)$/ ) { + $v = $1; + if ( $tag eq 'CR' && $v =~ m{DOI$} ) { + my $doi = <$fh>; + chomp($doi); + $doi =~ s{^\s{3}}{ } || die "can't find DOI in: $doi"; + $v .= $doi; + } + } elsif ( $line =~ m{^(\S\S)\s*$} ) { + warn "# $self->{path} +$. empty |$line|\n"; + } elsif ( $line ne '' ) { + warn "E: $self->{path} +$ | can't parse |$line|"; + } - return $self->{_rec}->[$mfn-1]; + if ( defined $v ) { + $v = $subfields->{$tag}->($v) if defined $subfields->{$tag}; + + warn "## $tag: ", sub { dump( $v ) } if $debug; + push @{ $rec->{$tag} }, $v; + } + } + + warn "can't get full record $mfn got ", dump $rec; + return $rec; } @@ -146,16 +205,18 @@ sub size { my $self = shift; - return $#{$self->{_rec}} + 1; + $#{ $self->{record_offset} } - $self->{offset}; + # no need for +1 since we record end of file as last record } + =head1 AUTHOR Dobrica Pavlinusic, C<< >> =head1 COPYRIGHT & LICENSE -Copyright 2007 Dobrica Pavlinusic, All Rights Reserved. +Copyright 2009 Dobrica Pavlinusic, All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.