--- trunk/lib/WebPAC/Input/ISI.pm 2007/10/10 19:01:55 898 +++ branches/Sack/lib/WebPAC/Input/ISI.pm 2009/09/21 20:05:14 1312 @@ -6,17 +6,18 @@ use WebPAC::Input; use base qw/WebPAC::Common/; +use Data::Dump qw/dump/; +use Carp qw/confess/; + =head1 NAME WebPAC::Input::ISI - support for ISI Export Format -=head1 VERSION - -Version 0.00 - =cut -our $VERSION = '0.00'; +our $VERSION = '0.04'; + +our $debug = 0; =head1 SYNOPSIS @@ -54,6 +55,25 @@ =cut +my $subfields = { + 'CR' => sub { + my $full_cr = shift; + my @v = split(/, /, $full_cr); + my $f = { full => $full_cr }; + foreach ( qw/author year reference volume page doi/ ) { + if ( my $tmp = shift @v ) { + $f->{$_} = $tmp; + } + } + if ( $f->{author} =~ /^\*(.+)/ ) { + delete $f->{author}; + $f->{institution} = $1; + } + $f->{doi} =~ s{DOI\s+}{} if $f->{doi}; # strip DOI prefix + return $f; + }, +}; + sub new { my $class = shift; my $self = {@_}; @@ -61,9 +81,7 @@ my $arg = {@_}; - my $log = $self->_get_logger(); - - open( my $fh, '<', $arg->{path} ) || $log->logconfess("can't open $arg->{path}: $!"); + open( my $fh, '<', $arg->{path} ) || confess "can't open $arg->{path}: $!"; my ( $format, $version ); @@ -72,7 +90,7 @@ if ( $line =~ /^FN\s(.+)$/) { $format = $1; } else { - $log->logdie("first line of $arg->{path} has to be FN, but is: $line"); + die "first line of $arg->{path} has to be FN, but is: $line"; } $line = <$fh>; @@ -80,29 +98,99 @@ if ( $line =~ /^VR\s(.+)$/) { $version = $1; } else { - $log->logdie("second line of $arg->{path} has to be VN, but is: $line"); + die "second line of $arg->{path} has to be VN, but is: $line"; + } + + warn "I: $arg->{path} $format $version - generating record offsets\n"; + + $self->{fh} = $fh; + $self->{record_offset} = []; + + while( $line = <$fh> ) { + chomp($line); + next unless $line eq 'ER'; + push @{ $self->{record_offset} }, tell($fh); + last if $self->{limit} && $#{ $self->{record_offset} } >= $self->{limit} - 1 + $self->{offset}; } + push @{ $self->{record_offset} }, tell($fh); # end of file - $log->info("opening $format $version database '$arg->{path}'"); + warn "I $arg->{path} read ", tell($fh), " bytes $#{ $self->{record_offset} } records\n"; - $self ? return $self : return undef; + return $self; } + + =head2 fetch_rec Return record with ID C<$mfn> from database - my $rec = $input->fetch_rec( $mfn, $filter_coderef); + my $rec = $input->fetch_rec( $mfn, $filter_coderef ); =cut sub fetch_rec { - my $self = shift; + my ( $self, $mfn, $filter_coderef ) = @_; - my ($mfn, $filter_coderef) = @_; + seek $self->{fh}, $self->{record_offset}->[ $mfn - 1 ], 0; + my $tag; my $rec; + my $fh = $self->{fh}; + + while( my $line = <$fh> ) { + chomp($line); + my $v; + + if ( $line eq 'EF' ) { + return; + } elsif ( $line eq 'ER' ) { + + $line = <$fh>; + chomp $line; + die "expected blank like in ",$self->{path}, " +$.: $line" unless $line eq ''; + + # join tags + foreach ( qw/AB DE ID TI SO RP SC FU FX PA JI/ ) { + $rec->{$_} = join(' ', @{ $rec->{$_} }) if defined $rec->{$_}; + } + + # split on ; + foreach ( qw/ID SC DE/ ) { + $rec->{$_} = [ split(/;\s/, $rec->{$_}) ] if defined $rec->{$_}; + } + + $rec->{'000'} = [ $mfn ]; + warn "## mfn $mfn" if $debug; + return $rec; + + } elsif ( $line =~ /^(\S\S)\s(.+)$/ ) { + $tag = $1; + $v = $2; + } elsif ( $line =~ /^\s{3}(.+)$/ ) { + $v = $1; + if ( $tag eq 'CR' && $v =~ m{DOI$} ) { + my $doi = <$fh>; + chomp($doi); + $doi =~ s{^\s{3}}{ } || die "can't find DOI in: $doi"; + $v .= $doi; + } + } elsif ( $line =~ m{^(\S\S)\s*$} ) { + warn "# $self->{path} +$. empty |$line|\n"; + } elsif ( $line ne '' ) { + warn "E: $self->{path} +$ | can't parse |$line|"; + } + + if ( defined $v ) { + $v = $subfields->{$tag}->($v) if defined $subfields->{$tag}; + + warn "## $tag: ", sub { dump( $v ) } if $debug; + push @{ $rec->{$tag} }, $v; + } + } + + warn "can't get full record $mfn got ", dump $rec; return $rec; } @@ -117,16 +205,18 @@ sub size { my $self = shift; - return 2; + $#{ $self->{record_offset} } - $self->{offset}; + # no need for +1 since we record end of file as last record } + =head1 AUTHOR Dobrica Pavlinusic, C<< >> =head1 COPYRIGHT & LICENSE -Copyright 2007 Dobrica Pavlinusic, All Rights Reserved. +Copyright 2009 Dobrica Pavlinusic, All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.