# Unified diff of trunk/lib/WebPAC/Input/ISI.pm, revision 899 -> revision 1288.
#
# What the hunks below change:
#  * import dump() from Data::Dump, remove the "VERSION" pod section and set
#    $VERSION to '0.03'
#  * add a $subfields table of per-tag handlers; the 'CR' handler splits the
#    value on ", " into author/year/reference/volume/page/doi (keeping the raw
#    text under 'full'), moves a "*"-prefixed author to 'institution' and
#    removes the "DOI" marker plus following whitespace from the doi field
#  * count records in $self->{size} and leave the read loop once
#    ( offset || 0 ) + limit records were collected (only when limit is set)
#  * when a CR continuation line ends in "DOI", read the next line and append
#    it to the current value
#  * on an 'ER' line: join the listed tags with a single space, split
#    ID/SC/DE on ";" followed by whitespace, and store the running record
#    number under '000'
#  * push a value into the record only when it is defined, after passing it
#    through the matching $subfields handler (if any)
#  * size() returns $self->{size} instead of the element count of
#    $self->{_rec}
#  * add a "SEE ALSO" pod section
#
# NOTE(review): the added `warn "# max_size: $max_size"` runs unconditionally,
#   while $max_size is assigned only when $self->{limit} is true, so it
#   interpolates undef otherwise.
# NOTE(review): in the 'CR' handler `$f->{author} =~ ...` is evaluated even
#   when no author element was stored.
# NOTE(review): the DOI continuation reads `<$fh>` without checking for end
#   of file before chomp/substitution.
# NOTE(review): the pod text "L is only sane source ..." and "C<< >>" look
#   like they lost their angle-bracket contents before reaching this view;
#   confirm against the repository.
# NOTE(review): line breaks and indentation were restored from a flattened
#   copy; blank context lines were placed to match the @@ hunk counts.
#
--- trunk/lib/WebPAC/Input/ISI.pm 2007/10/10 19:01:57 899
+++ trunk/lib/WebPAC/Input/ISI.pm 2009/09/19 08:41:06 1288
@@ -6,18 +6,15 @@
 use WebPAC::Input;
 use base qw/WebPAC::Common/;
 
+use Data::Dump qw/dump/;
+
 =head1 NAME
 
 WebPAC::Input::ISI - support for ISI Export Format
 
-=head1 VERSION
-
-Version 0.00
-
 =cut
 
-our $VERSION = '0.00';
-
+our $VERSION = '0.03';
 
 =head1 SYNOPSIS
 
@@ -54,6 +51,25 @@
 
 =cut
 
+my $subfields = {
+	'CR' => sub {
+		my $full_cr = shift;
+		my @v = split(/, /, $full_cr);
+		my $f = { full => $full_cr };
+		foreach ( qw/author year reference volume page doi/ ) {
+			if ( my $tmp = shift @v ) {
+				$f->{$_} = $tmp;
+			}
+		}
+		if ( $f->{author} =~ /^\*(.+)/ ) {
+			delete $f->{author};
+			$f->{institution} = $1;
+		}
+		$f->{doi} =~ s{DOI\s+}{} if $f->{doi}; # strip DOI prefix
+		return $f;
+	},
+};
+
 sub new {
 	my $class = shift;
 	my $self = {@_};
@@ -88,6 +104,12 @@
 	my $tag;
 	my $rec;
 
+	$self->{size} = 0;
+	my $max_size;
+	$max_size = ( $self->{offset} || 0 ) + $self->{limit} if $self->{limit};
+
+	warn "# max_size: $max_size";
+
 	while( $line = <$fh> ) {
 		chomp($line);
 
@@ -98,8 +120,26 @@
 			$v = $2;
 		} elsif ( $line =~ /^\s{3}(.+)$/ ) {
 			$v = $1;
+			if ( $tag eq 'CR' && $v =~ m{DOI$} ) {
+				my $doi = <$fh>;
+				chomp($doi);
+				$doi =~ s{^\s{3}}{ } || die "can't find DOI in: $doi";
+				$v .= $doi;
+			}
 		} elsif ( $line eq 'ER' ) {
+			# join tags
+			foreach ( qw/AB DE ID TI SO RP SC FU FX PA JI/ ) {
+				$rec->{$_} = join(' ', @{ $rec->{$_} }) if defined $rec->{$_};
+			}
+			# split on ;
+			foreach ( qw/ID SC DE/ ) {
+				$rec->{$_} = [ split(/;\s/, $rec->{$_}) ] if defined $rec->{$_};
+			}
+			$rec->{'000'} = [ ++$self->{size} ];
 			push @{ $self->{_rec} }, $rec;
+
+			last if $max_size && $self->{size} == $max_size;
+
 			$rec = {};
 			$line = <$fh>;
 			chomp $line;
@@ -110,7 +150,12 @@
 			$log->logdie("can't parse +$. $arg->{path} : $line");
 		}
 
-		push @{ $rec->{$tag} }, $v;
+		if ( defined $v ) {
+			$v = $subfields->{$tag}->($v) if defined $subfields->{$tag};
+
+			$log->debug("$tag: ", sub { dump( $v ) });
+			push @{ $rec->{$tag} }, $v;
+		}
 
 	}
 
@@ -146,9 +191,13 @@
 
 sub size {
 	my $self = shift;
-	return $#{$self->{_rec}} + 1;
+	return $self->{size};
 }
 
+=head1 SEE ALSO
+
+L is only sane source of document format which Google could find...
+
 =head1 AUTHOR
 
 Dobrica Pavlinusic, C<< >>