/[webpac2]/branches/Sack/lib/WebPAC/Input/ISI.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /branches/Sack/lib/WebPAC/Input/ISI.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1310 - (hide annotations)
Mon Sep 21 19:04:14 2009 UTC (14 years, 7 months ago) by dpavlin
File size: 4311 byte(s)
branch for refactoring of WebPAC::Input::* modules for Sack

1 dpavlin 898 package WebPAC::Input::ISI;
2    
3     use warnings;
4     use strict;
5    
6     use WebPAC::Input;
7     use base qw/WebPAC::Common/;
8    
9 dpavlin 904 use Data::Dump qw/dump/;
10    
11 dpavlin 898 =head1 NAME
12    
13     WebPAC::Input::ISI - support for ISI Export Format
14    
15     =cut
16    
17 dpavlin 1209 our $VERSION = '0.03';
18 dpavlin 898
19     =head1 SYNOPSIS
20    
21     Open file in ISI export fromat
22    
23     my $input = new WebPAC::Input::ISI(
24     path => '/path/to/ISI/records.txt',
25     );
26    
27     =head1 FUNCTIONS
28    
29     =head2 new
30    
31     Returns new low-level input API object
32    
33     my $input = new WebPAC::Input::ISI(
34     path => '/path/to/ISI/records.txt'
35     filter => sub {
36     my ($l,$field_nr) = @_;
37     # do something with $l which is line of input file
38     return $l;
39     },
40     }
41    
42     Options:
43    
44     =over 4
45    
46     =item path
47    
48     path to ISI export file
49    
50     =back
51    
52     =cut
53    
54 dpavlin 901 my $subfields = {
55     'CR' => sub {
56 dpavlin 1194 my $full_cr = shift;
57     my @v = split(/, /, $full_cr);
58     my $f = { full => $full_cr };
59 dpavlin 1209 foreach ( qw/author year reference volume page doi/ ) {
60 dpavlin 901 if ( my $tmp = shift @v ) {
61     $f->{$_} = $tmp;
62     }
63     }
64     if ( $f->{author} =~ /^\*(.+)/ ) {
65     delete $f->{author};
66     $f->{institution} = $1;
67     }
68 dpavlin 1209 $f->{doi} =~ s{DOI\s+}{} if $f->{doi}; # strip DOI prefix
69 dpavlin 901 return $f;
70     },
71     };
72    
73 dpavlin 898 sub new {
74     my $class = shift;
75     my $self = {@_};
76     bless($self, $class);
77    
78     my $arg = {@_};
79    
80     my $log = $self->_get_logger();
81    
82     open( my $fh, '<', $arg->{path} ) || $log->logconfess("can't open $arg->{path}: $!");
83    
84     my ( $format, $version );
85    
86     my $line = <$fh>;
87     chomp($line);
88     if ( $line =~ /^FN\s(.+)$/) {
89     $format = $1;
90     } else {
91     $log->logdie("first line of $arg->{path} has to be FN, but is: $line");
92     }
93    
94     $line = <$fh>;
95     chomp($line);
96     if ( $line =~ /^VR\s(.+)$/) {
97     $version = $1;
98     } else {
99     $log->logdie("second line of $arg->{path} has to be VN, but is: $line");
100     }
101    
102     $log->info("opening $format $version database '$arg->{path}'");
103    
104 dpavlin 899 my $tag;
105     my $rec;
106    
107 dpavlin 1303 my $offset = $self->{offset} || 0;
108 dpavlin 1305 my $limit = $self->{limit} || 0;
109 dpavlin 1077
110 dpavlin 1305 my $file_pos = 0;
111     my $end_pos = 0;
112     $end_pos = $offset + $limit if $limit;
113 dpavlin 1287
114 dpavlin 1305 $self->{_rec} = [];
115    
116     warn "# offset: $offset limit: $limit end: $end_pos";
117    
118 dpavlin 899 while( $line = <$fh> ) {
119     chomp($line);
120     my $v;
121    
122 dpavlin 1303 if ( $line eq 'EF' ) {
123     last;
124 dpavlin 899 } elsif ( $line eq 'ER' ) {
125 dpavlin 1305 $file_pos++;
126     last if $end_pos && $file_pos > $end_pos;
127 dpavlin 1303
128 dpavlin 1305 if ( ! $offset || $file_pos > $offset ) {
129    
130     # join tags
131     foreach ( qw/AB DE ID TI SO RP SC FU FX PA JI/ ) {
132     $rec->{$_} = join(' ', @{ $rec->{$_} }) if defined $rec->{$_};
133     }
134     # split on ;
135     foreach ( qw/ID SC DE/ ) {
136     $rec->{$_} = [ split(/;\s/, $rec->{$_}) ] if defined $rec->{$_};
137     }
138     $rec->{'000'} = [ $file_pos ];
139     push @{ $self->{_rec} }, $rec;
140 dpavlin 901 }
141 dpavlin 1287
142 dpavlin 899 $rec = {};
143     $line = <$fh>;
144     chomp $line;
145     $log->logdie("expected blank like in ",$arg->{path}, " +$.: $line") unless ( $line eq '' );
146 dpavlin 1305 } elsif ( $offset && $file_pos < $offset ) {
147 dpavlin 1303 next;
148     } elsif ( $line =~ /^(\S\S)\s(.+)$/ ) {
149 dpavlin 1305 $tag = $1;
150     $v = $2;
151 dpavlin 1303 } elsif ( $line =~ /^\s{3}(.+)$/ ) {
152 dpavlin 1305 $v = $1;
153     if ( $tag eq 'CR' && $v =~ m{DOI$} ) {
154     my $doi = <$fh>;
155     chomp($doi);
156     $doi =~ s{^\s{3}}{ } || die "can't find DOI in: $doi";
157     $v .= $doi;
158     }
159 dpavlin 1302 } elsif ( $line =~ m{^(\S\S)\s*$} ) {
160     warn "# $arg->{path} +$. empty |$line|\n";
161 dpavlin 899 } else {
162 dpavlin 1302 $log->logdie("can't parse +$. $arg->{path} |$line|");
163 dpavlin 899 }
164    
165 dpavlin 904 if ( defined $v ) {
166     $v = $subfields->{$tag}->($v) if defined $subfields->{$tag};
167 dpavlin 901
168 dpavlin 904 $log->debug("$tag: ", sub { dump( $v ) });
169     push @{ $rec->{$tag} }, $v;
170     }
171 dpavlin 899
172     }
173     $log->debug("loaded ", $self->size, " records");
174    
175 dpavlin 898 $self ? return $self : return undef;
176     }
177    
178     =head2 fetch_rec
179    
180     Return record with ID C<$mfn> from database
181    
182 dpavlin 899 my $rec = $input->fetch_rec( $mfn, $filter_coderef );
183 dpavlin 898
184     =cut
185    
186     sub fetch_rec {
187     my $self = shift;
188    
189 dpavlin 899 my ( $mfn, $filter_coderef ) = @_;
190 dpavlin 898
191 dpavlin 1305 $mfn -= $self->{offset} if $self->{offset};
192 dpavlin 899 return $self->{_rec}->[$mfn-1];
193 dpavlin 898 }
194    
195    
196     =head2 size
197    
198     Return number of records in database
199    
200     my $size = $input->size;
201    
202     =cut
203    
204     sub size {
205     my $self = shift;
206 dpavlin 1305 $#{ $self->{_rec} } + 1;
207 dpavlin 898 }
208    
209 dpavlin 900 =head1 SEE ALSO
210    
211     L<http://isibasic.com/help/helpprn.html> is only sane source of document format which Google could find...
212 dpavlin 901
213 dpavlin 898 =head1 AUTHOR
214    
215     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
216    
217     =head1 COPYRIGHT & LICENSE
218    
219     Copyright 2007 Dobrica Pavlinusic, All Rights Reserved.
220    
221     This program is free software; you can redistribute it and/or modify it
222     under the same terms as Perl itself.
223    
224     =cut
225    
226     1; # End of WebPAC::Input::ISI

  ViewVC Help
Powered by ViewVC 1.1.26