/[webpac2]/branches/Sack/lib/WebPAC/Input/ISI.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /branches/Sack/lib/WebPAC/Input/ISI.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1315 - (hide annotations)
Sun Oct 4 12:32:19 2009 UTC (14 years, 7 months ago) by dpavlin
File size: 4339 byte(s)
fix warning, return correct size
1 dpavlin 898 package WebPAC::Input::ISI;
2    
3     use warnings;
4     use strict;
5    
6     use WebPAC::Input;
7     use base qw/WebPAC::Common/;
8    
9 dpavlin 904 use Data::Dump qw/dump/;
10 dpavlin 1311 use Carp qw/confess/;
11 dpavlin 904
12 dpavlin 898 =head1 NAME
13    
14     WebPAC::Input::ISI - support for ISI Export Format
15    
16     =cut
17    
18 dpavlin 1311 our $VERSION = '0.04';
19 dpavlin 898
20 dpavlin 1311 our $debug = 0;
21    
22    
23 dpavlin 898 =head1 SYNOPSIS
24    
25     Open file in ISI export fromat
26    
27     my $input = new WebPAC::Input::ISI(
28     path => '/path/to/ISI/records.txt',
29     );
30    
31     =head1 FUNCTIONS
32    
33     =head2 new
34    
35     Returns new low-level input API object
36    
37     my $input = new WebPAC::Input::ISI(
38     path => '/path/to/ISI/records.txt'
39     filter => sub {
40     my ($l,$field_nr) = @_;
41     # do something with $l which is line of input file
42     return $l;
43     },
44     }
45    
46     Options:
47    
48     =over 4
49    
50     =item path
51    
52     path to ISI export file
53    
54     =back
55    
56     =cut
57    
58 dpavlin 1314 our $subfields = {
59 dpavlin 901 'CR' => sub {
60 dpavlin 1194 my $full_cr = shift;
61     my @v = split(/, /, $full_cr);
62     my $f = { full => $full_cr };
63 dpavlin 1209 foreach ( qw/author year reference volume page doi/ ) {
64 dpavlin 901 if ( my $tmp = shift @v ) {
65     $f->{$_} = $tmp;
66     }
67     }
68     if ( $f->{author} =~ /^\*(.+)/ ) {
69     delete $f->{author};
70     $f->{institution} = $1;
71     }
72 dpavlin 1209 $f->{doi} =~ s{DOI\s+}{} if $f->{doi}; # strip DOI prefix
73 dpavlin 901 return $f;
74     },
75     };
76    
77 dpavlin 898 sub new {
78     my $class = shift;
79     my $self = {@_};
80     bless($self, $class);
81    
82     my $arg = {@_};
83    
84 dpavlin 1311 open( my $fh, '<', $arg->{path} ) || confess "can't open $arg->{path}: $!";
85 dpavlin 898
86     my ( $format, $version );
87    
88     my $line = <$fh>;
89     chomp($line);
90     if ( $line =~ /^FN\s(.+)$/) {
91     $format = $1;
92     } else {
93 dpavlin 1311 die "first line of $arg->{path} has to be FN, but is: $line";
94 dpavlin 898 }
95    
96     $line = <$fh>;
97     chomp($line);
98     if ( $line =~ /^VR\s(.+)$/) {
99     $version = $1;
100     } else {
101 dpavlin 1311 die "second line of $arg->{path} has to be VN, but is: $line";
102 dpavlin 898 }
103    
104 dpavlin 1311 warn "I: $arg->{path} $format $version - generating record offsets\n";
105 dpavlin 898
106 dpavlin 1311 $self->{fh} = $fh;
107     $self->{record_offset} = [];
108 dpavlin 1315 $self->{offset} ||= 0;
109 dpavlin 1311
110     while( $line = <$fh> ) {
111     chomp($line);
112     next unless $line eq 'ER';
113     push @{ $self->{record_offset} }, tell($fh);
114 dpavlin 1312 last if $self->{limit} && $#{ $self->{record_offset} } >= $self->{limit} - 1 + $self->{offset};
115 dpavlin 1311 }
116     push @{ $self->{record_offset} }, tell($fh); # end of file
117    
118     warn "I $arg->{path} read ", tell($fh), " bytes $#{ $self->{record_offset} } records\n";
119    
120     return $self;
121     }
122    
123    
124    
125     =head2 fetch_rec
126    
127     Return record with ID C<$mfn> from database
128    
129     my $rec = $input->fetch_rec( $mfn, $filter_coderef );
130    
131     =cut
132    
133     sub fetch_rec {
134     my ( $self, $mfn, $filter_coderef ) = @_;
135    
136     seek $self->{fh}, $self->{record_offset}->[ $mfn - 1 ], 0;
137    
138 dpavlin 899 my $tag;
139     my $rec;
140    
141 dpavlin 1311 my $fh = $self->{fh};
142 dpavlin 1077
143 dpavlin 1311 while( my $line = <$fh> ) {
144 dpavlin 899 chomp($line);
145     my $v;
146    
147 dpavlin 1303 if ( $line eq 'EF' ) {
148 dpavlin 1311 return;
149 dpavlin 899 } elsif ( $line eq 'ER' ) {
150 dpavlin 1303
151 dpavlin 1311 $line = <$fh>;
152     chomp $line;
153     die "expected blank like in ",$self->{path}, " +$.: $line" unless $line eq '';
154 dpavlin 1305
155 dpavlin 1311 # join tags
156     foreach ( qw/AB DE ID TI SO RP SC FU FX PA JI/ ) {
157     $rec->{$_} = join(' ', @{ $rec->{$_} }) if defined $rec->{$_};
158 dpavlin 901 }
159 dpavlin 1287
160 dpavlin 1311 # split on ;
161     foreach ( qw/ID SC DE/ ) {
162     $rec->{$_} = [ split(/;\s/, $rec->{$_}) ] if defined $rec->{$_};
163     }
164    
165     $rec->{'000'} = [ $mfn ];
166     warn "## mfn $mfn" if $debug;
167     return $rec;
168    
169 dpavlin 1303 } elsif ( $line =~ /^(\S\S)\s(.+)$/ ) {
170 dpavlin 1305 $tag = $1;
171     $v = $2;
172 dpavlin 1303 } elsif ( $line =~ /^\s{3}(.+)$/ ) {
173 dpavlin 1305 $v = $1;
174     if ( $tag eq 'CR' && $v =~ m{DOI$} ) {
175     my $doi = <$fh>;
176     chomp($doi);
177     $doi =~ s{^\s{3}}{ } || die "can't find DOI in: $doi";
178     $v .= $doi;
179     }
180 dpavlin 1302 } elsif ( $line =~ m{^(\S\S)\s*$} ) {
181 dpavlin 1311 warn "# $self->{path} +$. empty |$line|\n";
182     } elsif ( $line ne '' ) {
183     warn "E: $self->{path} +$ | can't parse |$line|";
184 dpavlin 899 }
185    
186 dpavlin 904 if ( defined $v ) {
187     $v = $subfields->{$tag}->($v) if defined $subfields->{$tag};
188 dpavlin 901
189 dpavlin 1311 warn "## $tag: ", sub { dump( $v ) } if $debug;
190 dpavlin 904 push @{ $rec->{$tag} }, $v;
191     }
192 dpavlin 899 }
193    
194 dpavlin 1311 warn "can't get full record $mfn got ", dump $rec;
195     return $rec;
196 dpavlin 898 }
197    
198    
199     =head2 size
200    
201     Return number of records in database
202    
203     my $size = $input->size;
204    
205     =cut
206    
207     sub size {
208     my $self = shift;
209 dpavlin 1315 my $size = $#{ $self->{record_offset} };
210     return 0 if $size < 0;
211 dpavlin 1311 # no need for +1 since we record end of file as last record
212 dpavlin 1315 return $size - $self->{offset};
213 dpavlin 898 }
214    
215 dpavlin 900
216 dpavlin 898 =head1 AUTHOR
217    
218     Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
219    
220     =head1 COPYRIGHT & LICENSE
221    
222 dpavlin 1311 Copyright 2009 Dobrica Pavlinusic, All Rights Reserved.
223 dpavlin 898
224     This program is free software; you can redistribute it and/or modify it
225     under the same terms as Perl itself.
226    
227     =cut
228    
229     1; # End of WebPAC::Input::ISI

  ViewVC Help
Powered by ViewVC 1.1.26