/[webpac2]/branches/Sack/lib/WebPAC/Input/ISI.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /branches/Sack/lib/WebPAC/Input/ISI.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1315 - (show annotations)
Sun Oct 4 12:32:19 2009 UTC (14 years, 7 months ago) by dpavlin
File size: 4339 byte(s)
fix warning, return correct size
1 package WebPAC::Input::ISI;
2
3 use warnings;
4 use strict;
5
6 use WebPAC::Input;
7 use base qw/WebPAC::Common/;
8
9 use Data::Dump qw/dump/;
10 use Carp qw/confess/;
11
12 =head1 NAME
13
14 WebPAC::Input::ISI - support for ISI Export Format
15
16 =cut
17
18 our $VERSION = '0.04';
19
20 our $debug = 0;
21
22
23 =head1 SYNOPSIS
24
25 Open file in ISI export fromat
26
27 my $input = new WebPAC::Input::ISI(
28 path => '/path/to/ISI/records.txt',
29 );
30
31 =head1 FUNCTIONS
32
33 =head2 new
34
35 Returns new low-level input API object
36
37 my $input = new WebPAC::Input::ISI(
38 path => '/path/to/ISI/records.txt'
39 filter => sub {
40 my ($l,$field_nr) = @_;
41 # do something with $l which is line of input file
42 return $l;
43 },
44 }
45
46 Options:
47
48 =over 4
49
50 =item path
51
52 path to ISI export file
53
54 =back
55
56 =cut
57
58 our $subfields = {
59 'CR' => sub {
60 my $full_cr = shift;
61 my @v = split(/, /, $full_cr);
62 my $f = { full => $full_cr };
63 foreach ( qw/author year reference volume page doi/ ) {
64 if ( my $tmp = shift @v ) {
65 $f->{$_} = $tmp;
66 }
67 }
68 if ( $f->{author} =~ /^\*(.+)/ ) {
69 delete $f->{author};
70 $f->{institution} = $1;
71 }
72 $f->{doi} =~ s{DOI\s+}{} if $f->{doi}; # strip DOI prefix
73 return $f;
74 },
75 };
76
77 sub new {
78 my $class = shift;
79 my $self = {@_};
80 bless($self, $class);
81
82 my $arg = {@_};
83
84 open( my $fh, '<', $arg->{path} ) || confess "can't open $arg->{path}: $!";
85
86 my ( $format, $version );
87
88 my $line = <$fh>;
89 chomp($line);
90 if ( $line =~ /^FN\s(.+)$/) {
91 $format = $1;
92 } else {
93 die "first line of $arg->{path} has to be FN, but is: $line";
94 }
95
96 $line = <$fh>;
97 chomp($line);
98 if ( $line =~ /^VR\s(.+)$/) {
99 $version = $1;
100 } else {
101 die "second line of $arg->{path} has to be VN, but is: $line";
102 }
103
104 warn "I: $arg->{path} $format $version - generating record offsets\n";
105
106 $self->{fh} = $fh;
107 $self->{record_offset} = [];
108 $self->{offset} ||= 0;
109
110 while( $line = <$fh> ) {
111 chomp($line);
112 next unless $line eq 'ER';
113 push @{ $self->{record_offset} }, tell($fh);
114 last if $self->{limit} && $#{ $self->{record_offset} } >= $self->{limit} - 1 + $self->{offset};
115 }
116 push @{ $self->{record_offset} }, tell($fh); # end of file
117
118 warn "I $arg->{path} read ", tell($fh), " bytes $#{ $self->{record_offset} } records\n";
119
120 return $self;
121 }
122
123
124
125 =head2 fetch_rec
126
127 Return record with ID C<$mfn> from database
128
129 my $rec = $input->fetch_rec( $mfn, $filter_coderef );
130
131 =cut
132
133 sub fetch_rec {
134 my ( $self, $mfn, $filter_coderef ) = @_;
135
136 seek $self->{fh}, $self->{record_offset}->[ $mfn - 1 ], 0;
137
138 my $tag;
139 my $rec;
140
141 my $fh = $self->{fh};
142
143 while( my $line = <$fh> ) {
144 chomp($line);
145 my $v;
146
147 if ( $line eq 'EF' ) {
148 return;
149 } elsif ( $line eq 'ER' ) {
150
151 $line = <$fh>;
152 chomp $line;
153 die "expected blank like in ",$self->{path}, " +$.: $line" unless $line eq '';
154
155 # join tags
156 foreach ( qw/AB DE ID TI SO RP SC FU FX PA JI/ ) {
157 $rec->{$_} = join(' ', @{ $rec->{$_} }) if defined $rec->{$_};
158 }
159
160 # split on ;
161 foreach ( qw/ID SC DE/ ) {
162 $rec->{$_} = [ split(/;\s/, $rec->{$_}) ] if defined $rec->{$_};
163 }
164
165 $rec->{'000'} = [ $mfn ];
166 warn "## mfn $mfn" if $debug;
167 return $rec;
168
169 } elsif ( $line =~ /^(\S\S)\s(.+)$/ ) {
170 $tag = $1;
171 $v = $2;
172 } elsif ( $line =~ /^\s{3}(.+)$/ ) {
173 $v = $1;
174 if ( $tag eq 'CR' && $v =~ m{DOI$} ) {
175 my $doi = <$fh>;
176 chomp($doi);
177 $doi =~ s{^\s{3}}{ } || die "can't find DOI in: $doi";
178 $v .= $doi;
179 }
180 } elsif ( $line =~ m{^(\S\S)\s*$} ) {
181 warn "# $self->{path} +$. empty |$line|\n";
182 } elsif ( $line ne '' ) {
183 warn "E: $self->{path} +$ | can't parse |$line|";
184 }
185
186 if ( defined $v ) {
187 $v = $subfields->{$tag}->($v) if defined $subfields->{$tag};
188
189 warn "## $tag: ", sub { dump( $v ) } if $debug;
190 push @{ $rec->{$tag} }, $v;
191 }
192 }
193
194 warn "can't get full record $mfn got ", dump $rec;
195 return $rec;
196 }
197
198
199 =head2 size
200
201 Return number of records in database
202
203 my $size = $input->size;
204
205 =cut
206
207 sub size {
208 my $self = shift;
209 my $size = $#{ $self->{record_offset} };
210 return 0 if $size < 0;
211 # no need for +1 since we record end of file as last record
212 return $size - $self->{offset};
213 }
214
215
216 =head1 AUTHOR
217
218 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
219
220 =head1 COPYRIGHT & LICENSE
221
222 Copyright 2009 Dobrica Pavlinusic, All Rights Reserved.
223
224 This program is free software; you can redistribute it and/or modify it
225 under the same terms as Perl itself.
226
227 =cut
228
229 1; # End of WebPAC::Input::ISI

  ViewVC Help
Powered by ViewVC 1.1.26