/[webpac2]/branches/Sack/lib/WebPAC/Input/ISI.pm
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /branches/Sack/lib/WebPAC/Input/ISI.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1305 - (show annotations)
Sun Sep 20 22:26:27 2009 UTC (14 years, 7 months ago) by dpavlin
Original Path: trunk/lib/WebPAC/Input/ISI.pm
File size: 4311 byte(s)
implement experimental (and probably broken) low-level offset and limit

1 package WebPAC::Input::ISI;
2
3 use warnings;
4 use strict;
5
6 use WebPAC::Input;
7 use base qw/WebPAC::Common/;
8
9 use Data::Dump qw/dump/;
10
11 =head1 NAME
12
13 WebPAC::Input::ISI - support for ISI Export Format
14
15 =cut
16
17 our $VERSION = '0.03';
18
19 =head1 SYNOPSIS
20
21 Open file in ISI export fromat
22
23 my $input = new WebPAC::Input::ISI(
24 path => '/path/to/ISI/records.txt',
25 );
26
27 =head1 FUNCTIONS
28
29 =head2 new
30
31 Returns new low-level input API object
32
33 my $input = new WebPAC::Input::ISI(
34 path => '/path/to/ISI/records.txt'
35 filter => sub {
36 my ($l,$field_nr) = @_;
37 # do something with $l which is line of input file
38 return $l;
39 },
40 }
41
42 Options:
43
44 =over 4
45
46 =item path
47
48 path to ISI export file
49
50 =back
51
52 =cut
53
54 my $subfields = {
55 'CR' => sub {
56 my $full_cr = shift;
57 my @v = split(/, /, $full_cr);
58 my $f = { full => $full_cr };
59 foreach ( qw/author year reference volume page doi/ ) {
60 if ( my $tmp = shift @v ) {
61 $f->{$_} = $tmp;
62 }
63 }
64 if ( $f->{author} =~ /^\*(.+)/ ) {
65 delete $f->{author};
66 $f->{institution} = $1;
67 }
68 $f->{doi} =~ s{DOI\s+}{} if $f->{doi}; # strip DOI prefix
69 return $f;
70 },
71 };
72
73 sub new {
74 my $class = shift;
75 my $self = {@_};
76 bless($self, $class);
77
78 my $arg = {@_};
79
80 my $log = $self->_get_logger();
81
82 open( my $fh, '<', $arg->{path} ) || $log->logconfess("can't open $arg->{path}: $!");
83
84 my ( $format, $version );
85
86 my $line = <$fh>;
87 chomp($line);
88 if ( $line =~ /^FN\s(.+)$/) {
89 $format = $1;
90 } else {
91 $log->logdie("first line of $arg->{path} has to be FN, but is: $line");
92 }
93
94 $line = <$fh>;
95 chomp($line);
96 if ( $line =~ /^VR\s(.+)$/) {
97 $version = $1;
98 } else {
99 $log->logdie("second line of $arg->{path} has to be VN, but is: $line");
100 }
101
102 $log->info("opening $format $version database '$arg->{path}'");
103
104 my $tag;
105 my $rec;
106
107 my $offset = $self->{offset} || 0;
108 my $limit = $self->{limit} || 0;
109
110 my $file_pos = 0;
111 my $end_pos = 0;
112 $end_pos = $offset + $limit if $limit;
113
114 $self->{_rec} = [];
115
116 warn "# offset: $offset limit: $limit end: $end_pos";
117
118 while( $line = <$fh> ) {
119 chomp($line);
120 my $v;
121
122 if ( $line eq 'EF' ) {
123 last;
124 } elsif ( $line eq 'ER' ) {
125 $file_pos++;
126 last if $end_pos && $file_pos > $end_pos;
127
128 if ( ! $offset || $file_pos > $offset ) {
129
130 # join tags
131 foreach ( qw/AB DE ID TI SO RP SC FU FX PA JI/ ) {
132 $rec->{$_} = join(' ', @{ $rec->{$_} }) if defined $rec->{$_};
133 }
134 # split on ;
135 foreach ( qw/ID SC DE/ ) {
136 $rec->{$_} = [ split(/;\s/, $rec->{$_}) ] if defined $rec->{$_};
137 }
138 $rec->{'000'} = [ $file_pos ];
139 push @{ $self->{_rec} }, $rec;
140 }
141
142 $rec = {};
143 $line = <$fh>;
144 chomp $line;
145 $log->logdie("expected blank like in ",$arg->{path}, " +$.: $line") unless ( $line eq '' );
146 } elsif ( $offset && $file_pos < $offset ) {
147 next;
148 } elsif ( $line =~ /^(\S\S)\s(.+)$/ ) {
149 $tag = $1;
150 $v = $2;
151 } elsif ( $line =~ /^\s{3}(.+)$/ ) {
152 $v = $1;
153 if ( $tag eq 'CR' && $v =~ m{DOI$} ) {
154 my $doi = <$fh>;
155 chomp($doi);
156 $doi =~ s{^\s{3}}{ } || die "can't find DOI in: $doi";
157 $v .= $doi;
158 }
159 } elsif ( $line =~ m{^(\S\S)\s*$} ) {
160 warn "# $arg->{path} +$. empty |$line|\n";
161 } else {
162 $log->logdie("can't parse +$. $arg->{path} |$line|");
163 }
164
165 if ( defined $v ) {
166 $v = $subfields->{$tag}->($v) if defined $subfields->{$tag};
167
168 $log->debug("$tag: ", sub { dump( $v ) });
169 push @{ $rec->{$tag} }, $v;
170 }
171
172 }
173 $log->debug("loaded ", $self->size, " records");
174
175 $self ? return $self : return undef;
176 }
177
178 =head2 fetch_rec
179
180 Return record with ID C<$mfn> from database
181
182 my $rec = $input->fetch_rec( $mfn, $filter_coderef );
183
184 =cut
185
186 sub fetch_rec {
187 my $self = shift;
188
189 my ( $mfn, $filter_coderef ) = @_;
190
191 $mfn -= $self->{offset} if $self->{offset};
192 return $self->{_rec}->[$mfn-1];
193 }
194
195
196 =head2 size
197
198 Return number of records in database
199
200 my $size = $input->size;
201
202 =cut
203
204 sub size {
205 my $self = shift;
206 $#{ $self->{_rec} } + 1;
207 }
208
209 =head1 SEE ALSO
210
211 L<http://isibasic.com/help/helpprn.html> is only sane source of document format which Google could find...
212
213 =head1 AUTHOR
214
215 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
216
217 =head1 COPYRIGHT & LICENSE
218
219 Copyright 2007 Dobrica Pavlinusic, All Rights Reserved.
220
221 This program is free software; you can redistribute it and/or modify it
222 under the same terms as Perl itself.
223
224 =cut
225
226 1; # End of WebPAC::Input::ISI

  ViewVC Help
Powered by ViewVC 1.1.26