/[MARC-Fast]/trunk/lib/MARC/Fast.pm

This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!

Diff of /trunk/lib/MARC/Fast.pm

Parent Directory | Revision Log | View Patch Patch

-revision 5 by dpavlin,
Sat Oct  8 16:33:09 2005 UTC
+revision 36 by dpavlin,
Thu Sep 23 12:24:38 2010 UTC
 Line 1
  package MARC::Fast;
  use strict;
  use Carp;
- use Data::Dumper;
+ use Data::Dump qw/dump/;
  BEGIN {
          use Exporter ();
          use vars qw ($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
-         $VERSION     = 0.02;
+         $VERSION     = 0.10;
          @ISA         = qw (Exporter);
          #Give a hoot don't pollute, do not export more than needed by default
          @EXPORT      = qw ();
 Line 23 
 MARC::Fast - Very fast implementation of
    use MARC::Fast;
+   my $marc = new MARC::Fast(
+         marcdb => 'unimarc.iso',
+   );
+   foreach my $mfn ( 1 .. $marc->count ) {
+         print $marc->to_ascii( $mfn );
+   }
+ For longer example with command line options look at L<scripts/dump_fastmarc.pl>
  =head1 DESCRIPTION
  This is very fast alternative to C<MARC> and C<MARC::Record> modules.
- It's is also very sutable for random access to MARC records (as opposed to
+ It's is also very subtable for random access to MARC records (as opposed to
  sequential one).
  =head1 METHODS
-Line 42 
 Read MARC database
+Line 51 
 Read MARC database
          quiet => 0,
          debug => 0,
          assert => 0,
+         hash_filter => sub {
+                 my ($t, $record_number) = @_;
+                 $t =~ s/foo/bar/;
+                 return $t;
+         },
    );
  =cut
-Line 59 
 sub new {
+Line 73 
 sub new {
          print STDERR "# opening ",$self->{marcdb},"\n" if ($self->{debug});
          open($self->{fh}, $self->{marcdb}) || croak "can't open ",$self->{marcdb},": $!";
+         binmode($self->{fh});
          $self->{count} = 0;
-Line 69 
 sub new {
+Line 84 
 sub new {
                  push @{$self->{fh_offset}}, tell($self->{fh});
                  my $leader;
-                 read($self->{fh}, $leader, 24);
+                 my $len = read($self->{fh}, $leader, 24);
+                 if ($len < 24) {
+                         warn "short read of leader, aborting\n";
+                         $self->{count}--;
+                         last;
+                 }
                  # Byte        Name
                  # ----        ----
-Line 100 
 sub new {
+Line 121 
 sub new {
                  print STDERR "REC ",$self->{count},": $leader\n" if ($self->{debug});
                  # store leader for later
-                 push @{$self->{leaders}}, $leader;
+                 push @{$self->{leader}}, $leader;
                  # skip to next record
-                 seek($self->{fh},substr($leader,0,5)-24,1);
+                 my $o = substr($leader,0,5);
+                 warn "# in record ", $self->{count}," record length isn't number but: ",dump($o),"\n" unless $o =~ m/^\d+$/;
+                 if ($o > 24) {
+                         seek($self->{fh},$o-24,1) if ($o);
+                 } else {
+                         last;
+                 }
          }
-Line 129 
 Fetch record from database
+Line 156 
 Fetch record from database
    my $hash = $marc->fetch(42);
+ First record number is C<1>
  =cut
  sub fetch {
          my $self = shift;
-         my $rec_nr = shift || return;
+         my $rec_nr = shift;
+         if ( ! $rec_nr ) {
+                 $self->{last_leader} = undef;
+                 return;
+         }
-         my $leader = $self->{leaders}->[$rec_nr - 1];
+         my $leader = $self->{leader}->[$rec_nr - 1];
+         $self->{last_leader} = $leader;
          unless ($leader) {
                  carp "can't find record $rec_nr";
                  return;
-Line 196 
 sub fetch {
+Line 231 
 sub fetch {
                  my $f = substr($fields,$addr,$len);
                  print STDERR "tag/len/addr $tag [$len] $addr: '$f'\n" if ($self->{debug});
-                 if ($row->{$tag}) {
+                 push @{ $row->{$tag} }, $f;
-                         $row->{$tag} .= $f;
-                 } else {
-                         $row->{$tag} = $f;
-                 }
                  my $del = substr($fields,$addr+$len-1,1);
-Line 221 
 sub fetch {
+Line 252 
 sub fetch {
          return $row;
  }
+ =head2 last_leader
+ Returns leader of last record L<fetch>ed
+   print $marc->last_leader;
+ Added in version 0.08 of this module, so if you need it use:
+   use MARC::Fast 0.08;
+ to be sure that it's supported.
+ =cut
+ sub last_leader {
+         my $self = shift;
+         return $self->{last_leader};
+ }
+ =head2 to_hash
+ Read record with specified MFN and convert it to hash
+   my $hash = $marc->to_hash( $mfn, include_subfields => 1, );
+ It has ability to convert characters (using C<hash_filter>) from MARC
+ database before creating structures enabling character re-mapping or quick
+ fix-up of data.
+ This function returns hash which is like this:
+   '200' => [
+              {
+                'i1' => '1',
+                'i2' => ' '
+                'a' => 'Goa',
+                'f' => 'Valdo D\'Arienzo',
+                'e' => 'tipografie e tipografi nel XVI secolo',
+              }
+            ],
+ This method will also create additional field C<000> with MFN.
+ =cut
+ sub to_hash {
+         my $self = shift;
+         my $mfn = shift || confess "need mfn!";
+         my $args = {@_};
+         # init record to include MFN as field 000
+         my $rec = { '000' => [ $mfn ] };
+         my $row = $self->fetch($mfn) || return;
+         foreach my $tag (keys %{$row}) {
+                 foreach my $l (@{$row->{$tag}}) {
+                         # remove end marker
+                         $l =~ s/\x1E$//;
+                         # filter output
+                         $l = $self->{'hash_filter'}->($l, $tag) if ($self->{'hash_filter'});
+                         my $val;
+                         # has identifiers?
+                         ($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\x1F/\x1F/);
+                         my $sf_usage;
+                         my @subfields;
+                         # has subfields?
+                         if ($l =~ m/\x1F/) {
+                                 foreach my $t (split(/\x1F/,$l)) {
+                                         next if (! $t);
+                                         my $f = substr($t,0,1);
+                                         push @subfields, ( $f, $sf_usage->{$f}++ || 0 );
+                                         # repeatable subfiled -- convert it to array
+                                         if ($val->{$f}) {
+                                                 if ( ref($val->{$f}) ne 'ARRAY' ) {
+                                                         $val->{$f} = [ $val->{$f}, $val ];
+                                                 } else {
+                                                         push @{$val->{$f}}, $val;
+                                                 }
+                                         }
+                                         $val->{substr($t,0,1)} = substr($t,1);
+                                 }
+                                 $val->{subfields} = [ @subfields ] if $args->{include_subfields};
+                         } else {
+                                 $val = $l;
+                         }
+                         push @{$rec->{$tag}}, $val;
+                 }
+         }
+         return $rec;
+ }
+ =head2 to_ascii
+   print $marc->to_ascii( 42 );
+ =cut
+ sub to_ascii {
+         my $self = shift;
+         my $mfn = shift || confess "need mfn";
+         my $row = $self->fetch($mfn) || return;
+         my $out;
+         foreach my $f (sort keys %{$row}) {
+                 my $dump = join('', @{ $row->{$f} });
+                 $dump =~ s/\x1e$//;
+                 $dump =~ s/\x1f/\$/g;
+                 $out .= "$f\t$dump\n";
+         }
+         return $out;
+ }
 ;
  __END__
- =head1 BUGS
+ =head1 UTF-8 ENCODING
+ This module does nothing with encoding. But, since MARC format is byte
+ oriented even when using UTF-8 which has variable number of bytes for each
+ character, file is opened in binary mode.
+ As a result, all scalars recturned to perl don't have utf-8 flag. Solution is
+ to use C<hash_filter> and L<Encode> to decode utf-8 encoding like this:
- =head1 SUPPORT
+   use Encode;
+   my $marc = new MARC::Fast(
+         marcdb => 'utf8.marc',
+         hash_filter => sub {
+                 Encode::decode( 'utf-8', $_[0] );
+         },
+   );
+ This will affect C<to_hash>, but C<fetch> will still return binary representation
+ since it doesn't support C<hash_filter>.
  =head1 AUTHOR
-Line 250 
 LICENSE file included with this module.
+Line 424 
 LICENSE file included with this module.
  =head1 SEE ALSO
- perl(1).
+ L<Biblio::Isis>, perl(1).
  =cut

 Legend:



Removed from v.5
 


changed lines


 
Added in v.36
 Legend:



Removed from v.5
 


changed lines


 
Added in v.36
-Removed from v.5
+Added in v.36

	ViewVC Help
Powered by ViewVC 1.1.26