--- trunk/lib/WebPAC/Input.pm 2005/06/25 20:23:23 1 +++ trunk/lib/WebPAC/Input.pm 2005/12/18 22:16:44 289 @@ -3,63 +3,399 @@ use warnings; use strict; +use blib; + +use WebPAC::Common; +use base qw/WebPAC::Common/; +use Text::Iconv; + =head1 NAME -WebPAC::Input - The great new WebPAC::Input! +WebPAC::Input - read different file formats into WebPAC =head1 VERSION -Version 0.01 +Version 0.03 =cut -our $VERSION = '0.01'; +our $VERSION = '0.03'; =head1 SYNOPSIS -Quick summary of what the module does. +This module implements input as database which has fixed and known +I<size> while indexing and single unique numeric identifier for database +position ranging from 1 to I<size>. + +Simply, something that is indexed by number from 1 .. I<size>. + +Examples of such databases are CDS/ISIS files, MARC files, lines in +text file, and so on. + +Specific file formats are implemented using low-level interface modules, +located in C<WebPAC::Input::*> namespace which export C<open_db>, +C<fetch_rec> and optional C<init> functions. Perhaps a little code snippet. use WebPAC::Input; - my $foo = WebPAC::Input->new(); - ... + my $db = WebPAC::Input->new( + module => 'WebPAC::Input::ISIS', + config => $config, + lookup => $lookup_obj, + low_mem => 1, + ); + + $db->open( path => '/path/to/database' ); + print "database size: ",$db->size,"\n"; + while (my $rec = $db->fetch) { + } -=head1 EXPORT -A list of functions that can be exported. You can delete this section -if you don't export anything, such as for a purely object-oriented module. =head1 FUNCTIONS -=head2 function1 +=head2 new + +Create new input database object. + + my $db = new WebPAC::Input( + module => 'WebPAC::Input::MARC', + code_page => 'ISO-8859-2', + low_mem => 1, + ); + +C<module> is low-level file format module. See L<WebPAC::Input::ISIS> and +L<WebPAC::Input::MARC>. + +Optional parameter C<code_page> specifies application code page (which will be +used internally). This should probably be your terminal encoding, and by +default, it is C<ISO-8859-2>. + +Default is not to use C<low_mem> option (see L</MEMORY USAGE> below). 
=pod

This function will also call low-level C<init> if it exists with same
parameters.

Returns newly created object. Missing C<module> parameter is fatal (it's
reported through logger).

=cut

sub new {
    my $class = shift;
    my $self = {@_};
    bless($self, $class);

    my $log = $self->_get_logger;

    $log->logconfess("specify low-level file format module") unless ($self->{module});

    # translate Package::Name into Package/Name.pm which require expects
    my $module = $self->{module};
    $module =~ s#::#/#g;
    $module .= '.pm';
    $log->debug("require low-level module $self->{module} from $module");

    require $module;
    # low-level functions are taken by fully qualified name below, so
    # there is no need to import anything from module

    # collect low-level functions: open_db and fetch_rec are needed by
    # open(), while init is optional and its absence isn't worth warning
    foreach my $subclass (qw/open_db fetch_rec init/) {
        my $n = $self->{module} . '::' . $subclass;
        if (defined &{ $n }) {
            $self->{$subclass} = \&{ $n };
        } elsif ($subclass eq 'init') {
            $log->debug("low-level module doesn't implement optional $subclass");
        } else {
            $log->logwarn("missing implementation of $subclass");
        }
    }

    if ($self->{init}) {
        $log->debug("calling init");
        # low-level init gets same arguments as this constructor
        $self->{init}->($self, @_);
    }

    $self->{'code_page'} ||= 'ISO-8859-2';

    # running with low_mem flag? well, use DBM::Deep then.
    if ($self->{'low_mem'}) {
        $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");

        my $db_file = "data.db";

        # always start with empty on-disk store
        if (-e $db_file) {
            unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
            $log->debug("removed '$db_file' from last run");
        }

        # loaded on demand, so it's required only when low_mem is used
        require DBM::Deep;

        my $db = DBM::Deep->new($db_file);

        $log->logdie("DBM::Deep error: $!") unless ($db);

        if ($db->error()) {
            $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
        } else {
            $log->debug("using file '$db_file' for DBM::Deep");
        }

        $self->{'db'} = $db;
    }

    return $self;
}

=head2 open

This function will read whole database in memory and produce lookups.

 $input->open(
    path => '/path/to/database/file',
    code_page => '852',
    limit => 500,
    offset => 6000,
    lookup => $lookup_obj,
 );

By default, C<code_page> is assumed to be C<852>.

C<offset> is optional parameter to position at some offset before reading from database.
=pod

C<limit> is optional parameter to read just C<limit> records from database

C<lookup> is optional object whose C<add> method is called for every
record read. If it isn't specified, one given to C<new> is used.

Returns size of database, regardless of C<offset> and C<limit>
parameters, see also C<size>.

=cut

sub open {
    my $self = shift;
    my $arg = {@_};

    my $log = $self->_get_logger();

    $log->logcroak("need path") if (! $arg->{'path'});
    my $code_page = $arg->{'code_page'} || '852';

    # Input encoding is kept under its own key. $self->{code_page} is
    # application code page from new() and target of conversion below,
    # so it must not be overwritten with input code page.
    $self->{'input_code_page'} = $code_page;
    # without application code page, fall back to no conversion at all
    $self->{'code_page'} ||= $code_page;

    foreach my $v (qw/path offset limit/) {
        $self->{$v} = $arg->{$v} if ($arg->{$v});
    }

    # lookup can be given to this call or to constructor
    my $lookup = $arg->{'lookup'} || $self->{'lookup'};

    # create Text::Iconv object (input code page -> application code page)
    $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});

    my ($db, $size) = $self->{open_db}->( $self,
        path => $arg->{path},
    );

    unless ($db) {
        $log->logwarn("can't open database $arg->{path}, skipping...");
        return;
    }

    unless ($size) {
        $log->logwarn("no records in database $arg->{path}, skipping...");
        return;
    }

    # window of records to read is [$offset, $limit], both inclusive
    my $offset = 1;
    my $limit = $size;

    if (my $s = $self->{offset}) {
        $log->info("skipping to MFN $s");
        $offset = $s;
    } else {
        $self->{offset} = $offset;
    }

    if ($self->{limit}) {
        $log->info("limiting to ",$self->{limit}," records");
        $limit = $offset + $self->{limit} - 1;
        $limit = $size if ($limit > $size);
    }

    # store size for later; window with single record has size 1 and
    # offset beyond end of database gives empty window
    $self->{size} = ($limit >= $offset) ? ($limit - $offset + 1) : 0;

    $log->info("processing $self->{size} records in $code_page, convert to $self->{code_page}");

    # read database
    for (my $pos = $offset; $pos <= $limit; $pos++) {

        $log->debug("position: $pos\n");

        my $rec = $self->{fetch_rec}->($self, $db, $pos );

        if (! $rec) {
            $log->warn("record $pos empty? skipping...");
            next;
        }

        # store
        if ($self->{low_mem}) {
            $self->{db}->put($pos, $rec);
        } else {
            $self->{data}->{$pos} = $rec;
        }

        # create lookup
        $lookup->add( $rec ) if ($lookup);

        $self->progress_bar($pos,$limit);

    }

    # -1 marks "nothing fetched yet", fetch will start at offset
    $self->{pos} = -1;
    $self->{last_pcnt} = 0;

    # store max mfn and return it.
    $self->{max_pos} = $limit;
    $log->debug("max_pos: $limit");

    return $size;
}

=head2 fetch

Fetch next record from database. It will also display progress bar.

 my $rec = $isis->fetch;

Record from this function should probably be passed on for
normalisation.

Returns nothing when there are no more records.

=cut

sub fetch {
    my $self = shift;

    my $log = $self->_get_logger();

    $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});

    # advance cursor; first fetch after open starts at offset
    if ($self->{pos} == -1) {
        $self->{pos} = $self->{offset};
    } else {
        $self->{pos}++;
    }

    my $mfn = $self->{pos};

    if ($mfn > $self->{max_pos}) {
        # stay on last record, so repeated calls keep reporting EOF
        $self->{pos} = $self->{max_pos};
        $log->debug("at EOF");
        return;
    }

    $self->progress_bar($mfn,$self->{max_pos});

    my $rec;

    if ($self->{low_mem}) {
        $rec = $self->{db}->get($mfn);
    } else {
        $rec = $self->{data}->{$mfn};
    }

    # NOTE(review): bare 0E0 is numeric zero, so record which was skipped
    # in open comes back as false value and ends while (... = fetch)
    # loops early. Presumably string '0E0' (zero but true) was intended;
    # confirm with callers before changing it.
    return $rec || 0E0;
}

=head2 pos

Returns current record number (MFN).

 print $isis->pos;

First record in database has position 1.

=cut

sub pos {
    my $self = shift;
    return $self->{pos};
}


=head2 size

Returns number of records in database

 print $isis->size;

Result from this function can be used to loop through all records

 foreach my $mfn ( 1 ... $isis->size ) { ... }

because it takes into account C<offset> and C<limit>.

=cut

sub size {
    my $self = shift;
    return $self->{size};
}

=head2 seek

Seek to specified MFN in file.

 $isis->seek(42);

First record in database has position 1.
=pod

Position outside of loaded records is moved to nearest loaded record
with a warning. Record at that position will be returned by next call
to C<fetch>.

Returns true value, or nothing if position isn't specified.

=cut

sub seek {
    my $self = shift;
    my $pos = shift || return;

    my $log = $self->_get_logger();

    # first loaded record is at offset (1 if offset wasn't used), so
    # positions before it have no data
    my $first = $self->{offset} || 1;

    if ($pos < $first) {
        $log->warn("seek before first record");
        $pos = $first;
    } elsif ($pos > $self->{max_pos}) {
        $log->warn("seek beyond last record");
        $pos = $self->{max_pos};
    }

    # fetch advances position before reading, so stop one record
    # earlier. 0 can't be used because fetch treats false position as
    # database which isn't loaded, so -1 (start from offset) is stored.
    return $self->{pos} = (($pos - 1) || -1);
}


=head1 MEMORY USAGE

C<low_mem> option is double-edged sword. If enabled, WebPAC
will run on memory constrained machines (which don't have enough
physical RAM to create memory structure for whole source database).

If your machine has 512Mb or more of RAM and database is around 10000 records,
memory shouldn't be an issue. If you don't have enough physical RAM, you
might consider using virtual memory (if your operating system is handling it
well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
parsed structure of ISIS database (this is what C<low_mem> option does).

Hitting swap at end of reading source database is probably o.k. However,
hitting swap before 90% will dramatically decrease performance and you will
be better off with C<low_mem> and using rest of available memory for
operating system disk cache (Linux is particularly good about this).
However, every access to database record will require disk access, so
generation phase will be slower 10-100 times.

Parsed structures are essential - you just have option to trade RAM memory
(which is fast) for disk space (which is slow). Be sure to have plenty of
disk space if you are using C<low_mem> and thus L<DBM::Deep>.

However, when WebPAC is running on desktop machines (or laptops :-), it's
highly undesirable for system to start swapping. Using C<low_mem> option can
reduce WebPAC memory usage to around 64Mb for same database with lookup
fields and sorted indexes which stay in RAM. Performance will suffer, but
memory usage will really be minimal. It might be also more comfortable to
run WebPAC reniced on those machines.
+ + +=head1 AUTHOR + +Dobrica Pavlinusic, C<< >> =head1 COPYRIGHT & LICENSE