/[Search-Estraier]/trunk/scripts/est-spider
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/scripts/est-spider

Parent Directory | Revision Log | View Patch

revision 179 by dpavlin, Fri Aug 25 11:59:04 2006 UTC revision 181 by dpavlin, Sat Aug 26 22:33:34 2006 UTC
# Line 61  my $start_t = time(); Line 61  my $start_t = time();
61  my $filter;  my $filter;
62  foreach my $f (qw/pdftotext pstotext/) {  foreach my $f (qw/pdftotext pstotext/) {
63          my $w = which($f);          my $w = which($f);
64          if ($f) {          if ($w) {
65                  $filter->{$f} = $w;                  $filter->{$f} = $w;
66                  print STDERR "using $f filter at $w\n" if ($verbose);                  print STDERR "using $f filter at $w\n" if ($verbose);
67          }          }
# Line 237  sub dump_contents { Line 237  sub dump_contents {
237    
238          if ($contents) {          if ($contents) {
239                  # html2text                  # html2text
240                    $contents =~ s#<script.*?</script>##gis;
241                  $contents =~ s#<[^>]+/*>##gs;                  $contents =~ s#<[^>]+/*>##gs;
242                  $contents =~ s#\s\s+# #gs;                  $contents =~ s#\s\s+# #gs;
243    

Legend:
Removed from v.179  
changed lines
  Added in v.181

  ViewVC Help
Powered by ViewVC 1.1.26