/[Search-Estraier]/trunk/scripts/est-spider
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /trunk/scripts/est-spider

Parent Directory | Revision Log | View Patch

revision 77 by dpavlin, Mon Jan 16 21:34:14 2006 UTC revision 89 by dpavlin, Wed Jan 25 23:38:57 2006 UTC
# Line 3  use strict; Line 3  use strict;
3  use File::Find;  use File::Find;
4  use Getopt::Long;  use Getopt::Long;
5  use File::Which;  use File::Which;
 use HyperEstraier;  
6  use Search::Estraier;  use Search::Estraier;
7  use Text::Iconv;  use Text::Iconv;
8  #use File::MMagic;  #use File::MMagic;
9  use File::MMagic::XS qw/:compat/;  use File::MMagic::XS qw/:compat/;
10    
 # do we use Node API?  
 my $node_url;  
   
11  my $collection;         # name which will be inserted  my $collection;         # name which will be inserted
12  my $path_add;           # add additional info in path  my $path_add;           # add additional info in path
13  my $verbose;  my $verbose;
# Line 20  my $exclude; Line 16  my $exclude;
16  #$verbose = 1;  #$verbose = 1;
17  my $debug = 0;  my $debug = 0;
18  my $force = 0;  my $force = 0;
 my $native = 0;  
19    
20  my $result = GetOptions(  my $result = GetOptions(
21          "collection=s" => \$collection,          "collection=s" => \$collection,
# Line 28  my $result = GetOptions( Line 23  my $result = GetOptions(
23          "verbose!" => \$verbose,          "verbose!" => \$verbose,
24          "debug!" => \$debug,          "debug!" => \$debug,
25          "exclude=s" => \$exclude,          "exclude=s" => \$exclude,
         "node=s" => \$node_url,  
26          "force!" => \$force,          "force!" => \$force,
         "native!" => \$native,  
27  );  );
28    
29  my $dir = shift @ARGV || die "usage: $0 [dir]";  my ($node_url,$dir) = @ARGV;
30    
31    die <<"_END_OF_USAGE_" if (! $node_url || ! $dir);
32    usage: $0 http://localhost:1978/node/my_dir /path/to/directory
33    
34    options:
35            --collection="name of collection"
36            --path=/path/to/add/at/end
37            --exclude=regex_to_exclude
38            --verbose
39            --force
40            --debug
41    _END_OF_USAGE_
42    
43  if (! -e $dir) {  if (! -e $dir) {
44          warn "directory $dir doesn't exist, skipping\n";          warn "directory $dir doesn't exist, skipping\n";
# Line 56  select(STDOUT); $|=1; Line 61  select(STDOUT); $|=1;
61    
62  print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose);  print STDERR "using $pdftotext to convert pdf into html\n" if ($pdftotext && $verbose);
63    
64  my $db;  my $db = new Search::Estraier::Node;
65  if ($node_url) {  $db->set_url($node_url);
66          if ($native) {  $db->set_auth('admin', 'admin');
                 $db = HyperEstraier::Node->new($node_url);  
         } else {  
                 $db = new Search::Estraier::Node;  
                 $db->set_url($node_url);  
         }  
         $db->set_auth('admin', 'admin');  
 } else {  
         # open the database  
         $db = HyperEstraier::Database->new();  
         $db->open('/tmp/casket', $HyperEstraier::Database::DBWRITER | $HyperEstraier::Database::DBCREAT);  
   
         sub signal {  
                 my($sig) = @_;  
                 print "\nCaught a SIG$sig--syncing database and shutting down\n";  
                 $db->sync();  
                 exit(0);  
         }  
   
         $SIG{'INT'}  = \&signal;  
         $SIG{'QUIT'} = \&signal;  
 }  
67    
68  find({ wanted => \&file,  find({ wanted => \&file,
69          follow => 1,          follow => 1,
# Line 87  find({ wanted => \&file, Line 71  find({ wanted => \&file,
71          no_chdir => 1,          no_chdir => 1,
72  }, $dir);  }, $dir);
73    
 unless ($node_url) {  
         print "--- sync\n";  
         $db->sync();  
74    
         print "--- optimize...\n";  
         $db->optimize(0);  
 }  
75  exit;  exit;
76    
77  sub dump_contents($$$$) {  sub dump_contents($$$$) {
# Line 112  sub dump_contents($$$$) { Line 90  sub dump_contents($$$$) {
90          print STDERR " [$size]" if ($verbose);          print STDERR " [$size]" if ($verbose);
91    
92          # create a document object          # create a document object
93          my $doc;          my $doc = new Search::Estraier::Document;
         if ($native) {  
                 $doc = HyperEstraier::Document->new;  
         } else {  
                 $doc = new Search::Estraier::Document;  
         }  
94    
95          my $title = $1 if ($contents =~ m#<title>(.+?)</title>#is);          my $title = $1 if ($contents =~ m#<title>(.+?)</title>#is);
96    
# Line 141  sub dump_contents($$$$) { Line 114  sub dump_contents($$$$) {
114  #       print $doc->dump_draft if ($verbose);  #       print $doc->dump_draft if ($verbose);
115    
116          # register the document object to the database          # register the document object to the database
117          if ($node_url) {          $db->put_doc($doc);
                 $db->put_doc($doc);  
         } else {  
                 $db->put_doc($doc, $HyperEstraier::Database::PDCLEAN);  
         }  
118    
119  }  }
120    

Legend:
Removed from v.77  
changed lines
  Added in v.89

  ViewVC Help
Powered by ViewVC 1.1.26