summaryrefslogtreecommitdiff
path: root/www/checkbot/patches/patch-aa
blob: 7e949a36a24dca638b11ec1096cb7c8462d74fb9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
$NetBSD: patch-aa,v 1.4 2001/08/09 12:35:36 abs Exp $

--- checkbot.pl.orig	Sun Apr 15 20:34:30 2001
+++ checkbot.pl
@@ -47,8 +47,9 @@
 
 checkbot [B<--debug>] [B<--help>] [B<--verbose>] [B<--url> start URL] 
          [B<--match> match string] [B<--exclude> exclude string]
-         [B<--proxy> proxy URL] [B<--internal-only>]
-         [B<--ignore> ignore string] [B<-file> file name]
+         [B<--skip> skip string] [B<--ignore> ignore string]
+         [B<--proxy> proxy URL] [B<--internal-only>] [B<--match-url-base>]
+         [B<--file> file name]
          [B<--style> style file URL]
          [B<--mailto> email address]
          [B<--note> note] [B<--sleep> seconds] [B<--timeout> timeout]
@@ -90,6 +91,11 @@
 
 The I<match string> can be a perl regular expression.
 
+=item --match-url-base
+
+This option causes checkbot to use the site component of each url when
+determining which pages are local.
+
 =item --exclude <exclude string>
 
 URLs matching the I<exclude string> are considered to be external,
@@ -97,6 +103,12 @@
 
 The I<exclude string> can be a perl regular expression.
 
+=item --skip <skip string>
+
+URLs matching the I<skip string> are not processed.
+
+The I<skip string> can be a perl regular expression.
+
 =item --ignore <ignore string>
 
 If a URL has an error, and matches the I<ignore string>, its error
@@ -262,7 +274,7 @@
 
   # Get command-line arguments
   use Getopt::Long;
-  my $result = GetOptions(qw(debug help verbose url=s match=s exclude|x=s file=s style=s ignore|z=s mailto|M=s note|N=s proxy=s internal-only sleep=i timeout=i interval=i dontwarn=s enable-virtual));
+  my $result = GetOptions(qw(debug help verbose url=s match=s exclude|x=s skip|x=s file=s style=s ignore|z=s mailto|M=s note|N=s proxy=s internal-only sleep=i timeout=i interval=i dontwarn=s enable-virtual match-url-base));
 
   # Handle arguments, some are mandatory, some have defaults
   &print_help if (($main::opt_help && $main::opt_help) 
@@ -273,6 +285,7 @@
   $main::opt_interval = 10800 unless defined $main::opt_interval and length $main::opt_interval;
   $main::opt_dontwarn = "xxx" unless defined $main::opt_dontwarn and length $main::opt_dontwarn;
   $main::opt_enable_virtual = 0 unless defined $main::opt_enable_virtual;
+  $main::opt_match_url_base = 0 unless defined $main::opt_match_url_base;
   # The default for opt_match will be set later, because we might want
   # to muck with opt_url first.
 
@@ -353,7 +366,11 @@
     my @matchurls;
     my $matchurl;
     foreach $matchurl (@starturls) {
-      push(@matchurls, quotemeta $matchurl);
+      $_ = $matchurl;
+      if ($main::opt_match_url_base && m#^(\w+://[^/]+/)#) {
+	$_ = $1;
+      }
+      push(@matchurls, quotemeta $_);
     }
     $main::opt_match = '(' . join('|', @matchurls) . ')';
     print STDERR "--match defaults to $main::opt_match\n" if $main::opt_verbose;
@@ -781,7 +798,9 @@
     print OUT "<tr><th align=left>--url</th><td>Start URL(s)</td><td>",
               join(',', @starturls), "</td></tr>\n";
     print OUT "<tr><th align=left>--match</th><td>Match regular expression</td><td>$main::opt_match</td></tr>\n";
+    print OUT "<tr><th align=left>--match-url-base</th><td>Match base of each url</td><td>$main::opt_match_url_base</td></tr>\n" if defined $main::opt_match_url_base;
     print OUT "<tr><th align=left>--exclude</th><td>Exclude regular expression</td><td>$main::opt_exclude</td></tr>\n" if defined $main::opt_exclude;
+    print OUT "<tr><th align=left>--skip</th><td>Skip regular expression</td><td>$main::opt_skip</td></tr>\n" if defined $main::opt_skip;
     print OUT "<tr><th align=left>--ignore</th><td>Ignore regular expression</td><td>$main::opt_ignore</td></tr>\n" if defined $main::opt_ignore;
     print OUT "<tr><th align=left>--dontwarn</th><td>Don't warn for these codes</td><td>$main::opt_dontwarn</td></tr>\n" if $main::opt_dontwarn ne 'xxx';
     print OUT "<tr><th align=left>--enable-virtual</th><td>Use virtual names only</td><td>yes</td></tr>\n" if $main::opt_enable_virtual;
@@ -923,7 +942,7 @@
 	  add_to_queue($url, $response->base);
 	  $doc_new++;
 	}
-      } else {
+      } elsif (!defined $main::opt_skip || $url !~ /$main::opt_skip/o) {
 	# Add this as an external link if we can check the protocol later
 	if ($url =~ /^(http|ftp):/o) {
 	  print EXTERNAL $url . "|" . $response->base . "\n";
@@ -947,8 +966,12 @@
 sub add_to_queue {
   my ($url, $parent) = @_;
 
-  print QUEUE $url . '|' . $parent . "\n";
-  $main::st_int[$main::TODO]++;
+  if (defined $main::opt_skip && $url =~ /$main::opt_skip/o) {
+    print STDERR "Skip    $url\n" if $main::opt_verbose;
+  } else {
+    print QUEUE $url . '|' . $parent . "\n";
+    $main::st_int[$main::TODO]++;
+  }
 }
 
 sub print_server {
@@ -1142,6 +1165,7 @@
   print "  --match match      Check pages only if URL matches `match'\n";
   print "                     If no match is given, the start URL is used as a match\n";
   print "  --exclude exclude  Exclude pages if the URL matches 'exclude'\n";
+  print "  --skip skip        Do not process pages if the URL matches 'skip'\n";
   print "  --ignore ignore    Do not list error messages for pages that the\n";
   print "                     URL matches 'ignore'\n";
   print "  --file file        Write results to file, default is checkbot.html\n";
@@ -1154,8 +1178,9 @@
   print "  --interval seconds Maximum time interval between updates (default 10800)\n";
   print "  --dontwarn codes   Do not write warnings for these HTTP response codes\n";
   print "  --enable-virtual   Use only virtual names, not IP numbers for servers\n";
+  print "  --match-url-base   Use the site part of the url in --match\n";
   print "\n";
-  print "Options --match, --exclude, and --ignore can take a perl regular expression\nas their argument\n\n";
+  print "Options --match, --exclude, --skip, and --ignore can take a perl regular\nexpression as their argument\n\n";
   print "Use 'perldoc checkbot' for more verbose documentation.\n\n";
   print "Checkbot WWW page     : http://degraaff.org/checkbot/\n";
   print "Mail bugs and problems: checkbot\@degraaff.org\n";