#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.23 2001/10/31 08:51:49 adam Exp $
+# $Id: robot.tcl,v 1.26 2001/11/08 13:49:06 adam Exp $
#
proc RobotFileNext1 {area lead} {
# puts "RobotFileNext1 area=$area lead=$lead"
foreach c $surllist {
switch -- $c {
.. {
- if {$pathl > 0} {
+ if {$pathl > 1} {
incr pathl -2
set path [lrange $path 0 $pathl]
incr pathl
}
}
}
- if {$pathl} {
- set path [join $path /]
- } else {
- set path ""
+ if {$debuglevel > 4} {
+ puts "pathl=$pathl output path=$path"
+ }
+ set path [join $path /]
+ if {![string length $path]} {
+ set path /
}
regsub -all {~} $path {%7E} path
set href "$method://$host$path"
}
puts $out {></meta>}
} body {
- regsub -all -nocase {<script([^<]|(<!.*>))*</script>} $body {} abody
- regsub -all {<[^\>]+>} $abody {} nbody
+ # Strip HTML comments first: non-greedy match up to the closing "-->",
+ # so comments containing dashes or newlines are removed as well.
+ regsub -all {<!--.*?-->} $body { } abody
+ regsub -all -nocase {<script[^<]*</script>} $abody {} bbody
+ regsub -all {<[^\>]+>} $bbody {} nbody
puts $out "<documentcontent>"
puts $out $nbody
puts $out "</documentcontent>"
RobotWriteMetadata $url $out
RobotFileClose $out
- if {[file isdirectory flat]} {
- regsub -all {/} $URL($url,hostport).$URL($url,path) {.} fname
- set out [open "flat/$fname" w]
- RobotWriteMetadata $url $out
- close $out
- }
RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
}
set workdir [pwd]
set idletime 60000
set acceptLanguage {}
+set debuglevel 0
-set i 0
-set l [llength $argv]
-
-if {$l < 2} {
- puts {tclrobot: usage:}
- puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-r rules] [url ..]}
- puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"
- exit 1
-}
# Rules: allow, deny, url
-set debuglevel 0
proc checkrule {type this} {
global alrules
# consider type
if {[lindex $l 1] != $type} continue
# consider mask (! negates)
- set mask [lindex $l 2]
- if {[string index $mask 0] == "!"} {
- set mask [string range $mask 1 end]
- if {[string match $mask $this]} continue
- } else {
- if {![string match $mask $this]} continue
+ set masks [lindex $l 2]
+ set ok 0
+ foreach mask $masks {
+ if {$debuglevel > 4} {
+ puts "consider single mask $mask"
+ }
+ if {[string index $mask 0] == "!"} {
+ set mask [string range $mask 1 end]
+ if {[string match $mask $this]} continue
+ } else {
+ if {![string match $mask $this]} continue
+ }
+ set ok 1
+ }
+ if {$debuglevel > 4} {
+ puts "ok = $ok"
}
+ if {!$ok} continue
# OK, we have a match
if {[lindex $l 0] == "allow"} {
if {$debuglevel > 3} {
- puts "CHECKRULE MATH OK"
+ puts "CHECKRULE MATCH OK"
}
return 1
} else {
}
}
if {$debuglevel > 3} {
- puts "CHECKRULE MATH OK"
+ puts "CHECKRULE MATCH OK"
}
return 1
}
# Parse options
+set i 0
+set l [llength $argv]
+
+if {$l < 2} {
+ puts {tclrobot: usage:}
+ puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-r rules] [url ..]}
+ puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"
+
+ exit 1
+}
while {$i < $l} {
set arg [lindex $argv $i]
switch -glob -- $arg {
RobotStart
+
while {$robotsRunning} {
vwait robotsRunning
}