X-Git-Url: http://lists.indexdata.dk/cgi-bin?a=blobdiff_plain;f=robot.tcl;h=e67a9b84ab670d6df254e15143d23c9a9e80079d;hb=7a0cfd1ab65b62d7f3ac4a23a0388267a8afeeea;hp=c9388bcd3f2ddf24a95df708c41265daf9cb8532;hpb=0c2ddebb45112314921d3da60f466622b7e53845;p=tclrobot.git
diff --git a/robot.tcl b/robot.tcl
index c9388bc..e67a9b8 100755
--- a/robot.tcl
+++ b/robot.tcl
@@ -1,5 +1,5 @@
#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.20 2001/06/29 22:25:55 adam Exp $
+# $Id: robot.tcl,v 1.34 2002/06/18 19:57:53 adam Exp $
#
proc RobotFileNext1 {area lead} {
# puts "RobotFileNext1 area=$area lead=$lead"
@@ -50,7 +50,9 @@ proc RobotReadRecord {inf fromurlx distancex} {
}
proc RobotFileNext {area} {
- global robotSeq global idleTime ns
+ global robotSeq
+ global idletime ns
+ global status
# puts "RobotFileNext robotSeq=$robotSeq"
if {$robotSeq < 0} {
@@ -67,7 +69,9 @@ proc RobotFileNext {area} {
if {![string length $n]} {
set robotSeq -1
flush stdout
- puts "------------ N E X T R O U N D --------"
+ set statusfile [open status w]
+ puts $statusfile "$status(unvisited) $status(bad) $status(visited)"
+ close $statusfile
return wait
}
incr robotSeq
@@ -87,18 +91,25 @@ proc RobotFileNext {area} {
# RobotFileExist area host path
#   Tests whether the stored entry for $path on $host is present under $area.
#   The name tested is $area/$host, followed by the leading components of
#   $path joined with "/d", followed by "/f" plus the final component; e.g.
#   path /a/b/c.html is looked up as $area/$host/da/db/fc.html.
#   Returns the result of [file exists] on that name.
#   In the patched (+) version the begin/end trace lines are printed only
#   when the global debuglevel is greater than 3.
proc RobotFileExist {area host path} {
- # puts "RobotFileExist begin area=$area host=$host path=$path"
+ global debuglevel
+
+ if {$debuglevel > 3} {
+ puts "RobotFileExist begin area=$area host=$host path=$path"
+ }
set lpath [split $path /]
set l [llength $lpath]
# l becomes the index of the final path component, saved in t
incr l -1
set t [lindex $lpath $l]
# l becomes the index of the last leading component (everything before t)
incr l -1
# leading components are joined with "/d"; the final component gets an "f" prefix
set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
- # puts "RobotFileExist end npath=$npath"
+ if {$debuglevel > 3} {
+ puts "RobotFileExist end npath=$npath"
+ }
return [file exists $npath]
}
proc RobotFileUnlink {area host path} {
+ global status
# puts "RobotFileUnlink begin"
# puts "area=$area host=$host path=$path"
set lpath [split $path /]
@@ -109,10 +120,12 @@ proc RobotFileUnlink {area host path} {
set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
# puts "npath=$npath"
set comp [split $npath /]
+ if {[catch {exec rm [join $comp /]}]} return
+
set l [llength $comp]
incr l -1
- if {[catch {exec rm [join $comp /]}]} return
incr l -1
+ incr status($area) -1
for {set i $l} {$i > 0} {incr i -1} {
set path [join [lrange $comp 0 $i] /]
if {![catch {glob $path/*}]} return
@@ -130,11 +143,15 @@ proc RobotFileClose {out} {
proc RobotFileOpen {area host path {mode w}} {
set orgPwd [pwd]
global workdir
+ global status
+ global debuglevel
if {![info exists workdir]} {
return stdout
}
- #puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
+ if {$debuglevel > 3} {
+ puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
+ }
if {[string compare $orgPwd $workdir]} {
puts "ooops. RobotFileOpen failed"
puts "workdir = $workdir"
@@ -154,22 +171,24 @@ proc RobotFileOpen {area host path {mode w}} {
exec mkdir $d
cd ./$d
if {![string compare $area unvisited] && $i == 1 && $mode == "w"} {
- set out [open frobots.txt w]
- puts "creating robots.txt in $d"
- close $out
+ if {[string compare $path /robots.txt]} {
+ set out [open frobots.txt w]
+ puts "creating robots.txt in $d"
+ close $out
+ incr status(unvisited)
+ }
}
}
}
set d [lindex $comp $len]
if {[string length $d]} {
- if {[file isdirectory $d]} {
- set out [open $d/f $mode]
- } else {
- set out [open f$d $mode]
- }
+ set out [open f$d $mode]
} else {
set out [open f $mode]
}
+ if {$mode == "w"} {
+ incr status($area)
+ }
cd $orgPwd
return $out
}
@@ -201,7 +220,7 @@ proc RobotRestart {url sock} {
proc RobotStart {} {
global URL
- global robotsRunning robotsMax idleTime
+ global robotsRunning robotsMax idletime
# puts "RobotStart"
while {1} {
@@ -211,7 +230,7 @@ proc RobotStart {} {
}
incr robotsRunning
if {[string compare $url wait] == 0} {
- after $idleTime RobotRR
+ after $idletime RobotRR
return
}
set r [RobotGetUrl $url {}]
@@ -254,12 +273,14 @@ proc headSave {url out} {
}
proc RobotHref {url hrefx hostx pathx} {
- global URL domains
+ global URL domains debuglevel
upvar $hrefx href
upvar $hostx host
upvar $pathx path
- puts "Ref url = $url href=$href"
+ if {$debuglevel > 1} {
+ puts "Ref input url = $url href=$href"
+ }
if {[string first { } $href] >= 0} {
return 0
@@ -267,12 +288,11 @@ proc RobotHref {url hrefx hostx pathx} {
if {[string length $href] > 256} {
return 0
}
- if {[string first {?} $href] >= 0} {
- return 0
- }
- if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
- return 0
- }
+
+# Skip pages that have ? in them
+# if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
+# return 0
+# }
# get method (if any)
if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
set hpath $href
@@ -308,7 +328,11 @@ proc RobotHref {url hrefx hostx pathx} {
}
if {[string first / $surl]} {
# relative path
- regexp {^([^\#?]*)} $URL($url,path) x dpart
+ set curpath $URL($url,path)
+ if {[info exists URL($url,bpath)]} {
+ set curpath $URL($url,bpath)
+ }
+ regexp {^([^\#?]*)} $curpath x dpart
set l [string last / $dpart]
if {[expr $l >= 0]} {
set surl [string range $dpart 0 $l]$surl
@@ -322,7 +346,7 @@ proc RobotHref {url hrefx hostx pathx} {
foreach c $surllist {
switch -- $c {
.. {
- if {$pathl > 0} {
+ if {$pathl > 1} {
incr pathl -2
set path [lrange $path 0 $pathl]
incr pathl
@@ -337,21 +361,26 @@ proc RobotHref {url hrefx hostx pathx} {
}
}
}
- if {$pathl} {
- set path [join $path /]
- } else {
- set path ""
+ if {$debuglevel > 4} {
+ puts "pathl=$pathl output path=$path"
+ }
+ set path [join $path /]
+ if {![string length $path]} {
+ set path /
}
regsub -all {~} $path {%7E} path
set href "$method://$host$path"
- puts "Ref href = $href"
- return 1
+
+ if {$debuglevel > 1} {
+ puts "Ref result = $href"
+ }
+ return [checkrule url $href]
}
proc RobotError {url code} {
global URL
- puts "Bad URL $url, $code"
+ puts "Bad URL $url (code $code)"
set fromurl {}
set distance -1
if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
@@ -416,131 +445,144 @@ proc RobotRedirect {url tourl code} {
}
}
+proc link {url out href body distance} {
+ global URL maxdistance
+ if {[expr $distance > $maxdistance]} return
+
+ if {![RobotHref $url href host path]} return
+
+ puts $out "