X-Git-Url: http://lists.indexdata.dk/cgi-bin?a=blobdiff_plain;f=robot.tcl;h=4934b9270ce83b8e3d74da1a3fb79c229cac618c;hb=563c2ed58045d7cd4c977af162f13d1fa308b4a5;hp=93c45416276eb1ee0fa5886e89d942e3ba9afcbb;hpb=d8234df96ab8fb03ed71f6358f7211ebe725b495;p=tclrobot.git diff --git a/robot.tcl b/robot.tcl index 93c4541..4934b92 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,7 +1,7 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.6 2000/12/07 20:16:11 adam Exp $ +# $Id: robot.tcl,v 1.8 2000/12/10 22:27:48 adam Exp $ # -proc RobotFileNext {area} { +proc RobotFileNext1 {area} { if {[catch {set ns [glob ${area}/*]}]} { return {} } @@ -18,7 +18,7 @@ proc RobotFileNext {area} { } foreach n $ns { if {[file isdirectory $n]} { - set sb [RobotFileNext $n] + set sb [RobotFileNext1 $n] if {[string length $sb]} { return $sb } @@ -27,6 +27,45 @@ proc RobotFileNext {area} { return {} } +proc RobotFileWait {} { + global robotSeq + set robotSeq 0 +} + +proc RobotFileNext {area} { + global robotSeq + if {[catch {set ns [glob ${area}/*]}]} { + return {} + } + set off [string length $area] + incr off + + set n [lindex $ns $robotSeq] + if {![string length $n]} { + puts "------------ N E X T R O U N D --------" + set robotSeq -1 + after 30000 RobotFileWait + vwait robotSeq + + set n [lindex $ns $robotSeq] + if {![string length $n]} { + return {} + } + } + incr robotSeq + if {[file isfile $n/robots.txt]} { + puts "ok returning http://[string range $n $off end]/robots.txt" + return http://[string range $n $off end]/robots.txt + } elseif {[file isdirectory $n]} { + set sb [RobotFileNext1 $n] + if {[string length $sb]} { + return $sb + } + } + return {} +} + + proc RobotFileExist {area host path} { set comp [split $area/$host$path /] set l [llength $comp] @@ -70,8 +109,9 @@ proc RobotFileOpen {area host path {mode w}} { if {![info exists workdir]} { return stdout } - puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path" + puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" if {[string compare $orgPwd $workdir]} { + puts "ooops. RobotFileOpen failed" puts "workdir = $workdir" puts "pwd = $orgPwd" exit 1 @@ -84,6 +124,11 @@ proc RobotFileOpen {area host path {mode w}} { if {[catch {cd ./$d}]} { exec mkdir $d cd ./$d + if {![string compare $area unvisited] && $i == 1 && $mode == "w"} { + set out [open robots.txt w] + puts "creating robots.txt in $d" + close $out + } } } set d [lindex $comp $len] @@ -101,10 +146,12 @@ proc RobotFileOpen {area host path {mode w}} { return $out } -proc RobotRestart {} { +proc RobotRestart {sock} { global URL global robotMoreWork - + + close $sock + after cancel $URL($sock,cancel) while {1} { set url [RobotFileNext unvisited] if {![string length $url]} { @@ -112,13 +159,12 @@ proc RobotRestart {} { } set r [RobotGetUrl $url {}] if {!$r} { - puts "RobotGetUrl returned 0 on url=$url" return } else { RobotFileUnlink unvisited $URL($url,host) $URL($url,path) } } - set robotMoreWork 0 + incr robotMoreWork -1 } proc headSave {url out} { @@ -213,18 +259,28 @@ proc RobotHref {url hrefx hostx pathx} { incr i -1 } } - } + } + regsub -all {~} $path {%7E} path + set ok 1 + if {[info exists URL($host,robots)]} { + foreach l $URL($host,robots) { + if {[string first [lindex $l 1] $path] == 0} { + set ok [lindex $l 0] + break + } + } + } set href "$method://$host$path" - puts "Ref href = $href" - return 1 + puts "Ref href = $href, ok=$ok" + return $ok } -proc Robot401 {url} { +proc RobotError {url code} { global URL - puts "Bad URL $url" + puts "Bad URL $url, $code" set fromurl {} - catch { + if {[RobotFileExist unvisited $URL($url,host) $URL($url,path)]} { set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r] set fromurl [gets $inf] close $inf @@ -232,32 +288,13 @@ proc Robot401 {url} { RobotFileUnlink unvisited $URL($url,host) $URL($url,path) if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} { set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)] - puts $outf "URL=$url 401" + puts $outf "URL=$url $code" puts $outf "Reference $fromurl" RobotFileClose $outf } } -proc Robot404 {url} { - global URL - - puts "Bad URL $url" - set fromurl {} - catch { - set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r] - set fromurl [gets $inf] - RobotFileClose $inf - } - RobotFileUnlink unvisited $URL($url,host) $URL($url,path) - if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} { - set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)] - puts $outf "URL=$url 404" - puts $outf "Reference $fromurl" - RobotFileClose $outf - } - } - -proc Robot301 {url tourl} { +proc RobotRedirect {url tourl code} { global URL puts "Redirecting from $url to $tourl" @@ -271,7 +308,7 @@ proc Robot301 {url tourl} { RobotFileUnlink unvisited $URL($url,host) $URL($url,path) if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} { set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)] - puts $outf "URL=$url to $tourl 301" + puts $outf "URL=$url to $tourl $code" puts $outf "Reference $fromurl" RobotFileClose $outf } @@ -279,7 +316,7 @@ proc Robot301 {url tourl} { if {![RobotFileExist unvisited $host $path]} { puts "Mark as unvisited" set outf [RobotFileOpen unvisited $host $path] - puts $outf 301 + puts $outf $code RobotFileClose $outf } } @@ -352,6 +389,38 @@ proc RobotTextHtml {url out} { puts $out "" } +proc RobotsTxt {url} { + global agent URL + + set v URL($URL($url,host),robots) + set section 0 + foreach l [split $URL($url,buf) \n] { + puts $l + if {[regexp {([-A-Za-z]+):[ \t]*([^\#]+)} $l match cmd arg]} { + puts "cmd=$cmd arg=$arg" + switch $cmd { + User-Agent { + if {$section} break + set pat [string tolower $arg]* + set section [string match $pat $agent] + } + Disallow { + if {$section} { + puts "rule [list 0 $arg]" + lappend $v [list 0 $arg] + } + } + Allow { + if {$section} { + puts "rule [list 1 $arg]" + lappend $v [list 1 $arg] + } + } + } + } + } +} + proc RobotTextPlain {url out} { global URL @@ -360,6 +429,10 @@ proc RobotTextPlain {url out} { puts $out $URL($url,buf) puts $out "" puts $out "" + + if {![string compare $URL($url,path) /robots.txt]} { + RobotsTxt $url + } } proc Robot200 {url} { @@ -389,11 +462,13 @@ proc RobotReadContent {url sock} { set buffer [read $sock 16384] set readCount [string length $buffer] - + if {$readCount <= 0} { - close $sock Robot200 $url - RobotRestart + RobotRestart $sock + } elseif {[string first \0 $buffer] >= 0} { + Robot200 $url + RobotRestart $sock } else { # puts "Got $readCount bytes" set URL($url,buf) $URL($url,buf)$buffer @@ -403,13 +478,15 @@ proc RobotReadContent {url sock} { proc RobotReadHeader {url sock} { global URL - set buffer [read $sock 2148] + if {[catch {set buffer [read $sock 2148]}]} { + RobotError $url 404 + RobotRestart $sock + } set readCount [string length $buffer] if {$readCount <= 0} { - Robot404 $url - close $sock - RobotRestart + RobotError $url 404 + RobotRestart $sock } else { # puts "Got $readCount bytes" set URL($url,buf) $URL($url,buf)$buffer @@ -434,24 +511,20 @@ proc RobotReadHeader {url sock} { set URL($url,state) skip switch $code { 301 { - Robot301 $url $URL($url,head,location) - close $sock - RobotRestart + RobotRedirect $url $URL($url,head,location) 301 + RobotRestart $sock } 302 { - Robot301 $url $URL($url,head,location) - close $sock - RobotRestart + RobotRedirect $url $URL($url,head,location) 302 + RobotRestart $sock } 404 { - Robot404 $url - close $sock - RobotRestart + RobotError $url 404 + RobotRestart $sock } 401 { - Robot401 $url - close $sock - RobotRestart + RobotError $url 401 + RobotRestart $sock } 200 { if {![info exists URL($url,head,content-type)]} { @@ -465,33 +538,38 @@ proc RobotReadHeader {url sock} { fileevent $sock readable [list RobotReadContent $url $sock] } default { - close $sock Robot200 $url - RobotRestart + RobotRestart $sock } } } default { - Robot404 $url - close $sock - RobotRestart + RobotError $url 404 + RobotRestart $sock } } } } } +proc RobotSockCancel {sock url} { + + puts "RobotSockCancel sock=$sock url=$url" + RobotError $url 401 + RobotRestart $sock +} + proc RobotConnect {url sock} { global URL agent fconfigure $sock -translation {auto crlf} -blocking 0 - puts "Reading $url" fileevent $sock readable [list RobotReadHeader $url $sock] puts $sock "GET $URL($url,path) HTTP/1.0" puts $sock "Host: $URL($url,host)" puts $sock "User-Agent: $agent" puts $sock "" flush $sock + set URL($sock,cancel) [after 60000 [list RobotSockCancel $sock $url]] } proc RobotNop {} { @@ -530,7 +608,6 @@ if {![llength [info commands htmlSwitch]]} { } } - set agent "zmbot/0.0" if {![catch {set os [exec uname -s -r]}]} { set agent "$agent ($os)" @@ -538,28 +615,13 @@ if {![catch {set os [exec uname -s -r]}]} { } proc bgerror {m} { + global errorInfo puts "BGERROR $m" -} - -if {0} { - proc RobotRestart {} { - global robotMoreWork - set robotMoreWork 0 - puts "myrestart" - } - set robotMoreWork 1 - set url {http://www.indexdata.dk/zap/} - RobotGetUrl $url {} - while {$robotMoreWork} { - vwait robotMoreWork - } - puts "-----------" - puts $URL($url,buf) - puts "-----------" - exit 1 + puts $errorInfo } set robotMoreWork 0 +set robotSeq 0 set workdir [pwd] if {[llength $argv] < 2} { @@ -569,15 +631,11 @@ if {[llength $argv] < 2} { } set domains [lindex $argv 0] -set site [lindex $argv 1] -if {[string length $site]} { - set robotMoreWork 1 +foreach site [lindex $argv 1] { + incr robotMoreWork if [RobotGetUrl $site {}] { - set robotMoreWork 0 + incr robotMoreWork -1 puts "Couldn't process $site" - } else { - #set x [RobotFileOpen unvisited $site /robots.txt] - #RobotFileClose $x } }