X-Git-Url: http://lists.indexdata.dk/cgi-bin?a=blobdiff_plain;f=robot.tcl;h=4a30d17fa2f3040318522246c19a9249d9af67ae;hb=347b3d01c32f71e2dc2a85fe05b53adbc5e33871;hp=c7d85c47a68dfe22d74f029269b06a90de19b5de;hpb=e80b5c034f20b43530880ba530a8e1de06646726;p=tclrobot.git diff --git a/robot.tcl b/robot.tcl index c7d85c4..4a30d17 100755 --- a/robot.tcl +++ b/robot.tcl @@ -1,8 +1,8 @@ #!/usr/bin/tclsh -# $Id: robot.tcl,v 1.13 2001/02/26 22:51:51 adam Exp $ +# $Id: robot.tcl,v 1.22 2001/10/30 08:29:54 adam Exp $ # proc RobotFileNext1 {area lead} { - puts "RobotFileNext1 area=$area lead=$lead" + # puts "RobotFileNext1 area=$area lead=$lead" if {[catch {set ns [glob ${area}/*]}]} { return {} } @@ -43,16 +43,16 @@ proc RobotReadRecord {inf fromurlx distancex} { gets $inf gets $inf set distance [string trim [gets $inf]] - puts "got distance = $distance" + # puts "got distance = $distance" gets $inf gets $inf set fromurl [string trim [gets $inf]] } proc RobotFileNext {area} { - global robotSeq global idleTime ns + global robotSeq global idletime ns - puts "RobotFileNext robotSeq=$robotSeq" + # puts "RobotFileNext robotSeq=$robotSeq" if {$robotSeq < 0} { return {} } @@ -67,7 +67,7 @@ proc RobotFileNext {area} { if {![string length $n]} { set robotSeq -1 flush stdout - puts "------------ N E X T R O U N D --------" + puts "Round robin" return wait } incr robotSeq @@ -87,27 +87,27 @@ proc RobotFileNext {area} { proc RobotFileExist {area host path} { - puts "RobotFileExist begin area=$area host=$host path=$path" + # puts "RobotFileExist begin area=$area host=$host path=$path" set lpath [split $path /] set l [llength $lpath] incr l -1 set t [lindex $lpath $l] incr l -1 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t - puts "RobotFileExist end npath=$npath" + # puts "RobotFileExist end npath=$npath" return [file exists $npath] } proc RobotFileUnlink {area host path} { - puts "RobotFileUnlink begin" - puts "area=$area host=$host path=$path" + # puts "RobotFileUnlink begin" + # puts "area=$area host=$host path=$path" set lpath [split $path /] set l [llength $lpath] incr l -1 set t [lindex $lpath $l] incr l -1 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t - puts "npath=$npath" + # puts "npath=$npath" set comp [split $npath /] set l [llength $comp] incr l -1 @@ -118,7 +118,7 @@ proc RobotFileUnlink {area host path} { if {![catch {glob $path/*}]} return exec rmdir ./$path } - puts "RobotFileUnlink end" + # puts "RobotFileUnlink end" } proc RobotFileClose {out} { @@ -134,7 +134,7 @@ proc RobotFileOpen {area host path {mode w}} { if {![info exists workdir]} { return stdout } - puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" + #puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" if {[string compare $orgPwd $workdir]} { puts "ooops. RobotFileOpen failed" puts "workdir = $workdir" @@ -201,9 +201,9 @@ proc RobotRestart {url sock} { proc RobotStart {} { global URL - global robotsRunning robotsMax idleTime + global robotsRunning robotsMax idletime - puts "RobotStart" + # puts "RobotStart" while {1} { set url [RobotFileNext unvisited] if {![string length $url]} { @@ -211,7 +211,7 @@ proc RobotStart {} { } incr robotsRunning if {[string compare $url wait] == 0} { - after $idleTime RobotRR + after $idletime RobotRR return } set r [RobotGetUrl $url {}] @@ -254,12 +254,14 @@ proc headSave {url out} { } proc RobotHref {url hrefx hostx pathx} { - global URL domains + global URL domains debuglevel upvar $hrefx href upvar $hostx host upvar $pathx path - puts "Ref url = $url href=$href" + if {$debuglevel > 1} { + puts "Ref input url = $url href=$href" + } if {[string first { } $href] >= 0} { return 0 @@ -267,6 +269,9 @@ proc RobotHref {url hrefx hostx pathx} { if {[string length $href] > 256} { return 0 } + if {[string first {?} $href] >= 0} { + return 0 + } if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} { return 0 } @@ -313,38 +318,45 @@ proc RobotHref {url hrefx hostx pathx} { set surl $dpart/$surl } } - set c [split $surl /] - set i [llength $c] - incr i -1 - set path [lindex $c $i] - incr i -1 - while {$i >= 0} { - switch -- [lindex $c $i] { + set surllist [split $surl /] + catch {unset path} + set pathl 0 + foreach c $surllist { + switch -- $c { .. { - incr i -2 - if {$i < 0} { - set i 0 + if {$pathl > 0} { + incr pathl -2 + set path [lrange $path 0 $pathl] + incr pathl } } - . { - incr i -1 - } - default { - set path [lindex $c $i]/$path - incr i -1 + . { + + } + default { + incr pathl + lappend path $c } } } + if {$pathl} { + set path [join $path /] + } else { + set path "" + } regsub -all {~} $path {%7E} path set href "$method://$host$path" - puts "Ref href = $href" - return 1 + + if {$debuglevel > 1} { + puts "Ref result = $href" + } + return [checkrule url $href] } proc RobotError {url code} { global URL - puts "Bad URL $url, $code" + puts "Bad URL $url (code $code)" set fromurl {} set distance -1 if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} { @@ -410,10 +422,10 @@ proc RobotRedirect {url tourl code} { } proc RobotTextHtml {url out} { - global URL maxDistance + global URL maxdistance set distance 0 - if {$maxDistance < 1000 && [info exists URL($url,dist)]} { + if {$maxdistance < 1000 && [info exists URL($url,dist)]} { set distance [expr $URL($url,dist) + 1] } htmlSwitch $URL($url,buf) \ @@ -429,7 +441,7 @@ proc RobotTextHtml {url out} { } puts $out {>} } body { - regsub -all -nocase {} $body {} abody + regsub -all -nocase {))*} $body {} abody regsub -all {<[^\>]+>} $abody {} nbody puts $out "" puts $out $nbody @@ -439,7 +451,7 @@ proc RobotTextHtml {url out} { puts "no href" continue } - if {[expr $distance <= $maxDistance]} { + if {[expr $distance <= $maxdistance]} { set href [string trim $parm(href)] if {![RobotHref $url href host path]} continue @@ -484,6 +496,56 @@ proc RobotTextHtml {url out} { } } } + } -nonest area { + if {![info exists parm(href)]} { + puts "no href" + continue + } + if {[expr $distance <= $maxdistance]} { + set href [string trim $parm(href)] + if {![RobotHref $url href host path]} continue + + puts $out "" + puts $out "$href" + puts $out "" + puts $out "" + + if {![RobotFileExist visited $host $path]} { + set olddistance 1000 + if {![RobotFileExist bad $host $path]} { + if {[RobotFileExist unvisited $host $path]} { + set inf [RobotFileOpen unvisited $host $path r] + RobotReadRecord $inf oldurl olddistance + RobotFileClose $inf + } + } else { + set olddistance 0 + } + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[expr $distance < $olddistance]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $url $distance + RobotFileClose $outf + } + } elseif {[string compare $href $url]} { + set inf [RobotFileOpen visited $host $path r] + RobotReadRecord $inf xurl olddistance + close $inf + if {[string length $olddistance] == 0} { + set olddistance 1000 + } + if {[expr $distance < $olddistance]} { + puts "OK remarking url=$url href=$href" + puts "olddistance = $olddistance" + puts "newdistance = $distance" + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf $url $distance + RobotFileClose $outf + } + } + } } } @@ -497,21 +559,21 @@ proc RobotsTxt0 {v buf} { global URL agent set section 0 foreach l [split $buf \n] { - if {[regexp {([-A-Za-z]+):[ \t]*([^\#]+)} $l match cmd arg]} { + if {[regexp {([-A-Za-z]+):[ ]*([^\# ]+)} $l match cmd arg]} { puts "cmd=$cmd arg=$arg" - switch $cmd { - User-Agent { + switch -- [string tolower $cmd] { + user-agent { if {$section} break set pat [string tolower $arg]* set section [string match $pat $agent] } - Disallow { + disallow { if {$section} { puts "rule [list 0 $arg]" lappend $v [list 0 $arg] } } - Allow { + allow { if {$section} { puts "rule [list 1 $arg]" lappend $v [list 1 $arg] @@ -526,7 +588,8 @@ proc RobotTextPlain {url out} { global URL puts $out "" - puts $out $URL($url,buf) + regsub -all {<} $URL($url,buf) {\<} content + puts $out $content puts $out "" if {![string compare $URL($url,path) /robots.txt]} { @@ -537,6 +600,15 @@ proc RobotTextPlain {url out} { proc Robot200 {url} { global URL domains + set out [RobotFileOpen raw $URL($url,hostport) $URL($url,path)] + puts -nonewline $out $URL($url,buf) + RobotFileClose $out + + if {![checkrule mime $URL($url,head,content-type)]} { + RobotError $url mimedeny + return + } + set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)] puts $out "" @@ -553,19 +625,19 @@ proc Robot200 {url} { headSave $url $out puts "Parsing $url distance=$distance" switch $URL($url,head,content-type) { - text/html { - if {[string length $distance]} { - RobotTextHtml $url $out - } - } - text/plain { - RobotTextPlain $url $out - } - application/pdf { - set pdff [open test.pdf w] - puts -nonewline $pdff $URL($url,buf) - close $pdff - } + text/html { + if {[string length $distance]} { + RobotTextHtml $url $out + } + } + text/plain { + RobotTextPlain $url $out + } + application/pdf { + set pdff [open test.pdf w] + puts -nonewline $pdff $URL($url,buf) + close $pdff + } } puts $out "" RobotFileClose $out @@ -576,7 +648,6 @@ proc Robot200 {url} { proc RobotReadContent {url sock binary} { global URL - puts "RobotReadContent $url" set buffer [read $sock 16384] set readCount [string length $buffer] @@ -593,9 +664,11 @@ proc RobotReadContent {url sock binary} { } proc RobotReadHeader {url sock} { - global URL + global URL debuglevel - puts "RobotReadHeader $url" + if {$debuglevel > 1} { + puts "HTTP head $url" + } if {[catch {set buffer [read $sock 2148]}]} { RobotError $url 404 RobotRestart $url $sock @@ -620,11 +693,11 @@ proc RobotReadHeader {url sock} { regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code set lines [split $headbuf \n] foreach line $lines { - if {[regexp {^([^:]+):[ ]+(.*)} $line x name value]} { + if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} { set URL($url,head,[string tolower $name]) [string trim $value] } } - puts "code = $code" + puts "HTTP CODE $code" set URL($url,state) skip switch $code { 301 { @@ -664,13 +737,16 @@ proc RobotSockCancel {url sock} { } proc RobotConnect {url sock} { - global URL agent + global URL agent acceptLanguage fconfigure $sock -translation {lf crlf} -blocking 0 fileevent $sock readable [list RobotReadHeader $url $sock] puts $sock "GET $URL($url,path) HTTP/1.0" puts $sock "Host: $URL($url,host)" puts $sock "User-Agent: $agent" + if {[string length $acceptLanguage]} { + puts $sock "Accept-Language: $acceptLanguage" + } puts $sock "" flush $sock set URL($sock,cancel) [after 30000 [list RobotSockCancel $url $sock]] @@ -683,7 +759,7 @@ proc RobotNop {} { proc RobotGetUrl {url phost} { global URL robotsRunning flush stdout - puts "RobotGetUrl --------- robotsRunning=$robotsRunning url=$url" + puts "Retrieve $robotsRunning url=$url" if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} { return -1 } @@ -707,7 +783,7 @@ proc RobotGetUrl {url phost} { set buf [read $inf 32768] close $inf } else { - set buf "User-Agent: *\nAllow: /\n" + set buf "User-agent: *\nAllow: /\n" } RobotsTxt0 URL($hostport,robots) $buf } @@ -720,6 +796,7 @@ proc RobotGetUrl {url phost} { } } if {!$ok} { + puts "skipped due to robots.txt" return -1 } } @@ -738,7 +815,7 @@ if {![llength [info commands htmlSwitch]]} { } } -set agent "zmbot/0.0" +set agent "zmbot/0.1" if {![catch {set os [exec uname -s -r]}]} { set agent "$agent ($os)" } @@ -754,17 +831,97 @@ proc bgerror {m} { set robotsRunning 0 set robotSeq 0 set workdir [pwd] -set idleTime 60000 +set idletime 60000 +set acceptLanguage {} set i 0 set l [llength $argv] if {$l < 2} { - puts {tclrobot: usage [-j jobs] [-i idle] [-c count] [-d domain] [url ..]} + puts {tclrobot: usage:} + puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-r rules] [url ..]} puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/" exit 1 } +# Rules: allow, deny, url +set debuglevel 0 + +proc checkrule {type this} { + global alrules + global debuglevel + + if {$debuglevel > 3} { + puts "CHECKRULE $type $this" + } + if {[info exist alrules]} { + foreach l $alrules { + if {$debuglevel > 3} { + puts "consider $l" + } + # consider type + if {[lindex $l 1] != $type} continue + # consider mask (! negates) + set mask [lindex $l 2] + if {[string index $mask 0] == "!"} { + set mask [string range $mask 1 end] + if {[string match $mask $this]} continue + } else { + if {![string match $mask $this]} continue + } + # OK, we have a match + if {[lindex $l 0] == "allow"} { + if {$debuglevel > 3} { + puts "CHECKRULE MATH OK" + } + return 1 + } else { + if {$debuglevel > 3} { + puts "CHECKFULE MATCH FAIL" + } + return 0 + } + } + } + if {$debuglevel > 3} { + puts "CHECKRULE MATH OK" + } + return 1 +} + + +proc url {href} { + global debuglevel + + if {[RobotHref http://www.indexdata.dk/ href host path]} { + if {![RobotFileExist visited $host $path]} { + set outf [RobotFileOpen unvisited $host $path] + RobotWriteRecord $outf href 0 + RobotFileClose $outf + } + } +} + +proc deny {type stuff} { + global alrules + + lappend alrules [list deny $type $stuff] +} + +proc allow {type stuff} { + global alrules + + lappend alrules [list allow $type $stuff] +} + +proc debug {level} { + global debuglevel + + set debuglevel $level +} + +# Parse options + while {$i < $l} { set arg [lindex $argv $i] switch -glob -- $arg { @@ -775,9 +932,9 @@ while {$i < $l} { } } -c* { - set maxDistance [string range $arg 2 end] - if {![string length $maxDistance]} { - set maxDistance [lindex $argv [incr i]] + set maxdistance [string range $arg 2 end] + if {![string length $maxdistance]} { + set maxdistance [lindex $argv [incr i]] } } -d* { @@ -788,11 +945,24 @@ while {$i < $l} { lappend domains $dom } -i* { - set idleTime [string range $arg 2 end] - if {![string length $idleTime]} { - set idleTime [lindex $argv [incr i]] + set idletime [string range $arg 2 end] + if {![string length $idletime]} { + set idletime [lindex $argv [incr i]] } } + -l* { + set acceptLanguage [string range $arg 2 end] + if {![string length $acceptLanguage]} { + set acceptLanguage [lindex $argv [incr i]] + } + } + -r* { + set rfile [string range $arg 2 end] + if {![string length $rfile]} { + set rfile [lindex $argv [incr i]] + } + source $rfile + } default { set href $arg if {[RobotHref http://www.indexdata.dk/ href host path]} { @@ -810,15 +980,15 @@ while {$i < $l} { if {![info exist domains]} { set domains {*} } -if {![info exist maxDistance]} { - set maxDistance 3 +if {![info exist maxdistance]} { + set maxdistance 50 } if {![info exist robotsMax]} { set robotsMax 5 } puts "domains=$domains" -puts "max distance=$maxDistance" +puts "max distance=$maxdistance" puts "max jobs=$robotsMax" RobotStart