2 # $Id: robot.tcl,v 1.9 2000/12/11 17:11:03 adam Exp $
# RobotFileNext1 --
#   Scans the entries matched by "glob ${area}/*" looking for work.
#   For an entry that is a regular file it returns $lead/ followed by the
#   part of the entry name from offset $off (derived from its last "/")
#   to the end. For a directory it recurses into that directory with
#   $lead extended the same way and tests whether the result is non-empty.
#     area - directory to scan
#     lead - prefix placed in front of the returned name (RobotFileNext
#            passes an http://... string here)
#   NOTE(review): the value returned when glob fails or nothing is found
#   is presumably empty, since callers test [string length $sb] -- confirm.
4 proc RobotFileNext1 {area lead} {
5 puts "RobotFileNext1 area=$area lead=$lead"
6 if {[catch {set ns [glob ${area}/*]}]} {
10 if {[file isfile $n]} {
11 set off [string last / $n]
13 return $lead/[string range $n $off end]
17 if {[file isdirectory $n]} {
18 set off [string last / $n]
20 set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
21 if {[string length $sb]} {
# RobotFileWait --
#   Timer callback taking no arguments; RobotFileNext schedules it with
#   "after 60000 RobotFileWait".
29 proc RobotFileWait {} {
# RobotFileNext --
#   Picks the next URL to fetch from the directory tree under $area.
#   The top-level entries come from "glob ${area}/*" and are indexed by
#   robotSeq. When the entry at robotSeq is empty, a new round is announced
#   and RobotFileWait is scheduled 60000 ms later; the entry is then looked
#   up again. If the entry contains a file named frobots.txt the result is
#   http://<entry name after $area>/robots.txt; otherwise, if the entry is
#   a directory, RobotFileNext1 is asked for a URL below it.
#     area - root directory of the work queue (callers pass "unvisited")
#   NOTE(review): the result when no work is found is presumably an empty
#   string, since RobotRestart tests [string length $url] -- confirm.
34 proc RobotFileNext {area} {
36 puts "RobotFileNext robotSeq=$robotSeq"
37 if {[catch {set ns [glob ${area}/*]}]} {
40 set off [string length $area]
43 set n [lindex $ns $robotSeq]
44 if {![string length $n]} {
46 puts "------------ N E X T R O U N D --------"
48 after 60000 RobotFileWait
51 set n [lindex $ns $robotSeq]
52 if {![string length $n]} {
53 puts "robotSeq = $robotSeq"
55 puts "no more work at index"
60 if {[file isfile $n/frobots.txt]} {
61 puts "ok returning http://[string range $n $off end]/robots.txt"
62 return http://[string range $n $off end]/robots.txt
63 } elseif {[file isdirectory $n]} {
64 set sb [RobotFileNext1 $n http://[string range $n $off end]]
65 if {[string length $sb]} {
69 puts "no more work at end of RobotFileNext n=$n"
# RobotFileExist --
#   Reports whether the on-disk entry for a URL exists.
#   $path is split on "/" and mapped to a file name of the form
#   $area/$host/d<dir>/.../f<name>: the leading components are joined with
#   "/d" and the component held in $t is prefixed with "f".
#     area - root directory (e.g. unvisited, visited, bad)
#     host - host part of the URL
#     path - path part of the URL
#   Returns the result of [file exists] on the mapped name.
75 proc RobotFileExist {area host path} {
76 puts "RobotFileExist begin"
77 puts "area=$area host=$host path=$path"
78 set lpath [split $path /]
79 set l [llength $lpath]
81 set t [lindex $lpath $l]
83 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
85 puts "RobotFileExist end"
86 return [file exists $npath]
# RobotFileUnlink --
#   Removes the on-disk entry for a URL. The file name is built the same
#   way as in RobotFileExist ($area/$host/d<dir>/.../f<name>) and deleted
#   with "exec rm"; if rm fails the proc returns at once. It then walks
#   the leading path components from the longest prefix towards the
#   shortest and returns as soon as "glob $path/*" succeeds, i.e. as soon
#   as a prefix still has entries.
#     area - root directory (e.g. unvisited)
#     host - host part of the URL
#     path - path part of the URL
#   NOTE(review): prefixes with no entries are presumably removed on the
#   way up -- confirm.
89 proc RobotFileUnlink {area host path} {
90 puts "RobotFileUnlink begin"
91 puts "area=$area host=$host path=$path"
92 set lpath [split $path /]
93 set l [llength $lpath]
95 set t [lindex $lpath $l]
97 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
99 set comp [split $npath /]
100 set l [llength $comp]
102 if {[catch {exec rm [join $comp /]}]} return
104 for {set i $l} {$i > 0} {incr i -1} {
105 set path [join [lrange $comp 0 $i] /]
106 if {![catch {glob $path/*}]} return
109 puts "RobotFileUnlink end"
# RobotFileClose --
#   Finishes use of a channel obtained from RobotFileOpen. The guarded
#   body runs only when $out is not the string "stdout" (string compare
#   returns non-zero for differing strings).
#     out - channel name
#   NOTE(review): the guarded body presumably closes the channel -- confirm.
112 proc RobotFileClose {out} {
113 if [string compare $out stdout] {
# RobotFileOpen --
#   Opens the on-disk entry for a URL and yields the channel in $out.
#   The name $area/$host$path is split on "/" and the proc changes
#   directory into one component at a time (cd ./$d); a component is used
#   either as-is or with a "d" prefix. When $area is "unvisited", the
#   component index is 1 and $mode is "w", a file frobots.txt is opened for
#   writing in that directory. After the walk, the component held in $d
#   decides the file to open: "$d/f" when $d is an existing directory,
#   "f$d" otherwise, and plain "f" when $d is empty.
#   Before doing any of this the current directory ($orgPwd) is compared
#   with $workdir and a diagnostic is printed when they differ.
#     area - root directory (unvisited, visited, bad)
#     host - host part of the URL
#     path - path part of the URL
#     mode - open mode, default w
#   NOTE(review): a failing cd presumably creates the directory first,
#   and the proc presumably returns $out -- confirm.
118 proc RobotFileOpen {area host path {mode w}} {
122 if {![info exists workdir]} {
125 puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
126 if {[string compare $orgPwd $workdir]} {
127 puts "ooops. RobotFileOpen failed"
128 puts "workdir = $workdir"
132 set comp [split $area/$host$path /]
133 set len [llength $comp]
135 for {set i 0} {$i < $len} {incr i} {
137 set d "d[lindex $comp $i]"
139 set d [lindex $comp $i]
141 if {[catch {cd ./$d}]} {
145 if {![string compare $area unvisited] && $i == 1 && $mode == "w"} {
146 set out [open frobots.txt w]
147 puts "creating robots.txt in $d"
152 set d [lindex $comp $len]
153 if {[string length $d]} {
154 if {[file isdirectory $d]} {
155 set out [open $d/f $mode]
158 set out [open f$d $mode]
162 set out [open f $mode]
166 #puts "RobotFileStop"
# RobotRestart --
#   Called when work on socket $sock is over. Cancels the pending timeout
#   timer stored in URL($sock,cancel), asks RobotFileNext for the next URL
#   in the "unvisited" area and starts fetching it with RobotGetUrl.
#   The visible code also removes a URL's unvisited entry with
#   RobotFileUnlink and decrements robotMoreWork.
#     sock - socket whose transfer has ended
#   NOTE(review): the unlink presumably happens when RobotGetUrl reports
#   failure, and the decrement when no URL is left -- confirm.
170 proc RobotRestart {sock} {
175 after cancel $URL($sock,cancel)
177 set url [RobotFileNext unvisited]
178 if {![string length $url]} {
181 set r [RobotGetUrl $url {}]
185 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
188 incr robotMoreWork -1
# headSave --
#   Writes metadata taken from the stored HTTP response headers of $url to
#   channel $out as tagged elements. Each header-based element is written
#   only when the header is present in the URL array:
#     last-modified  -> <lastmodified>
#     date           -> <date>
#     content-length -> <by>
#     server         -> <format>
#     content-type   -> <type>
#   The <type> element and an <identifier> holding $url are written inside
#   a <publisher> ... </publisher> pair.
#     url - URL whose headers are stored under URL($url,head,*)
#     out - output channel
191 proc headSave {url out} {
194 if {[info exists URL($url,head,last-modified)]} {
195 puts $out "<lastmodified>$URL($url,head,last-modified)</lastmodified>"
198 if {[info exists URL($url,head,date)]} {
199 puts $out " <date>$URL($url,head,date)</date>"
201 if {[info exists URL($url,head,content-length)]} {
202 puts $out " <by>$URL($url,head,content-length)</by>"
204 if {[info exists URL($url,head,server)]} {
205 puts $out " <format>$URL($url,head,server)</format>"
208 puts $out {<publisher>}
209 puts $out " <identifier>$url</identifier>"
210 if {[info exists URL($url,head,content-type)]} {
211 puts $out " <type>$URL($url,head,content-type)</type>"
213 puts $out {</publisher>}
# RobotHref --
#   Resolves a link found in document $url into an absolute URL and
#   decides whether it may be followed. Callers use the result as a boolean.
#   Visible steps:
#     - split the href into method and remainder; the method is compared
#       with "http"
#     - for a "//host/path" remainder, take host and path (up to "#") and
#       match the host against every pattern in $domains (string match)
#     - otherwise take the path up to "#" and use the host of $url;
#       a path that does not start with "/" is joined to the directory
#       part of URL($url,path) (query and fragment stripped)
#     - split the path on "/" and rebuild it component by component
#     - replace every "~" with "%7E"
#     - compare the path with each rule in URL($host,robots), testing
#       whether the rule's second element is a prefix of the path
#     - store "$method://$host$path" in href
#     url   - URL of the referring document
#     hrefx - NOTE(review): presumably the name of the caller's href
#             variable (callers pass "href"/"tourl") -- confirm
#     hostx - presumably the name of the caller's variable for the host
#     pathx - presumably the name of the caller's variable for the path
216 proc RobotHref {url hrefx hostx pathx} {
222 puts "Ref url = $url href=$href"
223 # get method (if any)
224 if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
228 if {[string compare $method http]} {
233 if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
234 if {![string length $surl]} {
238 foreach domain $domains {
239 if {[string match $domain $host]} {
248 regexp {^([^\#]*)} $hpath x surl
249 set host $URL($url,host)
251 if {![string length $surl]} {
254 if {[string first / $surl]} {
256 regexp {^([^\#?]*)} $URL($url,path) x dpart
257 set l [string last / $dpart]
258 if {[expr $l >= 0]} {
259 set surl [string range $dpart 0 $l]$surl
261 set surl $dpart/$surl
264 set c [split $surl /]
267 set path [lindex $c $i]
270 switch -- [lindex $c $i] {
281 set path [lindex $c $i]/$path
286 regsub -all {~} $path {%7E} path
288 if {[info exists URL($host,robots)]} {
289 foreach l $URL($host,robots) {
290 if {[string first [lindex $l 1] $path] == 0} {
296 set href "$method://$host$path"
297 puts "Ref href = $href, ok=$ok"
# RobotError --
#   Records a URL that could not be fetched. If the URL has an entry in
#   the "unvisited" area, its first line is read as the referring URL.
#   The unvisited entry is then removed, and unless an entry already
#   exists in the "bad" area one is created holding "URL=$url $code" and
#   "Reference $fromurl".
#     url  - failing URL (host and path are taken from the URL array)
#     code - status/error code written to the bad entry
301 proc RobotError {url code} {
304 puts "Bad URL $url, $code"
306 if {[RobotFileExist unvisited $URL($url,host) $URL($url,path)]} {
307 set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
308 set fromurl [gets $inf]
311 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
312 if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
313 set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
314 puts $outf "URL=$url $code"
315 puts $outf "Reference $fromurl"
# RobotRedirect --
#   Handles an HTTP redirect from $url to $tourl. The referring URL is
#   read from the first line of the unvisited entry (when one exists), the
#   unvisited entry is removed (a failure of RobotFileUnlink is caught),
#   and unless a "bad" entry already exists one is written with
#   "URL=$url to $tourl $code" and "Reference $fromurl". The target is
#   then resolved with RobotHref; when it is accepted and has no unvisited
#   entry yet, an entry is created for it in the "unvisited" area.
#     url   - URL that answered with a redirect
#     tourl - redirect target (value of the Location header)
#     code  - HTTP status code (callers pass 301 or 302)
320 proc RobotRedirect {url tourl code} {
323 puts "Redirecting from $url to $tourl"
326 if {[RobotFileExist unvisited $URL($url,host) $URL($url,path)]} {
327 set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
328 set fromurl [gets $inf]
331 if {[catch {RobotFileUnlink unvisited $URL($url,host) $URL($url,path)}]} {
335 if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
336 set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
337 puts $outf "URL=$url to $tourl $code"
338 puts $outf "Reference $fromurl"
341 if {[RobotHref $url tourl host path]} {
342 if {![RobotFileExist unvisited $host $path]} {
343 puts "Mark as unvisited"
344 set outf [RobotFileOpen unvisited $host $path]
# RobotTextHtml --
#   Processes an HTML document held in URL($url,buf) with htmlSwitch and
#   writes the extracted data to channel $out:
#     - the title as <title>...</title>
#     - meta tags as <meta name="value" ...>, one attribute per entry of
#       the parm array
#     - the body text inside <documentcontent>, after removing
#       <script>...</script> sections (case-insensitive) and all remaining
#       tags
#     - for links that carry an href attribute and are accepted by
#       RobotHref, an <identifier> with the resolved URL and a
#       <description> with the link text
#   A link target that has no entry in the "visited" or "bad" areas gets
#   an entry in the "unvisited" area; a failure to open that entry is
#   reported with "--- Error" and the message.
#     url - URL of the document
#     out - output channel
351 proc RobotTextHtml {url out} {
354 htmlSwitch $URL($url,buf) \
356 puts $out "<title>$body</title>"
358 puts -nonewline $out "<meta"
359 foreach a [array names parm] {
360 puts -nonewline $out " $a"
361 puts -nonewline $out {="}
362 puts -nonewline $out $parm($a)
363 puts -nonewline $out {"}
367 regsub -all -nocase {<script.*</script>} $body {} abody
368 regsub -all {<[^\>]+>} $abody {} nbody
369 puts $out "<documentcontent>"
371 puts $out "</documentcontent>"
373 if {![info exists parm(href)]} {
379 if {![RobotHref $url href host path]} continue
382 puts $out "<identifier>$href</identifier>"
383 puts $out "<description>$body</description>"
386 if {![RobotFileExist visited $host $path]} {
387 if {![RobotFileExist bad $host $path]} {
388 if {[catch {set outf [RobotFileOpen unvisited $host $path]} msg]} {
389 puts "--- Error $msg"
# RobotsTxt --
#   Parses a fetched robots.txt held in URL($url,buf) and stores its rules
#   in the array element URL(<host of url>,robots), whose name is kept in
#   $v. The buffer is split into lines; every line of the form
#   "<cmd>: <arg>" (text after "#" is not part of the argument) is
#   examined. One command lower-cases its argument, appends "*" and
#   matches the resulting pattern against $agent to set the flag $section.
#   Rules are appended as two-element lists: {0 <arg>} or {1 <arg>}.
#     url - URL of the robots.txt document
#   NOTE(review): presumably "User-agent" sets $section, "Disallow"
#   yields the 0 rules and "Allow" the 1 rules -- confirm.
400 proc RobotsTxt {url} {
403 set v URL($URL($url,host),robots)
405 foreach l [split $URL($url,buf) \n] {
407 if {[regexp {([-A-Za-z]+):[ \t]*([^\#]+)} $l match cmd arg]} {
408 puts "cmd=$cmd arg=$arg"
412 set pat [string tolower $arg]*
413 set section [string match $pat $agent]
417 puts "rule [list 0 $arg]"
418 lappend $v [list 0 $arg]
423 puts "rule [list 1 $arg]"
424 lappend $v [list 1 $arg]
# RobotTextPlain --
#   Writes a plain-text document to channel $out: the raw buffer
#   URL($url,buf) enclosed in <documentcontent> ... </documentcontent>.
#   A document whose path is exactly /robots.txt gets extra handling.
#     url - URL of the document
#     out - output channel
#   NOTE(review): the extra handling is presumably a call to RobotsTxt
#   -- confirm.
432 proc RobotTextPlain {url out} {
435 puts $out "<documentcontent>"
436 puts $out $URL($url,buf)
437 puts $out "</documentcontent>"
439 if {![string compare $URL($url,path) /robots.txt]} {
# Robot200 --
#   Handles a completely received document. Opens the URL's entry in the
#   "visited" area and dispatches on the Content-Type header: the visible
#   arms call RobotTextHtml, call RobotTextPlain, or dump the buffer
#   unchanged into a file named test.pdf. Afterwards the URL's entry in
#   the "unvisited" area is removed.
#     url - URL of the received document
#   NOTE(review): the content-type patterns of the switch arms are not
#   shown here; presumably text/html, text/plain and application/pdf.
444 proc Robot200 {url} {
448 set out [RobotFileOpen visited $URL($url,host) $URL($url,path)]
451 switch $URL($url,head,content-type) {
453 RobotTextHtml $url $out
456 RobotTextPlain $url $out
459 set pdff [open test.pdf w]
460 puts -nonewline $pdff $URL($url,buf)
466 # puts "Parsing done"
467 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
# RobotReadContent --
#   Readable-event handler for the body of a response. Reads up to 16384
#   characters from $sock. A read of zero length and, for non-binary
#   transfers, a chunk containing a NUL character are each handled by
#   their own branch; otherwise the chunk is appended to URL($url,buf).
#     url    - URL being fetched
#     sock   - socket to read from
#     binary - boolean; when false, NUL characters in the data are
#              treated specially
#   NOTE(review): the zero-length branch presumably finishes the document
#   and the NUL branch presumably rejects it -- confirm.
470 proc RobotReadContent {url sock binary} {
473 set buffer [read $sock 16384]
474 set readCount [string length $buffer]
476 if {$readCount <= 0} {
479 } elseif {!$binary && [string first \0 $buffer] >= 0} {
483 # puts "Got $readCount bytes"
484 set URL($url,buf) $URL($url,buf)$buffer
# RobotReadHeader --
#   Readable-event handler for the header part of a response. Reads up to
#   2148 characters from $sock (a read error is caught) and appends them to
#   URL($url,buf). It then looks for the blank line (\r\n\r\n) that ends
#   the header, splits the buffer into header text and remaining body,
#   parses the status line ("HTTP/<version> <code>") and stores every
#   "Name: value" header line as URL($url,head,<lower-cased name>) with the
#   value trimmed. The state is set to "skip"; redirects are passed to
#   RobotRedirect with code 301 or 302 and the Location header; a missing
#   Content-Type header is replaced by an empty string; depending on the
#   content type RobotReadContent is installed as the new readable
#   handler; RobotError is called with the status code on the visible
#   error path.
#     url  - URL being fetched
#     sock - socket to read from
488 proc RobotReadHeader {url sock} {
491 puts "RobotReadHeader $url"
492 if {[catch {set buffer [read $sock 2148]}]} {
496 set readCount [string length $buffer]
498 if {$readCount <= 0} {
502 # puts "Got $readCount bytes"
503 set URL($url,buf) $URL($url,buf)$buffer
505 set n [string first \r\n\r\n $URL($url,buf)]
509 set headbuf [string range $URL($url,buf) 0 $n]
511 set URL($url,buf) [string range $URL($url,buf) $n end]
513 regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
514 set lines [split $headbuf \n]
515 foreach line $lines {
516 if {[regexp {^([^:]+):[ ]+(.*)} $line x name value]} {
517 set URL($url,head,[string tolower $name]) [string trim $value]
521 set URL($url,state) skip
524 RobotRedirect $url $URL($url,head,location) 301
528 RobotRedirect $url $URL($url,head,location) 302
532 if {![info exists URL($url,head,content-type)]} {
533 set URL($url,head,content-type) {}
536 switch $URL($url,head,content-type) {
541 fileevent $sock readable [list RobotReadContent $url $sock $binary]
544 RobotError $url $code
# RobotSockCancel --
#   Timeout callback; RobotConnect schedules it 60000 ms after the
#   request is sent. Logs the socket and URL concerned.
#     sock - socket of the timed-out transfer
#     url  - URL being fetched on that socket
#   NOTE(review): presumably also closes the socket and moves on to the
#   next URL -- confirm.
552 proc RobotSockCancel {sock url} {
554 puts "RobotSockCancel sock=$sock url=$url"
# RobotConnect --
#   Sends the HTTP request for $url on the connected socket $sock.
#   The socket is made non-blocking with translation {lf crlf} (output
#   line ends become CRLF), RobotReadHeader is installed as the readable
#   handler, and a "GET <path> HTTP/1.0" request with Host and User-Agent
#   headers is written. A timer that calls RobotSockCancel after 60000 ms
#   is started and its id saved in URL($sock,cancel) so it can be
#   cancelled later.
#     url  - URL to fetch (path and host come from the URL array)
#     sock - socket connected to the host
559 proc RobotConnect {url sock} {
562 fconfigure $sock -translation {lf crlf} -blocking 0
563 fileevent $sock readable [list RobotReadHeader $url $sock]
564 puts $sock "GET $URL($url,path) HTTP/1.0"
565 puts $sock "Host: $URL($url,host)"
566 puts $sock "User-Agent: $agent"
569 set URL($sock,cancel) [after 60000 [list RobotSockCancel $sock $url]]
# RobotGetUrl --
#   Starts fetching $url. The URL is split into method, host[:port] and
#   path; host and port are separated when a ":<digits>" suffix is present.
#   The parts are stored in URL($url,method|host|port|path), the state is
#   set to "head", an asynchronous socket to host/port is opened (errors
#   are caught) and RobotConnect is called to send the request.
#     url   - URL to fetch
#     phost - not used in the visible code; callers pass {}
#   The result is used as a boolean by callers; the startup script treats
#   a true value as failure.
#   NOTE(review): the port presumably defaults to 80 when none is given
#   -- confirm.
576 proc RobotGetUrl {url phost} {
581 if {![regexp {([^:]+)://([^/]+)([^ ]*)} $url x method hostport path]} {
584 if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
588 set URL($url,method) $method
589 set URL($url,host) $host
590 set URL($url,port) $port
591 set URL($url,path) $path
592 set URL($url,state) head
594 if [catch {set sock [socket -async $host $port]}] {
597 RobotConnect $url $sock
# Startup script.
#   - If the htmlSwitch command is not defined, try to load the extension
#     ./tclrobot<shared library extension>.
#   - Build the User-Agent string: "zmbot/0.0", followed by the output of
#     "uname -s -r" in parentheses when that command succeeds.
#   - Require two command-line arguments, <domain> and <start>, printing a
#     usage message otherwise.
#   - Take the domain patterns from the first argument and start a fetch
#     with RobotGetUrl for every start URL in the second; when RobotGetUrl
#     returns true, robotMoreWork is decremented and the site is reported
#     as not processed.
#   - Loop for as long as robotMoreWork is non-zero.
602 if {![llength [info commands htmlSwitch]]} {
603 set e [info sharedlibextension]
604 if {[catch {load ./tclrobot$e}]} {
609 set agent "zmbot/0.0"
610 if {![catch {set os [exec uname -s -r]}]} {
611 set agent "$agent ($os)"
625 if {[llength $argv] < 2} {
626 puts "Tclrobot: usage <domain> <start>"
627 puts " Example: '*.indexdata.dk' http://www.indexdata.dk/"
631 set domains [lindex $argv 0]
632 foreach site [lindex $argv 1] {
634 if [RobotGetUrl $site {}] {
635 incr robotMoreWork -1
636 puts "Couldn't process $site"
640 while {$robotMoreWork} {