2 # $Id: robot.tcl,v 1.10 2001/01/23 09:20:32 adam Exp $
4 proc RobotFileNext1 {area lead} {
5 puts "RobotFileNext1 area=$area lead=$lead"
6 if {[catch {set ns [glob ${area}/*]}]} {
10 if {[file isfile $n]} {
11 set off [string last / $n]
13 return $lead/[string range $n $off end]
17 if {[file isdirectory $n]} {
18 set off [string last / $n]
20 set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
21 if {[string length $sb]} {
29 proc RobotWriteRecord {outf fromurl distance} {
31 puts $outf "<distance>"
33 puts $outf "</distance>"
34 puts $outf "<fromurl>"
36 puts $outf "</fromurl>"
40 proc RobotReadRecord {inf fromurlx distancex} {
41 upvar $fromurlx fromurl
42 upvar $distancex distance
45 set distance [string trim [gets $inf]]
46 puts "got distance = $distance"
49 set fromurl [string trim [gets $inf]]
52 proc RobotFileNext {area} {
53 global robotSeq global idleTime ns
55 puts "RobotFileNext robotSeq=$robotSeq"
60 if {[catch {set ns [glob ${area}/*]}]} {
64 set off [string length $area]
66 set n [lindex $ns $robotSeq]
67 if {![string length $n]} {
70 puts "------------ N E X T R O U N D --------"
74 if {[file isfile $n/frobots.txt]} {
75 puts "ok returning http://[string range $n $off end]/robots.txt"
76 return http://[string range $n $off end]/robots.txt
77 } elseif {[file isdirectory $n]} {
78 set sb [RobotFileNext1 $n http://[string range $n $off end]]
79 if {[string length $sb]} {
83 puts "no more work at end of RobotFileNext n=$n"
89 proc RobotFileExist {area host path} {
90 puts "RobotFileExist begin area=$area host=$host path=$path"
91 set lpath [split $path /]
92 set l [llength $lpath]
94 set t [lindex $lpath $l]
96 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
97 puts "RobotFileExist end npath=$npath"
98 return [file exists $npath]
101 proc RobotFileUnlink {area host path} {
102 puts "RobotFileUnlink begin"
103 puts "area=$area host=$host path=$path"
104 set lpath [split $path /]
105 set l [llength $lpath]
107 set t [lindex $lpath $l]
109 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
111 set comp [split $npath /]
112 set l [llength $comp]
114 if {[catch {exec rm [join $comp /]}]} return
116 for {set i $l} {$i > 0} {incr i -1} {
117 set path [join [lrange $comp 0 $i] /]
118 if {![catch {glob $path/*}]} return
121 puts "RobotFileUnlink end"
124 proc RobotFileClose {out} {
125 if [string compare $out stdout] {
130 proc RobotFileOpen {area host path {mode w}} {
134 if {![info exists workdir]} {
137 puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
138 if {[string compare $orgPwd $workdir]} {
139 puts "ooops. RobotFileOpen failed"
140 puts "workdir = $workdir"
144 set comp [split $area/$host$path /]
145 set len [llength $comp]
147 for {set i 0} {$i < $len} {incr i} {
149 set d "d[lindex $comp $i]"
151 set d [lindex $comp $i]
153 if {[catch {cd ./$d}]} {
156 if {![string compare $area unvisited] && $i == 1 && $mode == "w"} {
157 set out [open frobots.txt w]
158 puts "creating robots.txt in $d"
163 set d [lindex $comp $len]
164 if {[string length $d]} {
165 if {[file isdirectory $d]} {
166 set out [open $d/f $mode]
168 set out [open f$d $mode]
171 set out [open f $mode]
178 global robotSeq robotsRunning
180 incr robotsRunning -1
181 while {$robotsRunning} {
188 proc RobotRestart {url sock} {
189 global URL robotsRunning
192 after cancel $URL($sock,cancel)
194 foreach v [array names URL $url,*] {
198 incr robotsRunning -1
204 global robotsRunning robotsMax idleTime
208 set url [RobotFileNext unvisited]
209 if {![string length $url]} {
213 if {[string compare $url wait] == 0} {
214 after $idleTime RobotRR
217 set r [RobotGetUrl $url {}]
219 if {$robotsRunning >= $robotsMax} return
221 incr robotsRunning -1
222 if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
223 set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
226 RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
231 proc headSave {url out} {
234 if {[info exists URL($url,head,last-modified)]} {
235 puts $out "<lastmodified>$URL($url,head,last-modified)</lastmodified>"
238 if {[info exists URL($url,head,date)]} {
239 puts $out " <date>$URL($url,head,date)</date>"
241 if {[info exists URL($url,head,content-length)]} {
242 puts $out " <by>$URL($url,head,content-length)</by>"
244 if {[info exists URL($url,head,server)]} {
245 puts $out " <format>$URL($url,head,server)</format>"
248 puts $out {<publisher>}
249 puts $out " <identifier>$url</identifier>"
250 if {[info exists URL($url,head,content-type)]} {
251 puts $out " <type>$URL($url,head,content-type)</type>"
253 puts $out {</publisher>}
256 proc RobotHref {url hrefx hostx pathx} {
262 puts "Ref url = $url href=$href"
263 # get method (if any)
264 if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
268 if {[string compare $method http]} {
273 if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
274 if {![string length $surl]} {
278 foreach domain $domains {
279 if {[string match $domain $host]} {
288 regexp {^([^\#]*)} $hpath x surl
289 set host $URL($url,hostport)
291 if {![string length $surl]} {
294 if {[string first / $surl]} {
296 regexp {^([^\#?]*)} $URL($url,path) x dpart
297 set l [string last / $dpart]
298 if {[expr $l >= 0]} {
299 set surl [string range $dpart 0 $l]$surl
301 set surl $dpart/$surl
304 set c [split $surl /]
307 set path [lindex $c $i]
310 switch -- [lindex $c $i] {
321 set path [lindex $c $i]/$path
326 regsub -all {~} $path {%7E} path
327 set href "$method://$host$path"
328 puts "Ref href = $href"
332 proc RobotError {url code} {
335 puts "Bad URL $url, $code"
338 if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
339 set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
340 RobotReadRecord $inf fromurl distance
343 RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
344 if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
345 set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
346 RobotWriteRecord $outf $fromurl $distance
351 proc RobotRedirect {url tourl code} {
354 puts "Redirecting from $url to $tourl"
358 if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
359 set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
360 RobotReadRecord $inf fromurl distance
363 if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
364 set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
365 RobotWriteRecord $outf $fromurl $distance
368 if {[RobotHref $url tourl host path]} {
369 if {![RobotFileExist visited $host $path]} {
370 if {![RobotFileExist unvisited $host $path]} {
371 set outf [RobotFileOpen unvisited $host $path]
372 RobotWriteRecord $outf $fromurl $distance
377 set inf [RobotFileOpen visited $host $path r]
378 RobotReadRecord $inf oldurl olddistance
380 if {[string length $olddistance] == 0} {
383 if {[string length $distance] == 0} {
386 puts "distance=$distance olddistance=$olddistance"
387 if {[expr $distance < $olddistance]} {
388 set outf [RobotFileOpen unvisited $host $path]
389 RobotWriteRecord $outf $tourl $distance
394 if {[catch {RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)}]} {
400 proc RobotTextHtml {url out} {
401 global URL maxDistance
404 if {$maxDistance < 1000 && [info exists URL($url,dist)]} {
405 set distance [expr $URL($url,dist) + 1]
407 htmlSwitch $URL($url,buf) \
409 puts $out "<title>$body</title>"
411 puts -nonewline $out "<meta"
412 foreach a [array names parm] {
413 puts -nonewline $out " $a"
414 puts -nonewline $out {="}
415 puts -nonewline $out $parm($a)
416 puts -nonewline $out {"}
420 regsub -all -nocase {<script.*</script>} $body {} abody
421 regsub -all {<[^\>]+>} $abody {} nbody
422 puts $out "<documentcontent>"
424 puts $out "</documentcontent>"
426 if {![info exists parm(href)]} {
430 if {[expr $distance <= $maxDistance]} {
431 set href [string trim $parm(href)]
432 if {![RobotHref $url href host path]} continue
435 puts $out "<identifier>$href</identifier>"
436 puts $out "<description>$body</description>"
439 if {![RobotFileExist visited $host $path]} {
441 if {![RobotFileExist bad $host $path]} {
442 if {[RobotFileExist unvisited $host $path]} {
443 set inf [RobotFileOpen unvisited $host $path r]
444 RobotReadRecord $inf oldurl olddistance
450 if {[string length $olddistance] == 0} {
453 if {[expr $distance < $olddistance]} {
454 set outf [RobotFileOpen unvisited $host $path]
455 RobotWriteRecord $outf $url $distance
458 } elseif {[string compare $href $url]} {
459 set inf [RobotFileOpen visited $host $path r]
460 RobotReadRecord $inf xurl olddistance
462 if {[string length $olddistance] == 0} {
465 if {[expr $distance < $olddistance]} {
466 puts "OK remarking url=$url href=$href"
467 puts "olddistance = $olddistance"
468 puts "newdistance = $distance"
469 set outf [RobotFileOpen unvisited $host $path]
470 RobotWriteRecord $outf $url $distance
478 proc RobotsTxt {url} {
481 RobotsTxt0 URL(URL($url,hostport),robots) $URL($url,buf)
484 proc RobotsTxt0 {v buf} {
487 foreach l [split $buf \n] {
488 if {[regexp {([-A-Za-z]+):[ \t]*([^\#]+)} $l match cmd arg]} {
489 puts "cmd=$cmd arg=$arg"
493 set pat [string tolower $arg]*
494 set section [string match $pat $agent]
498 puts "rule [list 0 $arg]"
499 lappend $v [list 0 $arg]
504 puts "rule [list 1 $arg]"
505 lappend $v [list 1 $arg]
513 proc RobotTextPlain {url out} {
516 puts $out "<documentcontent>"
517 puts $out $URL($url,buf)
518 puts $out "</documentcontent>"
520 if {![string compare $URL($url,path) /robots.txt]} {
525 proc Robot200 {url} {
528 set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)]
532 if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
533 set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
534 RobotReadRecord $inf fromurl distance
537 set URL($url,dist) $distance
538 puts $out "<distance>"
539 puts $out " $distance"
540 puts $out "</distance>"
542 puts "Parsing $url distance=$distance"
543 switch $URL($url,head,content-type) {
545 if {[string length $distance]} {
546 RobotTextHtml $url $out
550 RobotTextPlain $url $out
553 set pdff [open test.pdf w]
554 puts -nonewline $pdff $URL($url,buf)
560 # puts "Parsing done"
561 RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
564 proc RobotReadContent {url sock binary} {
567 puts "RobotReadContent $url"
568 set buffer [read $sock 16384]
569 set readCount [string length $buffer]
571 if {$readCount <= 0} {
573 RobotRestart $url $sock
574 } elseif {!$binary && [string first \0 $buffer] >= 0} {
576 RobotRestart $url $sock
578 # puts "Got $readCount bytes"
579 set URL($url,buf) $URL($url,buf)$buffer
583 proc RobotReadHeader {url sock} {
586 puts "RobotReadHeader $url"
587 if {[catch {set buffer [read $sock 2148]}]} {
589 RobotRestart $url $sock
591 set readCount [string length $buffer]
593 if {$readCount <= 0} {
595 RobotRestart $url $sock
597 # puts "Got $readCount bytes"
598 set URL($url,buf) $URL($url,buf)$buffer
600 set n [string first \r\n\r\n $URL($url,buf)]
604 set headbuf [string range $URL($url,buf) 0 $n]
606 set URL($url,buf) [string range $URL($url,buf) $n end]
608 regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
609 set lines [split $headbuf \n]
610 foreach line $lines {
611 if {[regexp {^([^:]+):[ ]+(.*)} $line x name value]} {
612 set URL($url,head,[string tolower $name]) [string trim $value]
616 set URL($url,state) skip
619 RobotRedirect $url $URL($url,head,location) 301
620 RobotRestart $url $sock
623 RobotRedirect $url $URL($url,head,location) 302
624 RobotRestart $url $sock
627 if {![info exists URL($url,head,content-type)]} {
628 set URL($url,head,content-type) {}
631 switch $URL($url,head,content-type) {
636 fileevent $sock readable [list RobotReadContent $url $sock $binary]
639 RobotError $url $code
640 RobotRestart $url $sock
647 proc RobotSockCancel {url sock} {
649 puts "RobotSockCancel sock=$sock url=$url"
651 RobotRestart $url $sock
654 proc RobotConnect {url sock} {
657 fconfigure $sock -translation {lf crlf} -blocking 0
658 fileevent $sock readable [list RobotReadHeader $url $sock]
659 puts $sock "GET $URL($url,path) HTTP/1.0"
660 puts $sock "Host: $URL($url,host)"
661 puts $sock "User-Agent: $agent"
664 set URL($sock,cancel) [after 30000 [list RobotSockCancel $url $sock]]
671 proc RobotGetUrl {url phost} {
672 global URL robotsRunning
674 puts "RobotGetUrl --------- robotsRunning=$robotsRunning url=$url"
675 if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} {
678 if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
682 set URL($url,method) $method
683 set URL($url,host) $host
684 set URL($url,hostport) $hostport
685 set URL($url,path) $path
686 set URL($url,state) head
689 if {[string compare $path /robots.txt]} {
691 if {![info exists URL($hostport,robots)]} {
692 puts "READING robots.txt for host $hostport"
693 if {[RobotFileExist visited $hostport /robots.txt]} {
694 set inf [RobotFileOpen visited $hostport /robots.txt r]
695 set buf [read $inf 32768]
698 set buf "User-Agent: *\nAllow: /\n"
700 RobotsTxt0 URL($hostport,robots) $buf
702 if {[info exists URL($hostport,robots)]} {
703 foreach l $URL($hostport,robots) {
704 if {[string first [lindex $l 1] $path] == 0} {
714 if [catch {set sock [socket -async $host $port]}] {
717 RobotConnect $url $sock
722 if {![llength [info commands htmlSwitch]]} {
723 set e [info sharedlibextension]
724 if {[catch {load ./tclrobot$e}]} {
729 set agent "zmbot/0.0"
730 if {![catch {set os [exec uname -s -r]}]} {
731 set agent "$agent ($os)"
747 if {[llength $argv] < 2} {
748 puts "Tclrobot: usage <range> <domain> <start>"
749 puts " Example: 3 '*.indexdata.dk' http://www.indexdata.dk/"
753 set maxDistance [lindex $argv 0]
754 set domains [lindex $argv 1]
755 foreach href [lindex $argv 2] {
756 if {[RobotHref http://www.indexdata.dk/ href host path]} {
757 if {![RobotFileExist visited $host $path]} {
758 set outf [RobotFileOpen unvisited $host $path]
759 RobotWriteRecord $outf $href 0
767 while {$robotsRunning} {