2 # $Id: robot.tcl,v 1.27 2001/11/09 13:26:50 adam Exp $
4 proc RobotFileNext1 {area lead} {
5 # puts "RobotFileNext1 area=$area lead=$lead"
6 if {[catch {set ns [glob ${area}/*]}]} {
10 if {[file isfile $n]} {
11 set off [string last / $n]
13 return $lead/[string range $n $off end]
17 if {[file isdirectory $n]} {
18 set off [string last / $n]
20 set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
21 if {[string length $sb]} {
29 proc RobotWriteRecord {outf fromurl distance} {
31 puts $outf "<distance>"
33 puts $outf "</distance>"
34 puts $outf "<fromurl>"
36 puts $outf "</fromurl>"
40 proc RobotReadRecord {inf fromurlx distancex} {
41 upvar $fromurlx fromurl
42 upvar $distancex distance
45 set distance [string trim [gets $inf]]
46 # puts "got distance = $distance"
49 set fromurl [string trim [gets $inf]]
52 proc RobotFileNext {area} {
53 global robotSeq global idletime ns
55 # puts "RobotFileNext robotSeq=$robotSeq"
60 if {[catch {set ns [glob ${area}/*]}]} {
64 set off [string length $area]
66 set n [lindex $ns $robotSeq]
67 if {![string length $n]} {
74 if {[file isfile $n/frobots.txt]} {
75 puts "ok returning http://[string range $n $off end]/robots.txt"
76 return http://[string range $n $off end]/robots.txt
77 } elseif {[file isdirectory $n]} {
78 set sb [RobotFileNext1 $n http://[string range $n $off end]]
79 if {[string length $sb]} {
83 puts "no more work at end of RobotFileNext n=$n"
89 proc RobotFileExist {area host path} {
90 # puts "RobotFileExist begin area=$area host=$host path=$path"
91 set lpath [split $path /]
92 set l [llength $lpath]
94 set t [lindex $lpath $l]
96 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
97 # puts "RobotFileExist end npath=$npath"
98 return [file exists $npath]
101 proc RobotFileUnlink {area host path} {
102 # puts "RobotFileUnlink begin"
103 # puts "area=$area host=$host path=$path"
104 set lpath [split $path /]
105 set l [llength $lpath]
107 set t [lindex $lpath $l]
109 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
110 # puts "npath=$npath"
111 set comp [split $npath /]
112 set l [llength $comp]
114 if {[catch {exec rm [join $comp /]}]} return
116 for {set i $l} {$i > 0} {incr i -1} {
117 set path [join [lrange $comp 0 $i] /]
118 if {![catch {glob $path/*}]} return
121 # puts "RobotFileUnlink end"
124 proc RobotFileClose {out} {
125 if [string compare $out stdout] {
130 proc RobotFileOpen {area host path {mode w}} {
134 if {![info exists workdir]} {
137 #puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
138 if {[string compare $orgPwd $workdir]} {
139 puts "ooops. RobotFileOpen failed"
140 puts "workdir = $workdir"
144 set comp [split $area/$host$path /]
145 set len [llength $comp]
147 for {set i 0} {$i < $len} {incr i} {
149 set d "d[lindex $comp $i]"
151 set d [lindex $comp $i]
153 if {[catch {cd ./$d}]} {
156 if {![string compare $area unvisited] && $i == 1 && $mode == "w"} {
157 set out [open frobots.txt w]
158 puts "creating robots.txt in $d"
163 set d [lindex $comp $len]
164 if {[string length $d]} {
165 if {[file isdirectory $d]} {
166 set out [open $d/f $mode]
168 set out [open f$d $mode]
171 set out [open f $mode]
178 global robotSeq robotsRunning
180 incr robotsRunning -1
181 while {$robotsRunning} {
188 proc RobotRestart {url sock} {
189 global URL robotsRunning
192 after cancel $URL($sock,cancel)
194 foreach v [array names URL $url,*] {
198 incr robotsRunning -1
204 global robotsRunning robotsMax idletime
208 set url [RobotFileNext unvisited]
209 if {![string length $url]} {
213 if {[string compare $url wait] == 0} {
214 after $idletime RobotRR
217 set r [RobotGetUrl $url {}]
219 if {$robotsRunning >= $robotsMax} return
221 incr robotsRunning -1
222 if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
223 set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
226 RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
231 proc headSave {url out} {
234 if {[info exists URL($url,head,last-modified)]} {
235 puts $out "<lastmodified>$URL($url,head,last-modified)</lastmodified>"
238 if {[info exists URL($url,head,date)]} {
239 puts $out " <date>$URL($url,head,date)</date>"
241 if {[info exists URL($url,head,content-length)]} {
242 puts $out " <by>$URL($url,head,content-length)</by>"
244 if {[info exists URL($url,head,server)]} {
245 puts $out " <format>$URL($url,head,server)</format>"
248 puts $out {<publisher>}
249 puts $out " <identifier>$url</identifier>"
250 if {[info exists URL($url,head,content-type)]} {
251 puts $out " <type>$URL($url,head,content-type)</type>"
253 puts $out {</publisher>}
256 proc RobotHref {url hrefx hostx pathx} {
257 global URL domains debuglevel
262 if {$debuglevel > 1} {
263 puts "Ref input url = $url href=$href"
266 if {[string first { } $href] >= 0} {
269 if {[string length $href] > 256} {
272 if {[string first {?} $href] >= 0} {
275 if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
278 # get method (if any)
279 if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
283 if {[string compare $method http]} {
288 if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
289 if {![string length $surl]} {
292 if {[info exist domains]} {
294 foreach domain $domains {
295 if {[string match $domain $host]} {
305 regexp {^([^\#]*)} $hpath x surl
306 set host $URL($url,hostport)
308 if {![string length $surl]} {
311 if {[string first / $surl]} {
313 regexp {^([^\#?]*)} $URL($url,path) x dpart
314 set l [string last / $dpart]
315 if {[expr $l >= 0]} {
316 set surl [string range $dpart 0 $l]$surl
318 set surl $dpart/$surl
321 set surllist [split $surl /]
324 foreach c $surllist {
329 set path [lrange $path 0 $pathl]
342 if {$debuglevel > 4} {
343 puts "pathl=$pathl output path=$path"
345 set path [join $path /]
346 if {![string length $path]} {
349 regsub -all {~} $path {%7E} path
350 set href "$method://$host$path"
352 if {$debuglevel > 1} {
353 puts "Ref result = $href"
355 return [checkrule url $href]
358 proc RobotError {url code} {
361 puts "Bad URL $url (code $code)"
364 if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
365 set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
366 RobotReadRecord $inf fromurl distance
369 RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
370 if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
371 set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
372 RobotWriteRecord $outf $fromurl $distance
377 proc RobotRedirect {url tourl code} {
380 puts "Redirecting from $url to $tourl"
384 if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
385 set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
386 RobotReadRecord $inf fromurl distance
389 if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
390 set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
391 RobotWriteRecord $outf $fromurl $distance
394 if {[RobotHref $url tourl host path]} {
395 if {![RobotFileExist visited $host $path]} {
396 if {![RobotFileExist unvisited $host $path]} {
397 set outf [RobotFileOpen unvisited $host $path]
398 RobotWriteRecord $outf $fromurl $distance
403 set inf [RobotFileOpen visited $host $path r]
404 RobotReadRecord $inf oldurl olddistance
406 if {[string length $olddistance] == 0} {
409 if {[string length $distance] == 0} {
412 puts "distance=$distance olddistance=$olddistance"
413 if {[expr $distance < $olddistance]} {
414 set outf [RobotFileOpen unvisited $host $path]
415 RobotWriteRecord $outf $tourl $distance
420 if {[catch {RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)}]} {
426 proc RobotTextHtml {url out} {
427 global URL maxdistance
430 if {$maxdistance < 1000 && [info exists URL($url,dist)]} {
431 set distance [expr $URL($url,dist) + 1]
433 htmlSwitch $URL($url,buf) \
435 puts $out "<title>$body</title>"
437 puts -nonewline $out "<meta"
438 foreach a [array names parm] {
439 puts -nonewline $out " $a"
440 puts -nonewline $out {="}
441 puts -nonewline $out $parm($a)
442 puts -nonewline $out {"}
446 regsub -all {<!--[^-]*->} $body { } abody
447 regsub -all -nocase {<script[^<]*</script>} $abody {} bbody
448 regsub -all {<[^\>]+>} $bbody {} nbody
449 puts $out "<documentcontent>"
451 puts $out "</documentcontent>"
453 if {![info exists parm(href)]} {
457 if {[expr $distance <= $maxdistance]} {
458 set href [string trim $parm(href)]
459 if {![RobotHref $url href host path]} continue
462 puts $out "<identifier>$href</identifier>"
463 puts $out "<description>$body</description>"
466 if {![RobotFileExist visited $host $path]} {
468 if {![RobotFileExist bad $host $path]} {
469 if {[RobotFileExist unvisited $host $path]} {
470 set inf [RobotFileOpen unvisited $host $path r]
471 RobotReadRecord $inf oldurl olddistance
477 if {[string length $olddistance] == 0} {
480 if {[expr $distance < $olddistance]} {
481 set outf [RobotFileOpen unvisited $host $path]
482 RobotWriteRecord $outf $url $distance
485 } elseif {[string compare $href $url]} {
486 set inf [RobotFileOpen visited $host $path r]
487 RobotReadRecord $inf xurl olddistance
489 if {[string length $olddistance] == 0} {
492 if {[expr $distance < $olddistance]} {
493 puts "OK remarking url=$url href=$href"
494 puts "olddistance = $olddistance"
495 puts "newdistance = $distance"
496 set outf [RobotFileOpen unvisited $host $path]
497 RobotWriteRecord $outf $url $distance
503 if {![info exists parm(href)]} {
507 if {[expr $distance <= $maxdistance]} {
508 set href [string trim $parm(href)]
509 if {![RobotHref $url href host path]} continue
512 puts $out "<identifier>$href</identifier>"
513 puts $out "<description></description>"
516 if {![RobotFileExist visited $host $path]} {
518 if {![RobotFileExist bad $host $path]} {
519 if {[RobotFileExist unvisited $host $path]} {
520 set inf [RobotFileOpen unvisited $host $path r]
521 RobotReadRecord $inf oldurl olddistance
527 if {[string length $olddistance] == 0} {
530 if {[expr $distance < $olddistance]} {
531 set outf [RobotFileOpen unvisited $host $path]
532 RobotWriteRecord $outf $url $distance
535 } elseif {[string compare $href $url]} {
536 set inf [RobotFileOpen visited $host $path r]
537 RobotReadRecord $inf xurl olddistance
539 if {[string length $olddistance] == 0} {
542 if {[expr $distance < $olddistance]} {
543 puts "OK remarking url=$url href=$href"
544 puts "olddistance = $olddistance"
545 puts "newdistance = $distance"
546 set outf [RobotFileOpen unvisited $host $path]
547 RobotWriteRecord $outf $url $distance
553 if {![info exists parm(src)]} {
557 if {[expr $distance <= $maxdistance]} {
558 set href [string trim $parm(src)]
559 if {![RobotHref $url href host path]} continue
562 puts $out "<identifier>$href</identifier>"
563 puts $out "<description></description>"
566 if {![RobotFileExist visited $host $path]} {
568 if {![RobotFileExist bad $host $path]} {
569 if {[RobotFileExist unvisited $host $path]} {
570 set inf [RobotFileOpen unvisited $host $path r]
571 RobotReadRecord $inf oldurl olddistance
577 if {[string length $olddistance] == 0} {
580 if {[expr $distance < $olddistance]} {
581 set outf [RobotFileOpen unvisited $host $path]
582 RobotWriteRecord $outf $url $distance
585 } elseif {[string compare $href $url]} {
586 set inf [RobotFileOpen visited $host $path r]
587 RobotReadRecord $inf xurl olddistance
589 if {[string length $olddistance] == 0} {
592 if {[expr $distance < $olddistance]} {
593 puts "OK remarking url=$url href=$href"
594 puts "olddistance = $olddistance"
595 puts "newdistance = $distance"
596 set outf [RobotFileOpen unvisited $host $path]
597 RobotWriteRecord $outf $url $distance
605 proc RobotsTxt {url} {
608 RobotsTxt0 URL(URL($url,hostport),robots) $URL($url,buf)
611 proc RobotsTxt0 {v buf} {
614 foreach l [split $buf \n] {
615 if {[regexp {([-A-Za-z]+):[ ]*([^\# ]+)} $l match cmd arg]} {
616 puts "cmd=$cmd arg=$arg"
617 switch -- [string tolower $cmd] {
620 set pat [string tolower $arg]*
621 set section [string match $pat $agent]
625 puts "rule [list 0 $arg]"
626 lappend $v [list 0 $arg]
631 puts "rule [list 1 $arg]"
632 lappend $v [list 1 $arg]
640 proc RobotTextPlain {url out} {
643 puts $out "<documentcontent>"
644 regsub -all {<} $URL($url,buf) {\<} content
646 puts $out "</documentcontent>"
648 if {![string compare $URL($url,path) /robots.txt]} {
653 proc RobotWriteMetadata {url out} {
659 if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
660 set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
661 RobotReadRecord $inf fromurl distance
664 set URL($url,dist) $distance
665 puts $out "<distance>"
666 puts $out " $distance"
667 puts $out "</distance>"
669 puts "Parsing $url distance=$distance"
670 switch $URL($url,head,content-type) {
672 if {[string length $distance]} {
673 RobotTextHtml $url $out
677 RobotTextPlain $url $out
680 set pdff [open test.pdf w]
681 puts -nonewline $pdff $URL($url,buf)
688 proc Robot200 {url} {
691 set out [RobotFileOpen raw $URL($url,hostport) $URL($url,path)]
692 puts -nonewline $out $URL($url,buf)
695 if {![checkrule mime $URL($url,head,content-type)]} {
696 RobotError $url mimedeny
699 set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)]
700 RobotWriteMetadata $url $out
703 RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
706 proc RobotReadContent {url sock binary} {
709 set buffer [read $sock 16384]
710 set readCount [string length $buffer]
712 if {$readCount <= 0} {
714 RobotRestart $url $sock
715 } elseif {!$binary && [string first \0 $buffer] >= 0} {
717 RobotRestart $url $sock
719 # puts "Got $readCount bytes"
720 set URL($url,buf) $URL($url,buf)$buffer
724 proc RobotReadHeader {url sock} {
725 global URL debuglevel
727 if {$debuglevel > 1} {
728 puts "HTTP head $url"
730 if {[catch {set buffer [read $sock 2148]}]} {
732 RobotRestart $url $sock
734 set readCount [string length $buffer]
736 if {$readCount <= 0} {
738 RobotRestart $url $sock
740 # puts "Got $readCount bytes"
741 set URL($url,buf) $URL($url,buf)$buffer
743 set n [string first \r\n\r\n $URL($url,buf)]
747 set headbuf [string range $URL($url,buf) 0 $n]
749 set URL($url,buf) [string range $URL($url,buf) $n end]
751 regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
752 set lines [split $headbuf \n]
753 foreach line $lines {
754 if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} {
755 set URL($url,head,[string tolower $name]) [string trim $value]
758 puts "HTTP CODE $code"
759 set URL($url,state) skip
762 RobotRedirect $url $URL($url,head,location) 301
763 RobotRestart $url $sock
766 RobotRedirect $url $URL($url,head,location) 302
767 RobotRestart $url $sock
770 if {![info exists URL($url,head,content-type)]} {
771 set URL($url,head,content-type) {}
774 switch $URL($url,head,content-type) {
779 fileevent $sock readable [list RobotReadContent $url $sock $binary]
782 RobotError $url $code
783 RobotRestart $url $sock
790 proc RobotSockCancel {url sock} {
792 puts "RobotSockCancel sock=$sock url=$url"
794 RobotRestart $url $sock
797 proc RobotConnect {url sock} {
798 global URL agent acceptLanguage
800 fconfigure $sock -translation {lf crlf} -blocking 0
801 fileevent $sock readable [list RobotReadHeader $url $sock]
802 puts $sock "GET $URL($url,path) HTTP/1.0"
803 puts $sock "Host: $URL($url,host)"
804 puts $sock "User-Agent: $agent"
805 if {[string length $acceptLanguage]} {
806 puts $sock "Accept-Language: $acceptLanguage"
810 set URL($sock,cancel) [after 30000 [list RobotSockCancel $url $sock]]
817 proc RobotGetUrl {url phost} {
818 global URL robotsRunning
820 puts "Retrieve $robotsRunning url=$url"
821 if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} {
824 if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
828 set URL($url,method) $method
829 set URL($url,host) $host
830 set URL($url,hostport) $hostport
831 set URL($url,path) $path
832 set URL($url,state) head
835 if {[string compare $path /robots.txt]} {
837 if {![info exists URL($hostport,robots)]} {
838 puts "READING robots.txt for host $hostport"
839 if {[RobotFileExist visited $hostport /robots.txt]} {
840 set inf [RobotFileOpen visited $hostport /robots.txt r]
841 set buf [read $inf 32768]
844 set buf "User-agent: *\nAllow: /\n"
846 RobotsTxt0 URL($hostport,robots) $buf
848 if {[info exists URL($hostport,robots)]} {
849 foreach l $URL($hostport,robots) {
850 if {[string first [lindex $l 1] $path] == 0} {
857 puts "skipped due to robots.txt"
861 if [catch {set sock [socket -async $host $port]}] {
864 RobotConnect $url $sock
869 if {![llength [info commands htmlSwitch]]} {
870 set e [info sharedlibextension]
871 if {[catch {load ./tclrobot$e}]} {
876 set agent "zmbot/0.1"
877 if {![catch {set os [exec uname -s -r]}]} {
878 set agent "$agent ($os)"
893 set acceptLanguage {}
897 # Rules: allow, deny, url
899 proc checkrule {type this} {
903 if {$debuglevel > 3} {
904 puts "CHECKRULE $type $this"
906 if {[info exist alrules]} {
908 if {$debuglevel > 3} {
912 if {[lindex $l 1] != $type} continue
913 # consider mask (! negates)
914 set masks [lindex $l 2]
916 foreach mask $masks {
917 if {$debuglevel > 4} {
918 puts "consider single mask $mask"
920 if {[string index $mask 0] == "!"} {
921 set mask [string range $mask 1 end]
922 if {[string match $mask $this]} continue
924 if {![string match $mask $this]} continue
928 if {$debuglevel > 4} {
932 # OK, we have a match
933 if {[lindex $l 0] == "allow"} {
934 if {$debuglevel > 3} {
935 puts "CHECKRULE MATCH OK"
939 if {$debuglevel > 3} {
940 puts "CHECKFULE MATCH FAIL"
946 if {$debuglevel > 3} {
947 puts "CHECKRULE MATCH OK"
956 if {[RobotHref http://www.indexdata.dk/ href host path]} {
957 if {![RobotFileExist visited $host $path]} {
958 set outf [RobotFileOpen unvisited $host $path]
959 RobotWriteRecord $outf href 0
965 proc deny {type stuff} {
968 lappend alrules [list deny $type $stuff]
971 proc allow {type stuff} {
974 lappend alrules [list allow $type $stuff]
980 set debuglevel $level
986 set l [llength $argv]
989 puts {tclrobot: usage:}
990 puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-r rules] [url ..]}
991 puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"
996 set arg [lindex $argv $i]
997 switch -glob -- $arg {
999 set robotsMax [string range $arg 2 end]
1000 if {![string length $robotsMax]} {
1001 set robotsMax [lindex $argv [incr i]]
1005 set maxdistance [string range $arg 2 end]
1006 if {![string length $maxdistance]} {
1007 set maxdistance [lindex $argv [incr i]]
1011 set dom [string range $arg 2 end]
1012 if {![string length $dom]} {
1013 set dom [lindex $argv [incr i]]
1015 lappend domains $dom
1018 set idletime [string range $arg 2 end]
1019 if {![string length $idletime]} {
1020 set idletime [lindex $argv [incr i]]
1024 set acceptLanguage [string range $arg 2 end]
1025 if {![string length $acceptLanguage]} {
1026 set acceptLanguage [lindex $argv [incr i]]
1030 set rfile [string range $arg 2 end]
1031 if {![string length $rfile]} {
1032 set rfile [lindex $argv [incr i]]
1038 if {[RobotHref http://www.indexdata.dk/ href host path]} {
1039 if {![RobotFileExist visited $host $path]} {
1040 set outf [RobotFileOpen unvisited $host $path]
1041 RobotWriteRecord $outf href 0
1042 RobotFileClose $outf
1050 if {![info exist domains]} {
1053 if {![info exist maxdistance]} {
1056 if {![info exist robotsMax]} {
1060 puts "domains=$domains"
1061 puts "max distance=$maxdistance"
1062 puts "max jobs=$robotsMax"
1067 while {$robotsRunning} {